In [None]:
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of the import
# path; presumably that is where the `matching_pipeline` module lives.
sys.path.insert(0, str(Path().resolve().parent))

from matching_pipeline import run_pipeline, load_all_candidates

# Run the pipeline on the raw events export and keep whatever it returns.
# NOTE(review): later cells read parquet files from matching_output/candidates;
# presumably run_pipeline writes them - confirm against matching_pipeline.
summary = run_pipeline(events_path="../vehicle_events_export.parquet")

In [None]:
import pandas as pd
from pathlib import Path
import psutil
import os

def optimize_dtypes(df):
    """Downcast the columns of a candidates frame in place and return it.

    * ``d_idx`` and ``f_idx`` are cast to ``int32`` (both must be present).
    * Every known float column that is present is cast to ``float32``;
      absent ones are skipped.
    * ``provider`` and ``vehicle_type_id`` are cast to ``category`` (both
      must be present).

    The frame passed in is modified and the same object is returned.
    """
    for key_col in ('d_idx', 'f_idx'):
        df[key_col] = df[key_col].astype('int32')

    optional_floats = (
        'd_lat', 'd_lon', 'f_lat', 'f_lon',
        'd_range_km', 'f_range_km', 'range_consumed',
        'delta_t_hours', 'opt_route_km', 'opt_route_min',
        'speed', 'log_p_distance', 'log_p_speed', 'log_p_range',
        'score',
    )
    present = set(df.columns)
    for float_col in optional_floats:
        if float_col not in present:
            continue
        df[float_col] = df[float_col].astype('float32')

    for label_col in ('provider', 'vehicle_type_id'):
        df[label_col] = df[label_col].astype('category')

    return df

# Collect the per-shard candidate files. Sorting makes the row order of
# `all_candidates` reproducible; glob order is filesystem-dependent.
files = sorted(Path("matching_output/candidates").glob("*_candidates.parquet"))
if not files:
    # pd.concat on an empty list fails with an unhelpful "No objects to
    # concatenate"; fail early with the actual cause instead.
    raise FileNotFoundError(
        "No *_candidates.parquet files found in matching_output/candidates"
    )

# Downcast each shard as it is read so the full-width frames never coexist.
dfs = []
for file in files:
    df = optimize_dtypes(pd.read_parquet(file))
    dfs.append(df)

all_candidates = pd.concat(dfs, ignore_index=True)

# Concatenating categoricals whose category sets differ between shards yields
# plain object columns, so re-apply the categorical dtype on the combined frame.
for col in ('provider', 'vehicle_type_id'):
    all_candidates[col] = all_candidates[col].astype('category')

In [None]:
import numpy as np

def bidirectional_assignment_fast(candidates, n_iterations=3, forward_temp=1.0, backward_temp=3.0, null_score=-15):
    """Attach forward/backward softmax match probabilities to candidate pairs.

    Every row of ``candidates`` is one (``d_idx``, ``f_idx``) pair with a
    ``score``. Each iteration computes:

    * forward: a softmax of ``score_adj / forward_temp`` over the rows that
      share a ``d_idx``, plus one extra "null" alternative per ``d_idx``
      scored ``null_score / forward_temp``;
    * backward: a softmax of ``score_adj / backward_temp`` over the rows that
      share an ``f_idx`` (no null alternative).

    After every iteration except the last, the working score is reset to
    ``score + log(max(prob_backward, 1e-6))``.

    Parameters
    ----------
    candidates : pandas.DataFrame
        Must contain ``d_idx``, ``f_idx`` and ``score``. Not modified.
    n_iterations : int
        Number of forward/backward passes; must be at least 1.
    forward_temp, backward_temp : float
        Divisors applied to the score before the respective softmax.
    null_score : float
        Score of the per-``d_idx`` null alternative.

    Returns
    -------
    pandas.DataFrame
        A copy of ``candidates`` with ``prob_forward``, ``prob_backward``,
        ``prob_null`` (broadcast to each row of its ``d_idx``) and ``prob``
        (identical to ``prob_forward``) from the final iteration. An empty
        input yields an empty copy carrying the four columns.

    Raises
    ------
    ValueError
        If ``n_iterations`` is below 1, or ``d_idx`` / ``f_idx`` contain
        missing values.
    """
    if n_iterations < 1:
        # With zero passes the probabilities would never be computed.
        raise ValueError("n_iterations must be at least 1")

    candidates = candidates.copy()

    if len(candidates) == 0:
        # Nothing to normalise; return the expected schema, not a crash.
        for column in ('prob_forward', 'prob_backward', 'prob_null', 'prob'):
            candidates[column] = np.zeros(0, dtype=np.float64)
        return candidates

    # Work in float64 regardless of how the score column was downcast.
    scores = candidates['score'].to_numpy(dtype=np.float64, copy=True)

    # Dense 0..n-1 group ids for each side of the pairing.
    d_groups = candidates.groupby('d_idx').ngroup().to_numpy()
    f_groups = candidates.groupby('f_idx').ngroup().to_numpy()
    if d_groups.min() < 0 or f_groups.min() < 0:
        # ngroup() labels rows with a missing key as -1, which would silently
        # index the last group in the fancy-indexing below.
        raise ValueError("d_idx and f_idx must not contain missing values")
    n_d_groups = int(d_groups.max()) + 1
    n_f_groups = int(f_groups.max()) + 1

    null_fwd = null_score / forward_temp
    score_adj = scores

    for iteration in range(n_iterations):
        # Forward softmax over each d_idx, including the null alternative.
        s_fwd = score_adj / forward_temp
        # Seeding with the null score is the same as taking max(row max, null).
        max_d = np.full(n_d_groups, null_fwd, dtype=np.float64)
        np.maximum.at(max_d, d_groups, s_fwd)

        # Subtract the per-group max before exponentiating to avoid overflow.
        exp_s = np.exp(s_fwd - max_d[d_groups])
        exp_null = np.exp(null_fwd - max_d)
        sum_d = np.bincount(d_groups, weights=exp_s, minlength=n_d_groups) + exp_null

        prob_forward = exp_s / sum_d[d_groups]
        prob_null = exp_null / sum_d

        # Backward softmax over each f_idx (no null alternative).
        s_back = score_adj / backward_temp
        max_f = np.full(n_f_groups, -np.inf)
        np.maximum.at(max_f, f_groups, s_back)

        exp_s_back = np.exp(s_back - max_f[f_groups])
        sum_f = np.bincount(f_groups, weights=exp_s_back, minlength=n_f_groups)

        prob_backward = exp_s_back / sum_f[f_groups]

        if iteration < n_iterations - 1:
            # Penalise pairs the f side considers unlikely; the floor keeps
            # the log finite. Always restarts from the raw scores.
            penalty = -np.log(np.maximum(prob_backward, 1e-6))
            score_adj = scores - penalty

    candidates['prob_forward'] = prob_forward
    candidates['prob_backward'] = prob_backward
    candidates['prob_null'] = prob_null[d_groups]
    candidates['prob'] = prob_forward

    return candidates

In [None]:
# Providers whose temporary disappearances are NOT removed from the candidates.
NON_COMPLIANT_PROVIDERS = [
    'lime_zurich',
    'lime_stuttgart',
    'lime_basel',
    'lime_uster',
    'lime_opfikon',
]

df = pd.read_parquet("../events_with_flags.parquet")

# Index labels of events flagged as temporary disappearances, restricted to
# providers outside the list above.
# NOTE(review): assumes candidate `d_idx` values are index labels of this
# frame - confirm against the pipeline that produced the candidates.
_is_temporary = df['is_temporary_disappearance'] == True
_is_listed_provider = df['provider'].isin(NON_COMPLIANT_PROVIDERS)
temp_d_idx = df[_is_temporary & ~_is_listed_provider].index

# Drop every candidate pair whose disappearance side is one of those events.
_drop_rows = all_candidates['d_idx'].isin(temp_d_idx)
filtered = all_candidates[~_drop_rows]

result = bidirectional_assignment_fast(
    filtered,
    n_iterations=3,
    forward_temp=1.0,
    backward_temp=3.0,
)

# Keep, for each d_idx, the candidate row with the highest `prob`.
_best_rows = result.groupby('d_idx')['prob'].idxmax()
best_matches = result.loc[_best_rows]

In [4]:
# Persist the scored candidates where the later cells read them from.
# DataFrame.to_parquet returns None, so its result is deliberately not bound
# to a name.
result.to_parquet("matching_output/matching_candidates_scored.parquet")


In [None]:
# Headline counts: how many disappearances exist, how many are excluded by a
# flag, and how many received at least one candidate.
matches = pd.read_parquet("matching_output/matching_candidates_scored.parquet")
# NOTE(review): the flag columns used below are read from
# "../events_with_flags.parquet" in other cells; confirm this export carries
# them as well.
df = pd.read_parquet("../vehicle_events_export.parquet")

total_disappearances = df['disappeared'].sum()

# An event is excluded when any of the three flags is set.
# NOTE(review): the scoring cell keeps temporary disappearances of the
# providers in NON_COMPLIANT_PROVIDERS, whereas this mask excludes them for
# every provider - confirm the difference is intended.
excluded = (
    df['is_maintenance']
    | df['is_id_reset']
    | df['is_temporary_disappearance']
)
n_excluded = (df['disappeared'] & excluded).sum()
real_disappearances = (df['disappeared'] & ~excluded).sum()

n_with_candidates = matches['d_idx'].nunique()

# Highest-probability candidate per disappearance, then those above 0.5.
_top_rows = matches.groupby('d_idx')['prob'].idxmax()
best_matches = matches.loc[_top_rows]
_confident = best_matches['prob'] > 0.5
high_conf_matches = best_matches[_confident]

In [None]:
import pandas as pd

# Range, in metres, below which an unmatched vehicle is counted as "low range".
LOW_RANGE_THRESHOLD_M = 300

df = pd.read_parquet("../events_with_flags.parquet")
matches = pd.read_parquet("matching_output/matching_candidates_scored.parquet")

# Disappearances that are not explained by any of the three flags.
excluded = df['is_maintenance'] | df['is_id_reset'] | df['is_temporary_disappearance']
real_disappeared = df[df['disappeared'] & ~excluded].copy()

# A disappearance "has a candidate" when its index label occurs as a d_idx.
matched_d_idx = set(matches['d_idx'].unique())
real_disappeared['has_candidate'] = real_disappeared.index.isin(matched_d_idx)

# Take real copies: both frames get an `hour` column below, and assigning to
# a filtered slice raises SettingWithCopyWarning and may not stick.
unmatched = real_disappeared[~real_disappeared['has_candidate']].copy()
matched = real_disappeared[real_disappeared['has_candidate']].copy()

# Per-provider totals and match rate in percent.
provider_stats = real_disappeared.groupby('provider').agg(
    total=('has_candidate', 'count'),
    matched=('has_candidate', 'sum')
)
provider_stats['unmatched'] = provider_stats['total'] - provider_stats['matched']
provider_stats['match_rate'] = 100 * provider_stats['matched'] / provider_stats['total']

# Both counts need the range column, so neither is guarded; a missing column
# fails on the first line, exactly as before.
no_range = unmatched['current_range_meters'].isna().sum()
low_range = (unmatched['current_range_meters'] < LOW_RANGE_THRESHOLD_M).sum()

unmatched['hour'] = pd.to_datetime(unmatched['timestamp']).dt.hour
matched['hour'] = pd.to_datetime(matched['timestamp']).dt.hour

In [None]:
from sklearn.neighbors import BallTree
import numpy as np

# Diagnose why sampled unmatched disappearances have no candidate, by walking
# a funnel per disappearance: same provider/vehicle type -> inside the time
# window -> inside the distance radius -> enough range drain.

# First-seen events that are not excluded and report a range.
first_seen = df[df['first_seen'] & ~excluded].copy()
first_seen = first_seen[first_seen['current_range_meters'].notna()]

sample_size = min(10000, len(unmatched))
unmatched_sample = unmatched.sample(sample_size, random_state=42)

MAX_TIME_HOURS = 1.0
MAX_DISTANCE_KM = 8.0
MIN_RANGE_DRAIN_KM = 0.3
EARTH_RADIUS_KM = 6371.0  # converts km to the radians the haversine metric uses

results = {
    'no_fs_same_provider_vtype': 0,
    'no_fs_in_time_window': 0,
    'no_fs_in_distance': 0,
    'no_fs_with_range_drain': 0,
    'has_potential_match': 0,
}

max_dist_rad = MAX_DISTANCE_KM / EARTH_RADIUS_KM
time_window = np.timedelta64(int(MAX_TIME_HOURS * 3600), 's')

# One pass per (provider, vehicle type) combination present in the sample.
group_keys = unmatched_sample[['provider', 'vehicle_type_id']].drop_duplicates()

for provider, vtype in group_keys.itertuples(index=False, name=None):
    d_subset = unmatched_sample[
        (unmatched_sample['provider'] == provider) &
        (unmatched_sample['vehicle_type_id'] == vtype)
    ]

    fs_subset = first_seen[
        (first_seen['provider'] == provider) &
        (first_seen['vehicle_type_id'] == vtype)
    ]

    if len(fs_subset) == 0:
        results['no_fs_same_provider_vtype'] += len(d_subset)
        continue

    # Spatial index over the first-seen positions of this group.
    fs_coords = np.radians(fs_subset[['lat', 'lon']].values)
    tree = BallTree(fs_coords, metric='haversine')

    d_coords = np.radians(d_subset[['lat', 'lon']].values)
    d_times = pd.to_datetime(d_subset['timestamp']).values
    d_ranges = d_subset['current_range_meters'].values / 1000.0

    fs_times = pd.to_datetime(fs_subset['timestamp']).values
    fs_ranges = fs_subset['current_range_meters'].values / 1000.0

    # One batched radius query instead of one tree call per disappearance.
    neighbor_lists = tree.query_radius(d_coords, r=max_dist_rad)

    for i, neighbors in enumerate(neighbor_lists):
        d_time = d_times[i]
        latest = d_time + time_window

        # Apply the time filter to the spatial neighbours BEFORE looking at
        # range drain. Otherwise a disappearance whose neighbours are all
        # outside the window is misfiled under 'no_fs_with_range_drain'.
        neighbor_times = fs_times[neighbors]
        timely = neighbors[(neighbor_times > d_time) & (neighbor_times <= latest)]

        if len(timely) == 0:
            # Split by whether ANY first-seen event of the group is in the
            # window: if so the failure is distance, otherwise time.
            any_in_window = ((fs_times > d_time) & (fs_times <= latest)).any()
            if any_in_window:
                results['no_fs_in_distance'] += 1
            else:
                results['no_fs_in_time_window'] += 1
            continue

        # A NaN disappearance range compares False here and so lands in the
        # 'no_fs_with_range_drain' bucket.
        range_consumed = d_ranges[i] - fs_ranges[timely]
        if (range_consumed >= MIN_RANGE_DRAIN_KM).any():
            results['has_potential_match'] += 1
        else:
            results['no_fs_with_range_drain'] += 1