# City Matching Analysis

This notebook loads the scored matching candidates, joins the city of each candidate from the vehicle events, and filters out known data-quality issues.

In [6]:
import pandas as pd
import numpy as np
import sys
sys.path.append('..')
from noise_detection.non_ride_detection import filter_known_issues

## Load Data

In [7]:
# Read the scored matching candidates from their parquet export.
candidates_path = '../matching/matching_candidates_scored.parquet'
matching_df = pd.read_parquet(candidates_path)

# Report the size of the table, then preview the first rows as the cell output.
print(f"Matching candidates shape: {matching_df.shape}")
matching_df.head()

Matching candidates shape: (45366529, 26)


Unnamed: 0,d_idx,f_idx,provider,vehicle_type_id,d_lat,d_lon,f_lat,f_lon,d_time,f_time,...,speed,log_p_distance,log_p_speed,log_p_range,score,haversine_km,prob,prob_null,prob_forward,prob_backward
0,171071,173445,bolt_basel,BLT:VehicleType:e92dc79f-736d-5eca-9ff4-180665...,47.559498,7.598561,47.537254,7.570603,2025-10-26 14:10:06.891088,2025-10-26 14:27:06.883382,...,12.988334,-2.502355,-2.203296,-1.685448,-6.391099,,0.9997973,0.000203,0.9997973,0.9154952
1,171713,177179,bolt_basel,BLT:VehicleType:e92dc79f-736d-5eca-9ff4-180665...,47.578182,7.589325,47.513931,7.620455,2025-10-26 14:15:06.910183,2025-10-26 14:52:06.898922,...,13.783854,-4.937281,-2.163429,-5.086204,-12.186913,,0.0009350702,0.001164,0.0009350702,0.04159073
2,171713,173443,bolt_basel,BLT:VehicleType:e92dc79f-736d-5eca-9ff4-180665...,47.578182,7.589325,47.563847,7.591785,2025-10-26 14:15:06.910183,2025-10-26 14:27:06.883382,...,10.600394,-1.496594,-2.633249,-32.34444,-36.474281,,6.174465999999999e-19,0.001164,6.174465999999999e-19,3.588094e-07
3,171713,180306,bolt_basel,BLT:VehicleType:e92dc79f-736d-5eca-9ff4-180665...,47.578182,7.589325,47.552078,7.621399,2025-10-26 14:15:06.910183,2025-10-26 15:13:06.871626,...,5.493165,-3.434094,-5.115246,-13.601135,-22.150475,,7.360793e-10,0.001164,7.360793e-10,0.0004914604
4,171713,174099,bolt_basel,BLT:VehicleType:e92dc79f-736d-5eca-9ff4-180665...,47.578182,7.589325,47.563206,7.588843,2025-10-26 14:15:06.910183,2025-10-26 14:32:06.902328,...,6.564756,-1.32773,-4.417964,-3.411419,-9.157114,,0.4014666,0.001164,0.4014666,0.9999986


In [8]:
# Read only the `city` column of the vehicle events and expose the frame's
# index as an explicit `idx` column so it can serve as a join key later.
events_df = (
    pd.read_parquet('../vehicle_events_export.parquet', columns=['city'])
    .assign(idx=lambda frame: frame.index)
)

# Report the size of the table, then preview the first rows as the cell output.
print(f"Vehicle events shape: {events_df.shape}")
events_df.head()

Vehicle events shape: (16476198, 2)


Unnamed: 0,city,idx
0,Stuttgart,0
1,Stuttgart,1
2,Stuttgart,2
3,Stuttgart,3
4,Stuttgart,4


## Join City to Matching Data

Left-join the city onto the matching candidates by matching `d_idx` (matching candidates) against `idx` (the index of the vehicle events frame).

In [9]:
# Attach each candidate's city by looking up its d_idx in the events index
# (exposed on events_df as the `idx` column).
# - how='left' keeps every matching candidate, even when no event row is found
#   (such rows get a missing city, which the value_counts below would show).
# - validate='many_to_one' makes pandas raise a MergeError if `idx` is not unique
#   in events_df, instead of silently duplicating candidate rows and inflating
#   the per-city counts.
matching_with_city = matching_df.merge(
    events_df,
    left_on='d_idx',
    right_on='idx',
    how='left',
    validate='many_to_one',
).drop(columns=['idx'])  # the join key from the events side is no longer needed

print(f"Matching with city shape: {matching_with_city.shape}")
print(f"\nCity distribution:")
# dropna=False so candidates without a matched city are counted too.
print(matching_with_city['city'].value_counts(dropna=False))

Matching with city shape: (45366529, 27)

City distribution:
city
Stuttgart                 17594820
Zürich                     8493730
Karlsruhe                  8132132
Saarbrücken                4034049
Mannheim                   2507886
Reutlingen/Tübingen         782223
Heilbronn                   767975
Basel                       556337
Bern                        541625
Ulm                         490300
Mainz                       450629
Kaiserslautern              269714
Pforzheim                   228702
Heidelberg                  144528
St. Gallen                  118575
Friedrichshafen              81403
Villingen-Schwenningen       73159
Freiburg                     54428
Konstanz                     16497
Winterthur                   10029
Überlingen                    8237
Lindau                        6533
Bregenz                       3018
Name: count, dtype: int64


In [10]:
# Remove rows flagged as known data-quality problems (the Saarbrücken 3am spike).
matching_filtered = filter_known_issues(matching_with_city)

# Per-city row counts after filtering; missing cities are counted as well.
filtered_city_counts = matching_filtered['city'].value_counts(dropna=False)
print(f"\nFiltered city distribution:")
print(filtered_city_counts)

Filtered out 359,950 trips (0.8%):
  - Saarbrücken 3am: 359,950

Filtered city distribution:
city
Stuttgart                 17594820
Zürich                     8493730
Karlsruhe                  8132132
Saarbrücken                3674099
Mannheim                   2507886
Reutlingen/Tübingen         782223
Heilbronn                   767975
Basel                       556337
Bern                        541625
Ulm                         490300
Mainz                       450629
Kaiserslautern              269714
Pforzheim                   228702
Heidelberg                  144528
St. Gallen                  118575
Friedrichshafen              81403
Villingen-Schwenningen       73159
Freiburg                     54428
Konstanz                     16497
Winterthur                   10029
Überlingen                    8237
Lindau                        6533
Bregenz                       3018
Name: count, dtype: int64


In [11]:
# Inspect the filtered candidates whose city is Lindau.
matching_filtered.loc[matching_filtered["city"].eq("Lindau")]

Unnamed: 0,d_idx,f_idx,provider,vehicle_type_id,d_lat,d_lon,f_lat,f_lon,d_time,f_time,...,log_p_distance,log_p_speed,log_p_range,score,haversine_km,prob,prob_null,prob_forward,prob_backward,city
11086783,173478,174204,dott_lindau,DOE:VehicleType:dott_scooter,47.558609,9.700511,47.554054,9.707353,2025-10-26 14:28:06.893300,2025-10-26 14:33:06.895453,...,-0.900889,-2.255835,-1.204936,-4.361660,,9.999760e-01,0.000024,9.999760e-01,1.000000,Lindau
11086784,193246,194262,dott_lindau,DOE:VehicleType:dott_scooter,47.559006,9.708863,47.549881,9.715554,2025-10-26 16:41:06.873303,2025-10-26 16:47:06.880593,...,-1.113940,-2.210882,-1.762745,-5.087567,,9.999504e-01,0.000050,9.999504e-01,1.000000,Lindau
11086785,198450,201675,dott_lindau,DOE:VehicleType:dott_scooter,47.545464,9.681171,47.542492,9.727479,2025-10-26 17:17:01.545932,2025-10-26 17:38:06.892348,...,-3.195367,-2.162374,-3.466829,-8.824571,,9.979244e-01,0.002076,9.979244e-01,1.000000,Lindau
11086786,200463,203503,dott_lindau,DOE:VehicleType:dott_scooter,47.554203,9.697893,47.549873,9.715553,2025-10-26 17:31:06.963602,2025-10-26 17:51:06.885888,...,-1.353370,-4.973108,-1.432496,-7.758975,,4.510660e-02,0.000086,4.510660e-02,0.375679,Lindau
11086787,200463,201676,dott_lindau,DOE:VehicleType:dott_scooter,47.554203,9.697893,47.557240,9.717332,2025-10-26 17:31:06.963602,2025-10-26 17:38:06.892348,...,-1.379161,-2.444144,-1.857605,-5.680910,,9.548078e-01,0.000086,9.548078e-01,1.000000,Lindau
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11093311,16383288,16383983,dott_lindau,DOE:VehicleType:dott_scooter,47.546135,9.678994,47.550823,9.691631,2026-01-17 15:46:06.574474,2026-01-17 15:50:06.567231,...,-1.069295,-4.427324,-2.707502,-8.204121,,9.988829e-01,0.001117,9.988829e-01,1.000000,Lindau
11093312,16428829,16430010,dott_lindau,DOE:VehicleType:dott_scooter,47.558662,9.700469,47.547695,9.689380,2026-01-17 20:07:06.559873,2026-01-17 20:13:06.570730,...,-1.258187,-2.661371,-1.622798,-5.542356,,9.999219e-01,0.000078,9.999219e-01,1.000000,Lindau
11093313,16452850,16453809,dott_lindau,DOE:VehicleType:dott_scooter,47.556210,9.689784,47.558994,9.708833,2026-01-17 22:43:06.626858,2026-01-17 22:49:06.582784,...,-1.295926,-2.848386,-2.787659,-6.931971,,9.996867e-01,0.000313,9.996867e-01,1.000000,Lindau
11093314,16463636,16464589,dott_lindau,DOE:VehicleType:dott_scooter,47.547752,9.689366,47.560474,9.695858,2026-01-18 00:01:06.589325,2026-01-18 00:07:06.585640,...,-1.270711,-2.720352,-1.988464,-5.979527,,9.998791e-01,0.000121,9.998791e-01,1.000000,Lindau
