In [133]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [134]:
# Location of the processed (v2) transport table, relative to this notebook.
TRANSPORT_DATA_V2_PATH = "../../data/processed/transports_v2.parquet"

# Load the whole table and preview its first rows.
transport_data = pd.read_parquet(path=TRANSPORT_DATA_V2_PATH)
transport_data.head(5)

Unnamed: 0,trip_id,product_id,line_text,transport_type,stop_id,arrival_time,departure_time,mean_arrival_delay,mean_departure_delay,median_arrival_delay,median_departure_delay,std_arrival_delay,std_departure_delay,n_arrival_delay,n_departure_delay,n_cancelled,n_through_trip,n_additional_trip,n_entries
0,80:06____:17171:000,Train,RB,RB,8500090,14:50:00,,293.939394,,120.0,,388.229414,,68,0,4,0,0,104
1,80:06____:17261:000,Train,RB,RB,8500090,,15:53:00,,61.621622,,0.0,,129.218022,0,9,1,0,0,104
2,80:800693:3053:000,Train,IRE3,IRE,8503424,11:58:00,12:00:00,151.539474,127.605263,41.0,19.0,627.797068,622.499501,60,73,2,0,0,78
4,80:sbg034:14004,Bus,Bus7349,B,8573327,09:07:00,,2.4,,0.0,,29.44332,,5,0,0,0,0,100
6,80:sbg034:55413,Bus,Bus7349,B,8503474,00:19:00,00:20:00,138.0,96.0,180.0,120.0,107.02591,84.852814,70,68,0,0,0,100


In [135]:
# Keep only the rows whose product is a train; copy so that columns added to
# train_data later do not write into transport_data.
is_train = transport_data['product_id'] == 'Train'
train_data = transport_data.loc[is_train].copy()

# Report how much of the full table the train subset represents.
n_train, n_total = len(train_data), len(transport_data)
print(f"Train entries represent {n_train / n_total:.2%} of the total entries.")
print(f"Trains have {n_train} entries.")

Train entries represent 6.33% of the total entries.
Trains have 349927 entries.


In [136]:
# Persist the train-only subset as its own parquet file.
# Note: this runs before the helper 'time' column is added further down,
# so the saved file does not contain that column.
TRAINS_DATA_V2_PATH = "../../data/processed/trains_v2.parquet"
train_data.to_parquet(path=TRAINS_DATA_V2_PATH)

## Example: the S9 line

In [137]:
# Single sort key per row: the arrival time, falling back to the departure
# time on rows where arrival_time is missing.
arrival_times = train_data['arrival_time']
departure_times = train_data['departure_time']
train_data['time'] = arrival_times.fillna(departure_times)

In [140]:
# Narrow the train rows down to the S9 line for a worked example.
is_s9 = train_data['line_text'] == 'S9'
s9_data = train_data.loc[is_s9]

n_s9_entries = len(s9_data)
print(f"There are {n_s9_entries:,} S9 entries.")
s9_data.head(5)

There are 11,222 S9 entries.


Unnamed: 0,trip_id,product_id,line_text,transport_type,stop_id,arrival_time,departure_time,mean_arrival_delay,mean_departure_delay,median_arrival_delay,median_departure_delay,std_arrival_delay,std_departure_delay,n_arrival_delay,n_departure_delay,n_cancelled,n_through_trip,n_additional_trip,n_entries,time
160,85:11:17909:001,Train,S9,S,8500218,08:53:00,,28.037037,,17.0,,67.089315,,88,0,13,0,0,148,08:53:00
161,85:11:17911:001,Train,S9,S,8500218,09:53:00,,35.948148,,25.0,,64.596603,,94,0,13,0,0,148,09:53:00
162,85:11:17918:001,Train,S9,S,8500031,13:19:00,13:19:00,85.185185,99.896296,72.0,87.0,66.813193,72.231828,135,135,13,0,0,148,13:19:00
163,85:11:17934:001,Train,S9,S,8500034,19:48:00,19:48:00,76.145631,100.116505,54.0,66.0,57.849434,51.423711,103,103,0,0,0,104,19:48:00
164,85:11:17935:001,Train,S9,S,8500034,21:14:00,21:14:00,80.333333,104.97037,36.0,76.0,88.145826,88.638789,134,134,13,0,0,148,21:14:00


In [141]:
# Distinct stop ids that appear on S9 rows, in order of first appearance.
s9_stop_id_column = s9_data['stop_id']
s9_data_stop_ids = s9_stop_id_column.unique()

n_s9_stop_ids = len(s9_data_stop_ids)
print(f"There are {n_s9_stop_ids:,} unique stop ids for the S9 line.")

There are 88 unique stop ids for the S9 line.


In [142]:
# Attach human-readable stop names to the S9 rows.
stops = pd.read_csv("../../data/processed/stops.csv")

# Guard the merge so this cell can be re-run safely: merging a second time
# would collide on the columns coming from `stops`, pandas would suffix them
# (stop_name_x / stop_name_y) and the 'stop_name' lookup below would raise
# a KeyError.
if 'stop_name' not in s9_data.columns:
    # Inner join (the pandas default): S9 rows whose stop_id is absent from
    # stops.csv are dropped.
    s9_data = s9_data.merge(stops, on='stop_id')

# Drop missing names before joining: str.join raises TypeError on NaN.
s9_data_stop_names = s9_data['stop_name'].dropna().unique()
print(f"The stops for the S9 line are: {', '.join(s9_data_stop_names)}.")

The stops for the S9 line are: Olten, Diepflingen, Buckten, Neuhausen Rheinfall, Neuhausen, Uster, Eglisau, Zürich Stadelhofen, Schwerzenbach ZH, Zürich Hardbrücke, Rafz, Stettbach, Jestetten, Lottstetten, Oberglatt ZH, Schaffhausen, Bülach, Glattbrugg, Seon, Baldegg, Hallwil, Ballwil, Waldibrücke, Hochdorf, Lenzburg, Beinwil am See, Mosen, Galmiz, Murten/Morat, Avenches, Palézieux, Corcelles-Nord, Moudon, Henniez, Domdidier, Ecublens-Rue, Lichtensteig, Dietfurt, Bütschwil, Wattwil, Bazenheid, Unterzollikofen, Lucens, Trimbach, Läufelfingen, Sissach, Nänikon-Greifensee, Niederglatt ZH, Hüntwangen-Wil, Luzern, Boniswil, Ermensee, Baldegg Kloster, Gelfingen, Birrwil, Emmenbrücke Gersag, Hochdorf Schönau, Faoug, Muntelier-Löwenberg, Lütisburg, Bern, Bern Felsenau, Rümlingen, Sommerau, Glattfelden, Rümlang, Zürich Oerlikon, Hitzkirch, Granges-Marnand, Dompierre FR, Puidoux, Kerzers, Wil SG, Worblaufen, Steinibach, Châtillens, Eschenbach LU, Payerne, Lausanne, Zürich HB, Bern Tiefenau, Dübe

In [145]:
# One row per S9 trip, holding three parallel lists ordered by the 'time'
# column: the stop ids visited, the mean arrival delay at each stop, and the
# arrival time at each stop.
# NOTE(review): 'time' appears to hold time-of-day values, so a trip running
# past midnight may be ordered incorrectly -- confirm against the data.
s9_by_trip = s9_data.sort_values(['trip_id', 'time']).groupby('trip_id')

trip_stops = s9_by_trip.agg(
    stop_ids=('stop_id', list),
    mean_arrival_delays=('mean_arrival_delay', list),
    arrival_times=('arrival_time', list),
).reset_index()

trip_stops.head(5)

Unnamed: 0,trip_id,stop_ids,mean_arrival_delays,arrival_times
0,85:11:14089:011,"[8501120, 8504000, 8504010, 8504011, 8504012, ...","[nan, 89.8, 55.75, 115.925, 102.325, 157.225, ...","[None, 00:06:00, 00:09:00, 00:12:00, 00:16:00,..."
1,85:11:14229:005,"[8501120, 8504000, 8504010, 8504011, 8504012, ...","[nan, 121.20833333333333, 83.89583333333333, 1...","[None, 12:06:00, 12:09:00, 12:12:00, 12:16:00,..."
2,85:11:14235:006,"[8501120, 8504000, 8504010, 8504011, 8504012, ...","[nan, 84.28571428571429, 46.183673469387756, 1...","[None, 13:06:00, 13:09:00, 13:12:00, 13:16:00,..."
3,85:11:14236:004,"[8504400, 8504129, 8504140, 8504128, 8504127, ...","[nan, 70.53061224489795, 50.775510204081634, 3...","[None, 12:07:00, 12:10:00, 12:13:00, 12:24:00,..."
4,85:11:14239:006,"[8501120, 8504000, 8504010, 8504011, 8504012, ...","[nan, 97.29166666666667, 55.895833333333336, 1...","[None, 14:06:00, 14:09:00, 14:12:00, 14:16:00,..."


In [146]:
# Print every trip with the name and arrival time of each of its stops.

# Build the stop_id -> stop_name lookup once, instead of filtering the whole
# `stops` frame for every printed stop. drop_duplicates keeps the first row
# per stop_id, which matches the previous `.values[0]` choice.
unique_stops = stops.drop_duplicates('stop_id')
stop_name_by_id = dict(zip(unique_stops['stop_id'], unique_stops['stop_name']))

for trip in trip_stops.itertuples():
    # trip.Index is the row label of trip_stops (not the trip_id).
    print(f"Trip {trip.Index}:")
    for stop_id, arrival_time in zip(trip.stop_ids, trip.arrival_times):
        print(f"\t{stop_name_by_id[stop_id]}: {arrival_time}")
    print()

Trip 0:
	Lausanne: None
	Pully-Nord: 00:06:00
	La Conversion: 00:09:00
	Grandvaux: 00:12:00
	Puidoux: 00:16:00
	Moreillon: 00:18:00
	Palézieux: 00:24:00
	Palézieux-Village: 00:33:00
	Châtillens: 00:36:00
	Ecublens-Rue: 00:42:00
	Moudon: 00:51:00
	Lucens: 00:57:00
	Henniez: 01:02:00
	Granges-Marnand: 01:05:00
	Payerne: 01:13:00
	Corcelles-Nord: 01:20:00
	Dompierre FR: 01:23:00
	Domdidier: 01:25:00
	Avenches: 01:29:00
	Faoug: 01:33:00
	Murten/Morat: 01:38:00
	Muntelier-Löwenberg: 01:49:00
	Galmiz: 01:51:00
	Kerzers: 01:56:00

Trip 1:
	Lausanne: None
	Pully-Nord: 12:06:00
	La Conversion: 12:09:00
	Grandvaux: 12:12:00
	Puidoux: 12:16:00
	Moreillon: 12:18:00
	Palézieux: 12:24:00
	Palézieux-Village: 12:33:00
	Châtillens: 12:36:00
	Ecublens-Rue: 12:42:00
	Moudon: 12:51:00
	Lucens: 12:57:00
	Henniez: 13:02:00
	Granges-Marnand: 13:05:00
	Payerne: 13:13:00
	Corcelles-Nord: 13:20:00
	Dompierre FR: 13:23:00
	Domdidier: 13:25:00
	Avenches: 13:29:00
	Faoug: 13:33:00
	Murten/Morat: 13:38:00
	Muntelie