In [279]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [280]:
TRANSPORT_DATA_V2_PATH = "../../data/processed/transports_v2.parquet"

transport_data = pd.read_parquet(TRANSPORT_DATA_V2_PATH)
transport_data.head()

Unnamed: 0,trip_id,product_id,line_text,transport_type,stop_id,arrival_time,departure_time,mean_arrival_delay,mean_departure_delay,median_arrival_delay,median_departure_delay,std_arrival_delay,std_departure_delay,n_arrival_delay,n_departure_delay,n_cancelled,n_through_trip,n_additional_trip,n_entries
0,80:06____:17171:000,Train,RB,RB,8500090,14:50:00,,293.939394,,120.0,,388.229414,,68,0,4,0,0,104
1,80:06____:17261:000,Train,RB,RB,8500090,,15:53:00,,61.621622,,0.0,,129.218022,0,9,1,0,0,104
2,80:800693:3053:000,Train,IRE3,IRE,8503424,11:58:00,12:00:00,151.539474,127.605263,41.0,19.0,627.797068,622.499501,60,73,2,0,0,78
4,80:sbg034:14004,Bus,Bus7349,B,8573327,09:07:00,,2.4,,0.0,,29.44332,,5,0,0,0,0,100
6,80:sbg034:55413,Bus,Bus7349,B,8503474,00:19:00,00:20:00,138.0,96.0,180.0,120.0,107.02591,84.852814,70,68,0,0,0,100


In [281]:
print(f"Number of entries: {transport_data.shape[0]}")

Number of entries: 5527916


In [282]:
def remove_duplicates_without_line_text(data):
    new_data = data.copy()
    new_data = new_data.drop_duplicates(subset=['trip_id', 'product_id', 'transport_type', 'stop_id', 'arrival_time', 'departure_time'], keep="first")
    return new_data

transport_data = remove_duplicates_without_line_text(transport_data)
print(f"Number of entries: {transport_data.shape[0]}")

Number of entries: 5526137


In [283]:
# Add time column to sort
transport_data['time'] = transport_data['arrival_time'].fillna(transport_data['departure_time'])

In [243]:
train_data = transport_data[transport_data['product_id'] == 'Train'].copy()
print(f"Train entries represent {train_data.shape[0] / transport_data.shape[0]:.2%} of the total entries.")
print(f"Number of entries: {train_data.shape[0]}")

Train entries represent 6.30% of the total entries.
Number of entries: 348148


In [244]:
# Save the train data
TRAINS_DATA_V2_PATH = "../../data/processed/trains_v2.parquet"
train_data.to_parquet(TRAINS_DATA_V2_PATH)

In [285]:
train_data.sort_values(['trip_id', 'time']).head()

Unnamed: 0,trip_id,product_id,line_text,transport_type,stop_id,arrival_time,departure_time,mean_arrival_delay,mean_departure_delay,median_arrival_delay,median_departure_delay,std_arrival_delay,std_departure_delay,n_arrival_delay,n_departure_delay,n_cancelled,n_through_trip,n_additional_trip,n_entries,time
50014196,80:06____:17001:000,Train,RE7,RE,8500090,07:29:00,,177.831325,,120.0,,212.120813,,74,0,3,0,0,86,07:29:00
17108720,80:06____:17001:000,Train,RE7,RE,8500090,08:29:00,,182.608696,,60.0,,238.778229,,115,0,4,0,0,142,08:29:00
14475623,80:06____:17003:000,Train,RE7,RE,8500090,09:03:00,,140.526316,,0.0,,263.554648,,34,0,3,0,0,79,09:03:00
59228240,80:06____:17003:000,Train,RE7,RE,8500090,10:03:00,,196.438356,,60.0,,413.927763,,76,0,2,1,0,148,10:03:00
48698740,80:06____:17005:000,Train,RE7,RE,8500090,10:29:00,,308.93617,,120.0,,620.798078,,83,0,2,0,0,96,10:29:00


In [286]:
def remove_single_stop_journeys(data):
    """
    Remove journeys that only have the same stop id in each of its entries.
    """
    new_data = data.copy()
    # Group by 'trip_id' and check if all 'stop_id' values are the same
    grouped_data = new_data.groupby('trip_id').agg({
        'stop_id': lambda x: len(x.unique()) == 1
    }).reset_index().rename(columns={'stop_id': 'single_stop'})

    # Merge the grouped_data with the original DataFrame
    data_with_single_stop = new_data.merge(grouped_data, on='trip_id')

    # Filter out the rows with single_stop == True
    filtered_data = data_with_single_stop[~data_with_single_stop['single_stop']].drop(columns=['single_stop'])

    return filtered_data

train_data = remove_single_stop_journeys(train_data)
print(f"Number of entries: {train_data.shape[0]}")
train_data.sort_values(['trip_id', 'time']).head()

Number of entries: 5524471


Unnamed: 0,trip_id,product_id,line_text,transport_type,stop_id,arrival_time,departure_time,mean_arrival_delay,mean_departure_delay,median_arrival_delay,median_departure_delay,std_arrival_delay,std_departure_delay,n_arrival_delay,n_departure_delay,n_cancelled,n_through_trip,n_additional_trip,n_entries,time
5517226,80:800693:3040:000,Train,IRE3,IRE,8503424,06:58:00,07:00:00,307.0,252.146667,101.0,39.0,644.312728,638.846885,75,75,1,0,0,76,06:58:00
5517228,80:800693:3040:000,Train,IRE3,IRE,8503424,07:58:00,08:00:00,158.090278,114.597222,92.0,38.0,356.211415,351.58839,142,144,4,0,0,148,07:58:00
5517225,80:800693:3040:000,Train,IRE3,IRE,8500090,08:15:00,,126.575342,,0.0,,437.32844,,17,0,2,0,0,76,08:15:00
5517227,80:800693:3040:000,Train,IRE3,IRE,8500090,09:15:00,,63.75,,0.0,,341.497883,,22,0,4,0,0,148,09:15:00
5203290,80:800693:3041:000,Train,RB,RB,8500090,,05:58:00,,,,,,,0,0,2,0,0,80,05:58:00


In [287]:
def enhance_trip_id(data):
    new_data = data.copy()

    # Sort data by trip_id, stop_id, and time
    new_data = new_data.sort_values(['trip_id', 'stop_id', 'time'])

    # Group data by trip_id and stop_id and assign an increasing number for each group
    new_data['group_number'] = new_data \
        .groupby(['trip_id', 'stop_id']) \
        .cumcount() + 1

    # Update the trip_id by appending the group_number
    new_data['trip_id'] = new_data['trip_id'].astype(str) + '_' + new_data['group_number'].astype(str)

    # Remove the group_number column
    new_data = new_data.drop(columns=['group_number'])

    return new_data

train_data = enhance_trip_id(train_data)
train_data.sort_values(['trip_id', 'time']).head()
print(f"Number of entries: {train_data.shape[0]}")

Number of entries: 5524471


In [288]:
def add_trip_index(data):
    new_data = data.copy()
    # Add a temporary unique identifier to the original data
    new_data['_temp_id'] = np.arange(len(data))
    
    trip_stops = new_data \
        .sort_values(['trip_id', 'time']) \
        .groupby('trip_id') \
        .agg({
            'stop_id': lambda x: list(x),
            '_temp_id': lambda x: list(x)
        }) \
        .rename(columns={
            'stop_id': 'stop_ids', 
        }) \
        .reset_index()

    # Add indexes
    trip_stops['stop_indexes'] = trip_stops['stop_ids'].apply(lambda x: list(range(len(x))))
    
    # Create a new DataFrame with stop_ids, temp_ids and their corresponding indexes
    exploded_trip_stops = pd.concat(
        [pd.DataFrame(
            {'trip_id': x['trip_id'], 'stop_id': x['stop_ids'], '_temp_id': x['_temp_id'], 'trip_index': list(range(len(x['stop_ids'])))}
        ) for i, x in trip_stops.iterrows()], ignore_index=True)
    
    # Merge with the original DataFrame (using the temporary unique identifier)
    result = new_data.merge(exploded_trip_stops, on=['trip_id', 'stop_id', '_temp_id'])

    # Remove the temporary unique identifier column
    result = result.drop(columns=['_temp_id'])

    return result

train_data = add_trip_index(train_data)
print(f"Number of entries: {train_data.shape[0]}")
train_data.sort_values(['trip_id', 'time']).head()

Number of entries: 5524471


Unnamed: 0,trip_id,product_id,line_text,transport_type,stop_id,arrival_time,departure_time,mean_arrival_delay,mean_departure_delay,median_arrival_delay,...,std_arrival_delay,std_departure_delay,n_arrival_delay,n_departure_delay,n_cancelled,n_through_trip,n_additional_trip,n_entries,time,trip_index
2,80:800693:3040:000_1,Train,IRE3,IRE,8503424,06:58:00,07:00:00,307.0,252.146667,101.0,...,644.312728,638.846885,75,75,1,0,0,76,06:58:00,0
0,80:800693:3040:000_1,Train,IRE3,IRE,8500090,08:15:00,,126.575342,,0.0,...,437.32844,,17,0,2,0,0,76,08:15:00,1
3,80:800693:3040:000_2,Train,IRE3,IRE,8503424,07:58:00,08:00:00,158.090278,114.597222,92.0,...,356.211415,351.58839,142,144,4,0,0,148,07:58:00,0
1,80:800693:3040:000_2,Train,IRE3,IRE,8500090,09:15:00,,63.75,,0.0,...,341.497883,,22,0,4,0,0,148,09:15:00,1
4,80:800693:3041:000_1,Train,RB,RB,8500090,,05:58:00,,,,...,,,0,0,2,0,0,80,05:58:00,0


## Example S9

In [249]:
s9_data = train_data[train_data['line_text'] == 'S9']
print(f"There are {s9_data.shape[0]:,} S9 entries.")
s9_data.sort_values(['trip_id', 'time']).head()

There are 11,222 S9 entries.


Unnamed: 0,trip_id,product_id,line_text,transport_type,stop_id,arrival_time,departure_time,mean_arrival_delay,mean_departure_delay,median_arrival_delay,...,std_arrival_delay,std_departure_delay,n_arrival_delay,n_departure_delay,n_cancelled,n_through_trip,n_additional_trip,n_entries,time,trip_index
1975,85:11:14089:011_1,Train,S9,S,8501120,,00:04:00,,32.375,,...,,36.022562,0,40,0,0,40,40,00:04:00,0
1976,85:11:14089:011_1,Train,S9,S,8504000,00:06:00,00:07:00,89.8,57.775,77.0,...,41.465587,44.396516,40,40,0,0,40,40,00:06:00,1
1978,85:11:14089:011_1,Train,S9,S,8504010,00:09:00,00:09:00,55.75,76.2,49.0,...,44.857467,48.447804,40,40,0,0,40,40,00:09:00,2
1979,85:11:14089:011_1,Train,S9,S,8504011,00:12:00,00:12:00,115.925,144.625,115.0,...,51.286719,54.62914,40,40,0,0,40,40,00:12:00,3
1980,85:11:14089:011_1,Train,S9,S,8504012,00:16:00,00:16:00,102.325,132.925,97.0,...,57.884229,60.299419,40,40,0,0,40,40,00:16:00,4


In [250]:
s9_data_stop_ids = s9_data['stop_id'].unique()
print(f"There are {len(s9_data_stop_ids):,} unique stop ids for the S9 line.")

There are 88 unique stop ids for the S9 line.


In [251]:
stops = pd.read_csv("../../data/processed/stops.csv")
s9_data = s9_data.merge(stops, on='stop_id')
s9_data_stop_names = s9_data['stop_name'].unique()
print(f"The stops for the S9 line are: {', '.join(s9_data_stop_names)}.")

The stops for the S9 line are: Lausanne, Pully-Nord, Moreillon, La Conversion, Grandvaux, Puidoux, Palézieux, Palézieux-Village, Châtillens, Ecublens-Rue, Moudon, Lucens, Granges-Marnand, Corcelles-Nord, Dompierre FR, Domdidier, Avenches, Faoug, Murten/Morat, Galmiz, Payerne, Muntelier-Löwenberg, Henniez, Kerzers, Sissach, Diepflingen, Sommerau, Rümlingen, Buckten, Läufelfingen, Trimbach, Olten, Zürich HB, Zürich Oerlikon, Zürich Hardbrücke, Glattbrugg, Rümlang, Oberglatt ZH, Niederglatt ZH, Bülach, Glattfelden, Eglisau, Hüntwangen-Wil, Rafz, Lottstetten, Jestetten, Neuhausen, Schaffhausen, Neuhausen Rheinfall, Zürich Stadelhofen, Uster, Nänikon-Greifensee, Schwerzenbach ZH, Dübendorf, Stettbach, Seon, Hallwil, Boniswil, Birrwil, Beinwil am See, Lenzburg, Waldibrücke, Ermensee, Emmenbrücke, Eschenbach LU, Ballwil, Hochdorf, Baldegg, Gelfingen, Hitzkirch, Emmenbrücke Gersag, Mosen, Hochdorf Schönau, Luzern, Baldegg Kloster, Wattwil, Lichtensteig, Dietfurt, Bütschwil, Lütisburg, Bazenhei

In [252]:
s9_data = s9_data.merge(stops, on='stop_id')
s9_data.sort_values(['trip_id', 'time']).head()

Unnamed: 0,trip_id,product_id,line_text,transport_type,stop_id,arrival_time,departure_time,mean_arrival_delay,mean_departure_delay,median_arrival_delay,...,n_additional_trip,n_entries,time,trip_index,stop_name_x,stop_lon_x,stop_lat_x,stop_name_y,stop_lon_y,stop_lat_y
0,85:11:14089:011_1,Train,S9,S,8501120,,00:04:00,,32.375,,...,40,40,00:04:00,0,Lausanne,2537875.0,1152042.0,Lausanne,2537875.0,1152042.0
85,85:11:14089:011_1,Train,S9,S,8504000,00:06:00,00:07:00,89.8,57.775,77.0,...,40,40,00:06:00,1,Pully-Nord,2540090.0,1151785.0,Pully-Nord,2540090.0,1151785.0
103,85:11:14089:011_1,Train,S9,S,8504010,00:09:00,00:09:00,55.75,76.2,49.0,...,40,40,00:09:00,2,La Conversion,2541649.0,1151592.0,La Conversion,2541649.0,1151592.0
112,85:11:14089:011_1,Train,S9,S,8504011,00:12:00,00:12:00,115.925,144.625,115.0,...,40,40,00:12:00,3,Grandvaux,2545034.0,1149767.0,Grandvaux,2545034.0,1149767.0
121,85:11:14089:011_1,Train,S9,S,8504012,00:16:00,00:16:00,102.325,132.925,97.0,...,40,40,00:16:00,4,Puidoux,2548335.0,1149392.0,Puidoux,2548335.0,1149392.0


## Into Network

In [289]:
def create_network_representation(data):
    # Create a new DataFrame with columns trip_id, trip_index, stop_id, and n_entries
    network_data = data[['trip_id', 'trip_index', 'stop_id', 'n_entries']].copy()

    # Add a new column next_stop_id that contains the stop_id of the trip_index i+1 for each trip_id
    network_data['next_stop_id'] = network_data.groupby('trip_id')['stop_id'].shift(-1)

    # Remove rows with missing next_stop_id
    network_data = network_data.dropna(subset=['next_stop_id'])

    # Group by stop_id, next_stop_id, and sum up the n_entries for duplicate edges
    network_data = network_data.groupby(['stop_id', 'next_stop_id'])['n_entries'].sum().reset_index()

    return network_data

network_data = create_network_representation(train_data)
network_data.head()

Unnamed: 0,stop_id,next_stop_id,n_entries
0,8500010,8500016.0,11443
1,8500010,8500020.0,43662
2,8500010,8500021.0,522
3,8500010,8500023.0,31297
4,8500010,8500090.0,33589


In [290]:
# merge with stops
network_data = network_data \
    .merge(stops, left_on='stop_id', right_on='stop_id') \
    .rename(columns={
        'stop_id': 'source',
        'stop_name': 'source_name'
        }) \
    .merge(stops, left_on='next_stop_id', right_on='stop_id') \
    .rename(columns={
        'stop_id': 'target',
        'stop_name': 'target_name'
    }) \
    .rename(columns={
        'n_entries': 'weight'
    })[['source', 'source_name', 'target', 'target_name', 'weight']]
network_data.head()

Unnamed: 0,source,source_name,target,target_name,weight
0,8500010,Basel SBB,8500016,Basel St. Johann,11443
1,8500010,Basel SBB,8500020,Muttenz,43662
2,8500010,Basel SBB,8500021,Pratteln,522
3,8500020,Muttenz,8500021,Pratteln,43361
4,8500010,Basel SBB,8500023,Liestal,31297


In [291]:
# Write in csv
network_data.to_csv("../../data/processed/network_edges.csv", index=False)

In [292]:
# create nodes
nodes_1 = network_data[['source', 'source_name']] \
    .drop_duplicates() \
    .rename(columns={
        'source': 'id',
        'source_name': 'label'
    })
nodes_2 = network_data[['target', 'target_name']] \
    .drop_duplicates() \
    .rename(columns={
        'target': 'id',
        'target_name': 'label'
    })
nodes = pd.concat([nodes_1, nodes_2]) \
    .drop_duplicates()
nodes.head()

Unnamed: 0,id,label
0,8500010,Basel SBB
3,8500020,Muttenz
6,8500022,Frenkendorf-Füllinsdorf
8,8500023,Liestal
10,8500104,Court


In [293]:
nodes.to_csv("../../data/processed/network_nodes.csv", index=False)