# 🧼 Data Preprocessing Pipeline

In this section, we run the preprocessing pipeline, which applies a sequence of steps to clean the data and prepare it for further analysis.

In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import sys
import os

# Put the project's `src` directory (two levels above the working directory)
# at the front of the import path so the `preprocessing` package can be imported.
# The path is made absolute so it stays valid if the working directory changes
# later, and the membership check keeps re-runs of this cell from stacking
# duplicate entries on `sys.path`.
path_to_preprocessing = os.path.abspath(os.path.join('..', '..', 'src'))
if path_to_preprocessing not in sys.path:
    sys.path.insert(0, path_to_preprocessing)

from preprocessing.preprocessing_pipeline import preprocess_files
from preprocessing.merge_data import merge_and_save_operator_data, merge_and_save_stop_data, merge_and_save_transport_data

In [95]:
# Run the preprocessing pipeline over the data folder.
# NOTE(review): judging by the flag names, existing outputs are overwritten and
# per-file progress printing is turned off — confirm in `preprocessing_pipeline`.
preprocess_files(overwrite_existing_file=True, print_progress=False)

📁 Found 253 valid file(s) in the data folder.


In [130]:
# Merge and save the operator data (helper imported from `preprocessing.merge_data`).
merge_and_save_operator_data()

In [167]:
# Merge and save the stop data (helper imported from `preprocessing.merge_data`).
# NOTE(review): `merge_and_save_transport_data` is imported alongside this helper
# but is not called in the visible cells, while a later cell reads
# `transports.parquet` — confirm that file is produced on a fresh
# Restart & Run All.
merge_and_save_stop_data()

In [1]:
import pandas as pd

# Load the processed transport table and preview its first five rows.
data = pd.read_parquet(path="../../data/processed/transports.parquet")
data.head(n=5)

Unnamed: 0,trip_id,product_id,line_text,transport_type,stop_id,arrival_time,departure_time,mean_arrival_delay,mean_departure_delay,median_arrival_delay,median_departure_delay,std_arrival_delay,std_departure_delay,n_arrival_delay,n_departure_delay,n_cancelled,n_through_trip,n_additional_trip,n_entries
0,80:06____:17171:000,Train,RB,RB,8500090,14:50:00,,293.939394,,120.0,,388.229414,,68,0,4,0,0,104
1,80:06____:17261:000,Train,RB,RB,8500090,,15:53:00,,61.621622,,0.0,,129.218022,0,9,1,0,0,104
2,80:800693:3053:000,Train,IRE3,IRE,8503424,11:58:00,12:00:00,151.539474,127.605263,41.0,19.0,627.797068,622.499501,60,73,2,0,0,78
3,80:80____:2887:000,Train,ICE,ICE,8500090,22:46:00,,1080.0,,1080.0,,,,1,0,0,0,1,1
4,80:sbg034:14004,Bus,Bus7349,B,8573327,09:07:00,,2.4,,0.0,,29.44332,,5,0,0,0,0,100


In [15]:
# Total number of rows in the transport table.
len(data)

65804937

In [24]:
# Keep only the rows whose `n_entries` value is strictly greater than 10.
data_sub = data.loc[data["n_entries"].gt(10)]

In [25]:
# Number of rows left after the `n_entries` filter.
len(data_sub)

9580419

In [26]:
# For every (trip_id, stop_id) pair, count the rows with a non-null
# `product_id`, then flatten the group keys back into ordinary columns.
res = (
    data_sub
    .groupby(['trip_id', 'stop_id'])['product_id']
    .count()
    .rename('count')
    .reset_index()
)

# Show the ten pairs with the highest counts.
res.sort_values(by='count', ascending=False).head(n=10)

Unnamed: 0,trip_id,stop_id,count
3964126,85:834:55232,8504723,12
3996553,85:834:65010,8588341,10
3999954,85:834:65142,8588341,10
3964052,85:834:55226,8504723,10
3999645,85:834:65130,8588341,10
3997165,85:834:65034,8588341,10
4000263,85:834:65154,8588341,10
2197198,85:801:205-1657,8572502,9
2212774,85:801:213-1657,8572502,9
2423327,85:801:315-1658,8572502,9


In [28]:
# Inspect the rows behind the (trip_id, stop_id) pair with the highest count
# in the table above.
data_sub[
    data_sub['trip_id'].eq('85:834:55232')
    & data_sub['stop_id'].eq(8504723)
]

Unnamed: 0,trip_id,product_id,line_text,transport_type,stop_id,arrival_time,departure_time,mean_arrival_delay,mean_departure_delay,median_arrival_delay,median_departure_delay,std_arrival_delay,std_departure_delay,n_arrival_delay,n_departure_delay,n_cancelled,n_through_trip,n_additional_trip,n_entries
9755472,85:834:55232,Bus,552,B,8504723,17:09:00,17:09:00,98.296296,113.481481,97.0,114.0,43.692822,45.907407,26,26,0,0,0,27
10413864,85:834:55232,Bus,552,B,8504723,18:09:00,18:09:00,118.108108,129.972973,94.0,109.0,73.132978,74.005438,37,37,0,0,0,37
17322532,85:834:55232,Bus,552,B,8504723,18:04:00,18:04:00,198.762712,212.898305,199.0,216.0,91.35179,92.168454,58,58,0,0,0,59
18638453,85:834:55232,Bus,552,B,8504723,17:57:00,17:57:00,162.847458,167.576271,176.0,178.0,96.660232,97.871521,55,55,0,0,0,59
31138708,85:834:55232,Bus,552,B,8504723,16:55:00,16:55:00,89.2,89.2,80.0,80.0,95.433416,95.433416,17,17,0,0,0,20
32126075,85:834:55232,Bus,552,B,8504723,16:57:00,16:57:00,171.217391,177.0,169.0,169.0,119.645757,120.958762,43,43,0,0,0,46
36073666,85:834:55232,Bus,552,B,8504723,17:02:00,17:02:00,135.2,139.85,92.0,95.0,116.391626,122.780195,20,20,0,0,0,20
44963072,85:834:55232,Bus,552,B,8504723,18:02:00,18:02:00,132.78,137.14,117.0,117.0,96.219411,100.981916,49,49,0,0,0,50
46609418,85:834:55232,Bus,552,B,8504723,16:58:00,16:58:00,181.0,187.296296,185.0,193.0,93.121096,90.377761,26,26,0,0,0,27
53849296,85:834:55232,Bus,552,B,8504723,17:58:00,17:58:00,207.486486,214.864865,207.0,211.0,125.954494,128.212966,36,36,0,0,0,37
