In [191]:
import os
import warnings
import hashlib
from dotenv import load_dotenv
from supabase import create_client, Client
import numpy as np
from tqdm import tqdm
import pandas as pd

warnings.filterwarnings("ignore")
load_dotenv()

True

In [192]:
url: str = os.getenv("SUPABASE_URL")
key: str = os.getenv("SUPABASE_KEY")
supabase: Client = create_client(url, key)

In [193]:
def get_all_data_paginated(table_name: str, page_size: int = 9999):
        all_data = []
        offset = 0
        while True:
            response = supabase.from_(table_name).select("*").range(offset, offset + page_size - 1).execute()
            data = response.data
            if not data:
                break
            all_data.extend(data)
            offset += page_size
        return all_data

In [194]:
trips = get_all_data_paginated("trips")
df = pd.DataFrame(trips)

In [195]:
df.sort_values(by=['date', 'train_id', 'sequence'], inplace=True)
subtrip_rows = []
for train_id, group in tqdm(df.groupby('train_id')):
    for i in range(len(group) - 1):
        row = group.iloc[i]
        next_row = group.iloc[i + 1]
        if next_row['sequence'] != row['sequence'] + 1:
            continue
        subtrip = row.copy()

        subtrip['next_station_name'] = next_row['station_name']

        # Adjust the departure and arrival times for the subtrip
        subtrip['scheduled_departure_time'] = row['scheduled_departure_time']
        subtrip['actual_departure_time'] = row['actual_departure_time']
        subtrip['scheduled_arrival_time'] = next_row['scheduled_arrival_time']
        subtrip['actual_arrival_time'] = next_row['actual_arrival_time']
        subtrip['sequence'] = row['sequence']
        subtrip['departure_delay'] = row['departure_delay']
        subtrip['arrival_delay'] = next_row['arrival_delay']

        # Append the new subtrip row
        subtrip_rows.append(subtrip)
df = pd.DataFrame(subtrip_rows)

100%|██████████| 239/239 [01:44<00:00,  2.28it/s]


In [196]:
# rename columns for clarity
df.rename(columns={
    'id': 'subtrip_id',
    'station_name': 'current_station',
    'next_station_name': 'next_station'}, inplace=True)

# Convert date and time columns to appropriate formats
df['date'] = pd.to_datetime(df['date']).dt.date
df['scheduled_arrival_time'] = pd.to_datetime(df['scheduled_arrival_time']).dt.time
df['scheduled_departure_time'] = pd.to_datetime(df['scheduled_departure_time']).dt.time
df['actual_arrival_time'] = pd.to_datetime(df['actual_arrival_time']).dt.time
df['actual_departure_time'] = pd.to_datetime(df['actual_departure_time']).dt.time
df['arrival_hour'] = pd.to_datetime(df['scheduled_arrival_time'].astype(str), errors='coerce').dt.hour
df['departure_hour'] = pd.to_datetime(df['scheduled_departure_time'].astype(str), errors='coerce').dt.hour


# add additional columns
df['trip_id'] = df.apply(lambda row: hashlib.sha256(f"{row['date']}_{row['initial_departure_station']}_{row['final_arrival_station']}_{row['train_id']}".encode()).hexdigest(), axis=1)
df['route'] = df['initial_departure_station'] + ' - ' + df['final_arrival_station']
df['day_of_week'] = pd.to_datetime(df['date']).dt.weekday
df['number_of_stations'] = df.groupby('trip_id')['current_station'].transform('nunique') + 1

# convert scheduled arrival and departure times to datetime, using date as year, month, day
df['scheduled_arrival_time'] = pd.to_datetime(df['date'].astype(str) + ' ' + df['scheduled_arrival_time'].astype(str), format='%Y-%m-%d %H:%M:%S')
df['scheduled_departure_time'] = pd.to_datetime(df['date'].astype(str) + ' ' + df['scheduled_departure_time'].astype(str), format='%Y-%m-%d %H:%M:%S')
df['scheduled_arrival_time_diff'] = df['scheduled_arrival_time'] - df.groupby('trip_id')['scheduled_departure_time'].transform('first')
df['scheduled_departure_time_diff'] = df['scheduled_departure_time'] - df.groupby('trip_id')['scheduled_departure_time'].transform('first')
# Adjust scheduled times to avoid negative differences
for row in tqdm(df.itertuples(), total=len(df), desc="Adjusting scheduled times"):
    if getattr(row, 'scheduled_arrival_time_diff') < pd.Timedelta(0):
        df.loc[df.subtrip_id == row.subtrip_id, 'scheduled_arrival_time'] += pd.Timedelta(days=1)
    if getattr(row, 'scheduled_departure_time_diff') < pd.Timedelta(0):
        df.loc[df.subtrip_id == row.subtrip_id, 'scheduled_departure_time'] += pd.Timedelta(days=1)

# Add trip duration in minutes
df['trip_duration'] = None
durations = dict()
for _id in tqdm(df['trip_id'].unique(), desc="Calculating trip durations"):
    trip_data = df[df['trip_id'] == _id].sort_values(by='sequence')
    start_time = pd.to_datetime(trip_data['scheduled_departure_time'].iloc[0].strftime('%H:%M:%S'), format='%H:%M:%S')
    end_time = pd.to_datetime(trip_data['scheduled_arrival_time'].iloc[-1].strftime('%H:%M:%S'), format='%H:%M:%S')
    trip_duration = (end_time - start_time).total_seconds() / 60  # in minutes
    durations[_id] = trip_duration
df['trip_duration'] = df['trip_id'].map(durations)

df['subtrip_duration'] = df.apply(lambda row: pd.to_datetime(row['scheduled_arrival_time'].strftime('%H:%M:%S'), format='%H:%M:%S') - pd.to_datetime(row['scheduled_departure_time'].strftime('%H:%M:%S'), format='%H:%M:%S'), axis=1)
df['subtrip_duration'] = df['subtrip_duration'].dt.total_seconds() / 60

# calculate distance in km for each trip and subtrip
with open('distances.csv', 'r') as f:
    distances = pd.read_csv(f)
_distance = dict()
for _id in tqdm(df['trip_id'].unique(), desc="Calculating trip distances"):

    _distance[_id] = distances[distances.initial_departure_station == df[df['trip_id'] == _id]['initial_departure_station'].iloc[0]][distances.final_arrival_station == df[df['trip_id'] == _id]['final_arrival_station'].iloc[0]].iloc[0]['distance']
df['trip_distance'] = df['trip_id'].map(_distance)

with open('subtrip_distances.csv', 'r') as f:
    subtrip_distances = pd.read_csv(f)
df['subtrip_distance'] = df.apply(lambda row: subtrip_distances[(subtrip_distances['departure_station'] == row['current_station']) & (subtrip_distances['arrival_station'] == row['next_station'])]['distance'].values[0] , axis=1)

Adjusting scheduled times: 100%|██████████| 124073/124073 [03:48<00:00, 543.24it/s] 
Calculating trip durations: 100%|██████████| 13920/13920 [03:48<00:00, 60.87it/s]
Calculating trip distances: 100%|██████████| 13920/13920 [07:35<00:00, 30.57it/s]


In [197]:
# add dwelling time
df['scheduled_dwelling_time'] = df['scheduled_departure_time'] - df.groupby('trip_id')['scheduled_arrival_time'].shift(1)
df['scheduled_dwelling_time'] = df['scheduled_dwelling_time'].dt.total_seconds() // 60  # convert to minutes
df['scheduled_dwelling_time'].fillna(0, inplace=True)

# add travelled and remaining distance
df['remaining_distance'] = df['trip_distance'] - df.groupby('trip_id')['subtrip_distance'].cumsum() + df['subtrip_distance']
df['travelled_distance'] = df.groupby('trip_id')['subtrip_distance'].cumsum()

In [198]:
df = df[['trip_id', 'subtrip_id', 'date', 'day_of_week', 'train_id', 'train_type', 'initial_departure_station', 'final_arrival_station', 'route', 'current_station', 'next_station', 'sequence', 'number_of_stations', 'trip_duration', 'subtrip_duration', 'trip_distance', 'subtrip_distance', 'travelled_distance', 'remaining_distance', 'scheduled_departure_time', 'scheduled_arrival_time', 'actual_departure_time', 'actual_arrival_time',  'departure_delay', 'arrival_delay', 'departure_hour', 'arrival_hour', 'scheduled_dwelling_time']]

In [199]:
weather = pd.read_csv('weather_data.csv')
weather

Unnamed: 0,date,latitude,longitude,station_name,timezone,timezone_abbreviation,temperature,relative_humidity,dew_point,apparent_temperature,precipitation,visibility,wind_speed,wind_direction,wind_gusts,uv_index,cloud_cover,surface_pressure
0,2025-05-18 00:00:00+01:00,31.6250,-8.0,MARRAKECH,b'Africa/Casablanca',b'GMT+1',19.371500,66.0,12.857771,19.605425,0.0,56440.0,4.582052,224.999900,10.799999,0.00,3.0,962.68420
1,2025-05-18 01:00:00+01:00,31.6250,-8.0,MARRAKECH,b'Africa/Casablanca',b'GMT+1',18.621500,69.0,12.822044,18.805630,0.0,50920.0,4.843305,228.012860,11.520000,0.00,0.0,962.08075
2,2025-05-18 02:00:00+01:00,31.6250,-8.0,MARRAKECH,b'Africa/Casablanca',b'GMT+1',17.771502,72.0,12.657198,17.877752,0.0,47080.0,5.001280,239.743650,12.599999,0.00,20.0,961.74380
3,2025-05-18 03:00:00+01:00,31.6250,-8.0,MARRAKECH,b'Africa/Casablanca',b'GMT+1',17.321500,72.0,12.224657,17.416338,0.0,46220.0,4.104631,232.124950,11.879999,0.00,8.0,961.28610
4,2025-05-18 04:00:00+01:00,31.6250,-8.0,MARRAKECH,b'Africa/Casablanca',b'GMT+1',16.971500,73.0,12.097652,17.088531,0.0,45400.0,3.671294,258.690100,9.720000,0.00,0.0,961.13020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153643,2025-07-22 19:00:00+01:00,34.5625,-3.0,MELG EL OUIDANE,b'Africa/Casablanca',b'GMT+1',30.908500,32.0,12.301646,27.427150,0.0,77880.0,28.739187,12.295259,47.160000,1.85,84.0,985.82510
153644,2025-07-22 20:00:00+01:00,34.5625,-3.0,MELG EL OUIDANE,b'Africa/Casablanca',b'GMT+1',29.258501,40.0,14.270977,27.158665,0.0,75860.0,23.936617,12.154927,44.280000,0.45,100.0,986.26470
153645,2025-07-22 21:00:00+01:00,34.5625,-3.0,MELG EL OUIDANE,b'Africa/Casablanca',b'GMT+1',28.358500,44.0,14.939544,27.668800,0.0,75480.0,15.978486,14.349371,36.360000,0.00,100.0,987.06146
153646,2025-07-22 22:00:00+01:00,34.5625,-3.0,MELG EL OUIDANE,b'Africa/Casablanca',b'GMT+1',28.058500,39.0,12.811695,27.077387,0.0,76100.0,12.768586,21.501507,28.080000,0.00,100.0,987.52124


In [200]:
weather['hour'] = pd.to_datetime(weather['date']).dt.hour
weather['date'] = pd.to_datetime(weather['date']).dt.date
weather.drop(columns=['latitude', 'longitude', 'timezone', 'timezone_abbreviation'], inplace=True)
weather.columns

Index(['date', 'station_name', 'temperature', 'relative_humidity', 'dew_point',
       'apparent_temperature', 'precipitation', 'visibility', 'wind_speed',
       'wind_direction', 'wind_gusts', 'uv_index', 'cloud_cover',
       'surface_pressure', 'hour'],
      dtype='object')

In [203]:
df = df.merge(weather.add_suffix('_on_departure'), left_on=['date', 'departure_hour', 'current_station'], right_on=['date_on_departure', 'hour_on_departure', 'station_name_on_departure'], how='left')
df = df.merge(weather.add_suffix('_on_arrival'), left_on=['date', 'arrival_hour', 'next_station'], right_on=['date_on_arrival', 'hour_on_arrival', 'station_name_on_arrival'], how='left')
df.drop(columns=['date_on_departure', 'hour_on_departure', 'station_name_on_departure', 'date_on_arrival', 'hour_on_arrival', 'station_name_on_arrival'], inplace=True)

In [204]:
df.head(10)

Unnamed: 0,trip_id,subtrip_id,date,day_of_week,train_id,train_type,initial_departure_station,final_arrival_station,route,current_station,...,dew_point_on_arrival,apparent_temperature_on_arrival,precipitation_on_arrival,visibility_on_arrival,wind_speed_on_arrival,wind_direction_on_arrival,wind_gusts_on_arrival,uv_index_on_arrival,cloud_cover_on_arrival,surface_pressure_on_arrival
0,475a318f125109daa63f2d22bbc975d685afbc3e2aadb5...,9ecce6061fb759c37f51dcd0578a3514724cb3d7d3da19...,2025-05-19,0,1,TNR,CASA PORT,KENITRA,CASA PORT - KENITRA,CASA PORT,...,14.521266,17.95211,0.0,23160.0,5.411986,93.814,14.04,0.0,62.0,1014.9131
1,475a318f125109daa63f2d22bbc975d685afbc3e2aadb5...,c457d66f9f93dff1bcb117f0c6b78947f552efd2634588...,2025-05-19,0,1,TNR,CASA PORT,KENITRA,CASA PORT - KENITRA,AIN SEBAA,...,14.502754,17.370037,0.0,17040.0,4.198285,120.96369,11.159999,0.0,62.0,1017.2997
2,475a318f125109daa63f2d22bbc975d685afbc3e2aadb5...,4801ea335f904b86b6ae8e7afeeaf483df6e521145cd08...,2025-05-19,0,1,TNR,CASA PORT,KENITRA,CASA PORT - KENITRA,MOHAMMEDIA,...,14.242029,17.002121,0.0,19620.0,6.725354,105.52418,11.879999,0.0,60.0,1013.9488
3,475a318f125109daa63f2d22bbc975d685afbc3e2aadb5...,098075ef8b0b1a9171cd5df6955895561f5dce9689fd02...,2025-05-19,0,1,TNR,CASA PORT,KENITRA,CASA PORT - KENITRA,BOUZNIKA,...,13.803034,16.893068,0.0,21980.0,4.582052,135.0001,10.799999,0.05,64.0,1014.1832
4,475a318f125109daa63f2d22bbc975d685afbc3e2aadb5...,08eec44849939fe9c3e0951a65d788facd5c8f0f672bd1...,2025-05-19,0,1,TNR,CASA PORT,KENITRA,CASA PORT - KENITRA,SKHIRAT,...,13.732921,16.79669,0.0,21980.0,4.582052,135.0001,10.799999,0.05,64.0,1012.8673
5,475a318f125109daa63f2d22bbc975d685afbc3e2aadb5...,b74581bc5cd440eaceb7c1b639f502aeb6a49f44fc2fc4...,2025-05-19,0,1,TNR,CASA PORT,KENITRA,CASA PORT - KENITRA,TEMARA,...,14.07438,17.858753,0.0,24580.0,3.096837,125.537766,7.559999,0.05,70.0,1016.58356
6,475a318f125109daa63f2d22bbc975d685afbc3e2aadb5...,dda781f00eb53be80258a9ca7aa9fbfb154d4309ecbf5e...,2025-05-19,0,1,TNR,CASA PORT,KENITRA,CASA PORT - KENITRA,RABAT AGDAL,...,14.155621,17.484896,0.0,19720.0,2.595997,146.3099,5.76,0.05,61.0,1016.31995
7,475a318f125109daa63f2d22bbc975d685afbc3e2aadb5...,57afa0e6cf4b3add3212ff3814da23b9dab01da1722ecd...,2025-05-19,0,1,TNR,CASA PORT,KENITRA,CASA PORT - KENITRA,RABAT VILLE,...,14.087091,17.876343,0.0,24580.0,3.096837,125.537766,7.559999,0.05,70.0,1016.8232
8,475a318f125109daa63f2d22bbc975d685afbc3e2aadb5...,5e6c5354151b797cfb5cdab8e46da1cc71bca0a1b58135...,2025-05-19,0,1,TNR,CASA PORT,KENITRA,CASA PORT - KENITRA,SALE,...,14.07438,17.858753,0.0,24580.0,3.096837,125.537766,7.559999,0.05,70.0,1016.58356
9,475a318f125109daa63f2d22bbc975d685afbc3e2aadb5...,a4c19364c73cbe258a54d46674352723512d939883b152...,2025-05-19,0,1,TNR,CASA PORT,KENITRA,CASA PORT - KENITRA,SALE TABRIQUET,...,14.391384,18.256615,0.0,20200.0,0.804984,333.43503,3.6,0.5,82.0,1017.30176


In [205]:
df.to_csv('../data/trips_data.csv', index=False)