In [None]:
import os
from supabase import create_client, Client
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm

import seaborn as sns

# Load settings from a .env file (python-dotenv) before anything reads the environment.
load_dotenv()

# Plot defaults. The relative order of these two calls is kept as-is:
# matplotlib style sheet first, seaborn colour palette second.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("muted")

In [None]:
# Connection settings are read from the environment. os.getenv returns None for an
# unset variable, so default to "" to keep the `str` annotations truthful and check
# both values explicitly before handing them to the client factory.
url: str = os.getenv("SUPABASE_URL", "")
key: str = os.getenv("SUPABASE_KEY", "")
if not url or not key:
    raise RuntimeError(
        "SUPABASE_URL and SUPABASE_KEY must both be set in the environment "
        "(for example in a .env file) before creating the Supabase client."
    )

# Module-level client used by the data-loading helper in this notebook.
supabase: Client = create_client(url, key)

In [None]:
def get_all_data_paginated(table_name: str, page_size: int = 9999, order_by: "str | None" = None) -> list:
    """Fetch every row of a table by requesting it in consecutive row ranges.

    Uses the module-level ``supabase`` client. Requests are issued until one
    comes back empty.

    Args:
        table_name: Name of the table to read.
        page_size: Maximum number of rows asked for per request. The server is
            allowed to answer with fewer rows than requested (for example when
            it enforces its own row cap); the next request always starts right
            after the last row actually received, so no rows are skipped.
        order_by: Optional column name to sort by. Without an explicit order
            the backend does not promise that consecutive ranges line up, so
            pass a unique column here when a stable result is required.
            ``None`` (the default) sends no ordering, as before.

    Returns:
        A list holding the ``data`` payload of every response, concatenated in
        request order. Empty if the first request returns no rows.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError(f"page_size must be >= 1, got {page_size}")

    all_data = []
    offset = 0
    while True:
        query = supabase.from_(table_name).select("*")
        if order_by is not None:
            query = query.order(order_by)
        response = query.range(offset, offset + page_size - 1).execute()
        data = response.data
        if not data:
            break
        all_data.extend(data)
        # Advance by what was actually received, not by what was asked for:
        # a short page would otherwise leave a gap of unread rows.
        offset += len(data)
    return all_data

In [None]:
# Pull the full contents of the "trips" table (default page size).
trips = get_all_data_paginated(table_name="trips")

In [None]:
# Build a DataFrame from the fetched trip records.
df = pd.DataFrame(trips)

# Parse the date column and keep only the calendar date.
df['date'] = pd.to_datetime(df['date']).dt.date

# The four schedule columns all get the same treatment:
# parse as datetimes, then keep only the time-of-day component.
for time_col in (
    'theorical_arrival_time',
    'theorical_departure_time',
    'actual_arrival_time',
    'actual_departure_time',
):
    df[time_col] = pd.to_datetime(df[time_col]).dt.time

# Display the formatted frame as the cell output.
df

In [None]:
# Add a 'route' label to every row: "<initial departure station> - <final arrival station>".
# Built with element-wise `+` on the two columns.
# NOTE(review): rows where either station is missing presumably end up with a missing route — confirm.
df['route'] = df['initial_departure_station'] + ' - ' + df['final_arrival_station']

In [None]:
# How many distinct dates does the dataset cover? (missing dates are not counted)
days = len(df['date'].dropna().unique())
print(f"Number of days in the dataset: {days}")

In [None]:
# Number of rows per route, restricted to rows whose `sequence` equals 1.
# NOTE(review): presumably sequence == 1 marks a trip's first stop, making this a trip count per route — confirm.
df.loc[df['sequence'] == 1, 'route'].value_counts()

In [None]:
# distribution of delay frequency for each route
# Only rows recorded at the trip's final arrival station are used. The filter is
# computed once here instead of being rebuilt for every route inside the loop.
final_stops = df[df.station_name == df.final_arrival_station]
routes = final_stops['route'].unique()
n_routes = len(routes)
n_cols = 3
n_rows = int(np.ceil(n_routes / n_cols))

if n_routes == 0:
    # A grid with zero rows cannot be created, so report and skip the figure.
    print("No rows found at a final arrival station; nothing to plot.")
else:
    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)

    # axes.flat walks the grid row by row, so the i-th route lands in the i-th cell.
    for ax, route in zip(axes.flat, routes):
        group = final_stops[final_stops.route == route]
        sns.histplot(group['arrival_delay'], bins=30, kde=True, ax=ax)
        ax.set_title(f'Distribution of Arrival Delay for Route: {route}')
        ax.set_xlabel('Arrival Delay (minutes)')
        ax.set_ylabel('Frequency')

    # Hide any unused subplots
    for ax in axes.flat[n_routes:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()




In [None]:
import geopy.distance

In [None]:
# Two hard-coded coordinate pairs used as a quick distance check.
# NOTE(review): presumably (latitude, longitude) in decimal degrees, the order geopy expects — confirm.
coords_1 = (33.5895156, -7.5911789)
coords_2 = (32.994123, -7.623379)

# Print the geodesic distance between the two points, in kilometres.
print(geopy.distance.geodesic(coords_1, coords_2).km)