In [13]:
import pandas as pd
import requests
import json
import os
import io
from dotenv import load_dotenv
import logging
from sqlalchemy import create_engine, text

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
    roc_auc_score,
    roc_curve,
    auc,
    ConfusionMatrixDisplay
)

# Load variables from a .env file (python-dotenv) into the process environment
# before any cell reads configuration through os.getenv (e.g. DATABASE_URL).
load_dotenv()

True

In [14]:
# Connection string for the database; None when DATABASE_URL is not set.
db_url = os.environ.get("DATABASE_URL")

In [None]:
# Redundant re-import: `text` is already imported in the first cell.
from sqlalchemy import text
# Build an engine from db_url; pool_pre_ping=True is forwarded to create_engine.
engine = create_engine(
    db_url,
    pool_pre_ping=True,
)
# engine.begin() runs the statement inside a transaction scoped to the `with` block.
with engine.begin() as conn:
    # NOTE(review): the table name "stg_transport_" ends with an underscore and
    # looks truncated — confirm the intended table before running this cell.
    # TRUNCATE removes every row of the target table.
    conn.execute(text("TRUNCATE TABLE stg_transport_"))


In [3]:
# Engine shared by the cells below.
engine = create_engine(db_url, pool_pre_ping=True)

# Full-table reads of the two archive staging tables.
transport_query = text("SELECT * FROM stg_transport_archive")
weather_query = text("SELECT * FROM stg_weather_archive")

with engine.connect() as conn:
    # .mappings().all() materialises each result set as a list of dict-like rows.
    result_transport = conn.execute(transport_query)
    transport_data = result_transport.mappings().all()

    result_weather = conn.execute(weather_query)
    weather_data = result_weather.mappings().all()

In [None]:
# One-off schema migration for stg_transport_archive:
#   1. drop the legacy columns,
#   2. add the new column set (including an identity `id`),
#   3. make `id` the primary key when the table has no primary key yet.
# `text`, `os` and `create_engine` come from the import cell at the top.
db_url = os.getenv("DATABASE_URL")
engine = create_engine(
    db_url,
    pool_pre_ping=True,
)

with engine.begin() as conn:
    # Step 1 — drop legacy columns. `timestamp` and `stop_sequence` are dropped
    # here and re-created in step 2, so their existing values are discarded.
    conn.execute(text("""
        ALTER TABLE stg_transport_archive
          DROP COLUMN IF EXISTS entity_id,
          DROP COLUMN IF EXISTS trip_id,
          DROP COLUMN IF EXISTS route_id,
          DROP COLUMN IF EXISTS stop_sequence,
          DROP COLUMN IF EXISTS stop_id,
          DROP COLUMN IF EXISTS stop_arrival_delay,
          DROP COLUMN IF EXISTS timestamp,
          DROP COLUMN IF EXISTS route_id_static,
          DROP COLUMN IF EXISTS route_short_name,
          DROP COLUMN IF EXISTS route_type,
          DROP COLUMN IF EXISTS timestamp_dt,
          DROP COLUMN IF EXISTS timestamp_rounded;
    """))

    # Step 2 — add the new columns; IF NOT EXISTS keeps the cell re-runnable.
    conn.execute(text("""
        ALTER TABLE stg_transport_archive
          ADD COLUMN IF NOT EXISTS id BIGINT GENERATED ALWAYS AS IDENTITY,
          ADD COLUMN IF NOT EXISTS timestamp BIGINT,
          ADD COLUMN IF NOT EXISTS direction_id INT,
          ADD COLUMN IF NOT EXISTS stop_sequence INT,
          ADD COLUMN IF NOT EXISTS arrival_delay INT,
          ADD COLUMN IF NOT EXISTS departure_delay INT,
          ADD COLUMN IF NOT EXISTS datetime_rounded TIMESTAMPTZ,
          ADD COLUMN IF NOT EXISTS hour INT,
          ADD COLUMN IF NOT EXISTS bus_nbr TEXT;
    """))


with engine.begin() as conn:
    # Make sure the id column exists (no-op when step 2 already created it).
    conn.execute(text("""
        ALTER TABLE stg_transport_archive
        ADD COLUMN IF NOT EXISTS id BIGINT GENERATED ALWAYS AS IDENTITY;
    """))

    # Step 3 — add the primary key only if this table does not have one yet.
    # The check is scoped to the table (conrelid) and to primary-key constraints
    # (contype = 'p'). Matching on the constraint name alone could hit a
    # same-named constraint in another schema, and would miss an existing
    # primary key that carries a different name.
    conn.execute(text("""
    DO $$
    BEGIN
        IF NOT EXISTS (
            SELECT 1
            FROM pg_constraint
            WHERE conrelid = CAST('stg_transport_archive' AS regclass)
              AND contype = 'p'
        ) THEN
            ALTER TABLE stg_transport_archive
            ADD CONSTRAINT stg_transport_archive_pkey PRIMARY KEY (id);
        END IF;
    END $$;
    """))



In [3]:
# (Re)create the engine used by the maintenance cells that follow.
engine = create_engine(db_url, pool_pre_ping=True)

In [20]:
# Destructive: empties the real-time staging table inside one transaction.
truncate_realtime_stmt = text("""
        TRUNCATE TABLE stg_transport_realtime;
    """)
with engine.begin() as conn:
    conn.execute(truncate_realtime_stmt)

In [None]:
# Destructive: empties the archive staging table inside one transaction.
truncate_archive_stmt = text("""
        TRUNCATE TABLE stg_transport_archive;
    """)
with engine.begin() as conn:
    conn.execute(truncate_archive_stmt)

In [36]:
# Count the rows currently stored in the archive staging table.
# The scalar is fetched while the connection is still open: the previous
# version read the result after the `with` block had already released the
# connection, which only works when the driver happens to have buffered the row.
with engine.connect() as conn:
    archive_row_count = conn.execute(text("""
        SELECT COUNT(*) FROM stg_transport_archive;
    """)).scalar()
print(archive_row_count)

81576


In [31]:
# Add the new column set to the real-time staging table.
# IF NOT EXISTS on every column makes the statement safe to re-run.
add_realtime_columns_stmt = text("""
        ALTER TABLE stg_transport_realtime
          ADD COLUMN IF NOT EXISTS id BIGINT GENERATED ALWAYS AS IDENTITY,
          ADD COLUMN IF NOT EXISTS direction_id INT,
          ADD COLUMN IF NOT EXISTS stop_sequence INT,
          ADD COLUMN IF NOT EXISTS arrival_delay INT,
          ADD COLUMN IF NOT EXISTS departure_delay INT,
          ADD COLUMN IF NOT EXISTS timestamp_rounded TIMESTAMPTZ,
          ADD COLUMN IF NOT EXISTS hour INT,
          ADD COLUMN IF NOT EXISTS bus_nbr TEXT;
    """)
with engine.begin() as conn:
    conn.execute(add_realtime_columns_stmt)



In [33]:
# Add the new column set to the archive staging table.
# IF NOT EXISTS on every column makes the statement safe to re-run.
add_archive_columns_stmt = text("""
        ALTER TABLE stg_transport_archive
          ADD COLUMN IF NOT EXISTS id BIGINT GENERATED ALWAYS AS IDENTITY,
          ADD COLUMN IF NOT EXISTS direction_id INT,
          ADD COLUMN IF NOT EXISTS stop_sequence INT,
          ADD COLUMN IF NOT EXISTS arrival_delay INT,
          ADD COLUMN IF NOT EXISTS departure_delay INT,
          ADD COLUMN IF NOT EXISTS timestamp_rounded TIMESTAMPTZ,
          ADD COLUMN IF NOT EXISTS hour INT,
          ADD COLUMN IF NOT EXISTS bus_nbr TEXT;
    """)
with engine.begin() as conn:
    conn.execute(add_archive_columns_stmt)

In [30]:
# Destructive: drops the listed columns (and their data) from the real-time
# staging table. IF EXISTS lets the statement run even when some are absent.
drop_realtime_columns_stmt = text("""
        ALTER TABLE stg_transport_realtime
          DROP COLUMN IF EXISTS bus_nbr,
          DROP COLUMN IF EXISTS direction_id,
          DROP COLUMN IF EXISTS arrival_delay,
          DROP COLUMN IF EXISTS departure_delay,
          DROP COLUMN IF EXISTS stop_id,
          DROP COLUMN IF EXISTS stop_arrival_delay,
          DROP COLUMN IF EXISTS timestamp,
          DROP COLUMN IF EXISTS route_id_static,
          DROP COLUMN IF EXISTS route_short_name,
          DROP COLUMN IF EXISTS route_type,
          DROP COLUMN IF EXISTS timestamp_dt,
          DROP COLUMN IF EXISTS hour,
          DROP COLUMN IF EXISTS timestamp_rounded,
          DROP COLUMN IF EXISTS id,
          DROP COLUMN IF EXISTS stop_sequence,
          DROP COLUMN IF EXISTS datetime_rounded;
    """)
with engine.begin() as conn:
    conn.execute(drop_realtime_columns_stmt)

In [32]:
# Destructive: drops the listed columns (and their data) from the archive
# staging table. IF EXISTS lets the statement run even when some are absent.
drop_archive_columns_stmt = text("""
        ALTER TABLE stg_transport_archive
          DROP COLUMN IF EXISTS bus_nbr,
          DROP COLUMN IF EXISTS direction_id,
          DROP COLUMN IF EXISTS arrival_delay,
          DROP COLUMN IF EXISTS departure_delay,
          DROP COLUMN IF EXISTS stop_id,
          DROP COLUMN IF EXISTS stop_arrival_delay,
          DROP COLUMN IF EXISTS timestamp,
          DROP COLUMN IF EXISTS route_id_static,
          DROP COLUMN IF EXISTS route_short_name,
          DROP COLUMN IF EXISTS route_type,
          DROP COLUMN IF EXISTS timestamp_dt,
          DROP COLUMN IF EXISTS hour,
          DROP COLUMN IF EXISTS timestamp_rounded,
          DROP COLUMN IF EXISTS id,
          DROP COLUMN IF EXISTS stop_sequence,
          DROP COLUMN IF EXISTS datetime_rounded;
    """)
with engine.begin() as conn:
    conn.execute(drop_archive_columns_stmt)

In [None]:


# List the column names and types of the real-time staging table.
# Two fixes: the table name was misspelled ('stg_tansport_realtime'), so the
# query could never match the real table, and the result was discarded instead
# of being fetched and shown.
with engine.connect() as conn:
    realtime_columns = conn.execute(text("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'stg_transport_realtime';
    """)).fetchall()

realtime_columns

In [16]:
# Print one (column_name, data_type) row per column of stg_transport_realtime.
realtime_columns_query = text("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'stg_transport_realtime';
    """)
with engine.begin() as conn:
    result = conn.execute(realtime_columns_query)

    # Rows are consumed while the connection is still open.
    for row in result:
        print(row)

('timestamp', 'bigint')
('stop_sequence', 'bigint')
('stop_arrival_delay', 'bigint')
('stop_id', 'text')
('route_short_name', 'text')
('entity_id', 'text')
('route_type', 'text')
('trip_id', 'text')
('route_id', 'text')


In [5]:
from sqlalchemy import text
import pandas as pd

# Look up the primary-key constraint of stg_transport_archive and the columns
# it covers, ordered by their position inside the key.
primary_key_query = text("""
        SELECT
            tc.constraint_name,
            kcu.column_name,
            kcu.ordinal_position
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
          ON tc.constraint_name = kcu.constraint_name
         AND tc.table_schema = kcu.table_schema
        WHERE tc.constraint_type = 'PRIMARY KEY'
          AND tc.table_name = 'stg_transport_archive'
        ORDER BY kcu.ordinal_position;
    """)

with engine.begin() as conn:
    pk = conn.execute(primary_key_query).fetchall()

# Tabular view of the result: one row per key column.
pd.DataFrame(data=pk, columns=["constraint_name", "column", "position"])


Unnamed: 0,constraint_name,column,position
0,stg_transport_archive_pkey,id,1


In [4]:
# Turn the archived transport rows into a DataFrame and preview 20 random rows.
df_transport = pd.DataFrame(data=transport_data)
df_transport.sample(n=20)

Unnamed: 0,entity_id,route_id,route_id_static,route_short_name,route_type,stop_arrival_delay,stop_id,stop_sequence,timestamp,timestamp_dt,timestamp_rounded,trip_id
1324788,14010516835154226,,,,,-22.0,9022001070418002,7,1741046942,2025-03-04 00:09:02,2025-03-04 00:00:00,14010000676771628
1011315,14010516676062371,,9011001072300000.0,,,0.0,9022001070443002,9,1741045742,2025-03-03 23:49:02,2025-03-03 23:00:00,14010000673295447
633167,14010516852974310,,,,,137.0,9022001012431001,9,1741044476,2025-03-03 23:27:56,2025-03-03 23:00:00,14010000675810720
119766,14010516852795073,,,,,60.0,9022001012121002,5,1741043002,2025-03-03 23:03:22,2025-03-03 23:00:00,14010000675810420
642446,14010516891115914,,9011001047100000.0,,,-62.0,9022001040253002,12,1741044485,2025-03-03 23:28:05,2025-03-03 23:00:00,14010000684966388
709312,14010516535517126,,,,,-3.0,9022001062549001,7,1741044709,2025-03-03 23:31:49,2025-03-03 23:00:00,14010000653932051
1167443,14010516465522386,,,,,-49.0,9022001015621003,39,1741046380,2025-03-03 23:59:40,2025-03-03 23:00:00,14010000669426114
2143444,14010516679448438,,,,,142.0,9022001013851002,33,1741051719,2025-03-04 01:28:39,2025-03-04 01:00:00,14010000673297309
187965,14010516703439326,,,,,36.0,9022001080401001,63,1741043223,2025-03-03 23:07:03,2025-03-03 23:00:00,14010000673650991
786841,14010516891370383,,,,,56.0,9022001041067002,7,1741044957,2025-03-03 23:35:57,2025-03-03 23:00:00,14010000684966467


In [5]:
# Turn the archived weather rows into a DataFrame and preview 20 random rows.
df_weather = pd.DataFrame(data=weather_data)
df_weather.sample(n=20)

Unnamed: 0,cloud_cover,day_of_week,dew_point_2m,est_jour_ferie,est_weekend,month,neige_fondue,precipitation,rain,risque_gel_neige,...,soleil_leve,temperature_2m,timestamp_rounded,uv_index,vacances_scolaires,weather_code,wind_direction_10m,wind_gusts_10m,wind_speed_10m,year
7046,100,5,1.6,0,1,10,0,0.0,0.0,0,...,1,5.6,2023-10-21 14:00:00,,0,3,83,53.3,24.3,2023
18579,100,3,-4.9,0,0,2,0,0.0,0.0,0,...,0,-3.2,2025-02-13 03:00:00,,0,3,2,11.2,4.5,2025
23862,99,6,14.1,1,1,9,0,0.0,0.0,0,...,1,15.1,2025-09-21 06:00:00,,0,3,221,31.7,15.8,2025
25810,100,3,5.9,0,0,12,0,0.0,0.0,0,...,1,7.3,2025-12-11 10:00:00,,0,3,245,28.8,14.3,2025
25439,100,1,-1.7,0,0,11,0,0.0,0.0,0,...,0,-1.4,2025-11-25 23:00:00,,0,3,298,9.4,4.9,2025
14128,31,6,8.4,1,1,8,0,0.0,0.0,0,...,1,20.8,2024-08-11 16:00:00,,1,1,299,38.2,17.7,2024
10216,100,4,2.8,0,0,3,0,0.0,0.0,0,...,1,3.3,2024-03-01 16:00:00,,1,3,139,29.5,15.3,2024
23901,0,0,3.8,0,0,9,0,0.0,0.0,0,...,0,10.4,2025-09-22 21:00:00,,0,0,278,25.6,11.6,2025
836,69,5,-8.9,0,1,2,0,0.0,0.0,0,...,0,-6.8,2023-02-04 20:00:00,,0,2,253,16.2,9.8,2023
23698,95,6,14.0,1,1,9,0,0.0,0.0,0,...,1,17.7,2025-09-14 10:00:00,,0,3,153,29.2,11.3,2025


In [6]:
# Columns removed from the merged frame before analysis.
columns_to_discard = [
    "entity_id", "route_id_static",
    "timestamp", "timestamp_dt", "trip_id",
    "route_id", "year", "uv_index",
    "shortwave_radiation",
]

# Left join on the shared timestamp_rounded key: every transport row is kept
# and receives the weather columns of the matching timestamp.
df_merged = (
    df_transport
    .merge(df_weather, how="left", on="timestamp_rounded")
    .drop(columns=columns_to_discard)
)
df_merged.sample(20)

Unnamed: 0,route_short_name,route_type,stop_arrival_delay,stop_id,stop_sequence,timestamp_rounded,cloud_cover,day_of_week,dew_point_2m,est_jour_ferie,...,risque_gel_neige,risque_gel_pluie,snowfall,soleil_leve,temperature_2m,vacances_scolaires,weather_code,wind_direction_10m,wind_gusts_10m,wind_speed_10m
1318522,,,-37.0,9022001084045001,11,2025-03-04 00:00:00,100,1,0.0,0,...,0,0,0.0,0,4.2,0,3,263,46.4,22.3
649728,,,192.0,9022001070595001,22,2025-03-03 23:00:00,100,0,0.2,0,...,0,0,0.0,0,4.3,0,3,263,45.0,21.8
2156280,,,-54.0,9022001014011002,19,2025-03-04 01:00:00,100,1,-0.1,0,...,0,0,0.0,0,4.2,0,3,265,50.0,24.2
2515545,,,-598.0,9022001050366014,1,2025-03-04 02:00:00,100,1,0.8,0,...,0,0,0.0,0,5.6,0,3,267,53.6,25.4
1293425,,,4140.0,9022001005171001,18,2025-03-04 00:00:00,100,1,0.0,0,...,0,0,0.0,0,4.2,0,3,263,46.4,22.3
419058,,,54.0,9022001051413001,3,2025-03-03 23:00:00,100,0,0.2,0,...,0,0,0.0,0,4.3,0,3,263,45.0,21.8
1578702,,,34.0,9022001070692001,21,2025-03-04 00:00:00,100,1,0.0,0,...,0,0,0.0,0,4.2,0,3,263,46.4,22.3
1210091,,,226.0,9022001010761004,4,2025-03-04 00:00:00,100,1,0.0,0,...,0,0,0.0,0,4.2,0,3,263,46.4,22.3
4637,,,0.0,9022001005061001,8,2025-03-03 22:00:00,100,0,0.6,0,...,0,0,0.0,0,4.7,0,3,266,46.8,21.6
138981,18.0,401.0,0.0,9022001001611001,49,2025-03-03 21:00:00,100,0,1.0,0,...,0,0,0.0,0,5.1,0,3,262,47.2,22.6


In [7]:
# Rescale the delay by 1/60 (presumably seconds -> minutes — TODO confirm unit).
# Caution: the column is overwritten in place, so running this cell a second
# time divides the values by 60 again.
df_merged['stop_arrival_delay'] = df_merged['stop_arrival_delay'].div(60)

In [8]:
# Show the dtype of every column of the merged frame.
df_merged.dtypes

route_short_name              object
route_type                    object
stop_arrival_delay           float64
stop_id                       object
stop_sequence                  int64
timestamp_rounded     datetime64[ns]
cloud_cover                    int64
day_of_week                    int64
dew_point_2m                 float64
est_jour_ferie                 int64
est_weekend                    int64
month                          int64
neige_fondue                   int64
precipitation                float64
rain                         float64
risque_gel_neige               int64
risque_gel_pluie               int64
snowfall                     float64
soleil_leve                    int64
temperature_2m               float64
vacances_scolaires             int64
weather_code                   int64
wind_direction_10m             int64
wind_gusts_10m               float64
wind_speed_10m               float64
dtype: object

In [9]:
# Non-null value count of every column, per route_short_name.
df_merged.groupby(by='route_short_name').count()

Unnamed: 0_level_0,route_type,stop_arrival_delay,stop_id,stop_sequence,timestamp_rounded,cloud_cover,day_of_week,dew_point_2m,est_jour_ferie,est_weekend,...,risque_gel_neige,risque_gel_pluie,snowfall,soleil_leve,temperature_2m,vacances_scolaires,weather_code,wind_direction_10m,wind_gusts_10m,wind_speed_10m
route_short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14,22401,22401,22401,22401,22401,22401,22401,22401,22401,22401,...,22401,22401,22401,22401,22401,22401,22401,22401,22401,22401
17,821,509,821,821,821,821,821,821,821,821,...,821,821,821,821,821,821,821,821,821,821
18,32114,32114,32114,32114,32114,32114,32114,32114,32114,32114,...,32114,32114,32114,32114,32114,32114,32114,32114,32114,32114
19,66598,65910,66598,66598,66598,66598,66598,66598,66598,66598,...,66598,66598,66598,66598,66598,66598,66598,66598,66598,66598


In [10]:
# Routes kept for modelling. To study another line instead, change the list
# (e.g. ['17']).
selected_routes = ['14', '18']

# .copy() gives df_filtered its own data: later cells add and overwrite columns
# on it, which on a plain boolean-mask slice of df_merged triggers
# SettingWithCopyWarning and leaves it ambiguous which frame is modified.
df_filtered = df_merged.loc[df_merged['route_short_name'].isin(selected_routes)].copy()

In [11]:
# Display the filtered frame (rows of the selected routes only).
df_filtered

Unnamed: 0,route_short_name,route_type,stop_arrival_delay,stop_id,stop_sequence,timestamp_rounded,cloud_cover,day_of_week,dew_point_2m,est_jour_ferie,...,risque_gel_neige,risque_gel_pluie,snowfall,soleil_leve,temperature_2m,vacances_scolaires,weather_code,wind_direction_10m,wind_gusts_10m,wind_speed_10m
122,18,401,0.000000,9022001001951002,1,2025-03-03 21:00:00,100,0,1.0,0,...,0,0,0.0,0,5.1,0,3,262,47.2,22.6
123,18,401,0.000000,9022001001951001,2,2025-03-03 21:00:00,100,0,1.0,0,...,0,0,0.0,0,5.1,0,3,262,47.2,22.6
124,18,401,0.000000,9022001001941001,3,2025-03-03 21:00:00,100,0,1.0,0,...,0,0,0.0,0,5.1,0,3,262,47.2,22.6
125,18,401,0.000000,9022001001941002,4,2025-03-03 21:00:00,100,0,1.0,0,...,0,0,0.0,0,5.1,0,3,262,47.2,22.6
126,18,401,0.000000,9022001001931001,5,2025-03-03 21:00:00,100,0,1.0,0,...,0,0,0.0,0,5.1,0,3,262,47.2,22.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2737242,14,401,138.183333,9022001002811002,15,2025-03-04 02:00:00,100,1,0.8,0,...,0,0,0.0,0,5.6,0,3,267,53.6,25.4
2737243,14,401,138.166667,9022001002821002,16,2025-03-04 02:00:00,100,1,0.8,0,...,0,0,0.0,0,5.6,0,3,267,53.6,25.4
2737244,14,401,139.950000,9022001002831002,17,2025-03-04 02:00:00,100,1,0.8,0,...,0,0,0.0,0,5.6,0,3,267,53.6,25.4
2737245,14,401,140.116667,9022001002841002,18,2025-03-04 02:00:00,100,1,0.8,0,...,0,0,0.0,0,5.6,0,3,267,53.6,25.4


In [18]:
#############
##TO REMOVE##
#############
# Temporary placeholder imputation (flagged "TO REMOVE" by the author).
# The same fill/cast logic used to be written out twice, once per frame; it now
# lives in one helper so the two frames cannot drift apart.


def _fill_missing_and_cast(frame):
    """Return a new frame with placeholder values filled in and id-like columns cast to int.

    Missing ``route_type`` becomes 700; missing ``route_short_name``,
    ``stop_arrival_delay``, ``stop_id``, ``stop_sequence``, ``cloud_cover`` and
    ``day_of_week`` become 0. ``route_short_name``, ``route_type`` and
    ``stop_id`` are then cast to ``int``. The input frame is left untouched.
    """
    fill_values = {
        'route_type': 700,
        'route_short_name': 0,
        'stop_arrival_delay': 0,
        'stop_id': 0,
        'stop_sequence': 0,
        'cloud_cover': 0,
        'day_of_week': 0,
    }
    int_columns = ['route_short_name', 'route_type', 'stop_id']
    filled = frame.fillna(value=fill_values)
    return filled.astype({column: int for column in int_columns})


df_merged = _fill_missing_and_cast(df_merged)

####

# Rebinding df_filtered to a fresh frame (instead of assigning column by column
# onto a slice of df_merged) avoids SettingWithCopyWarning.
df_filtered = _fill_missing_and_cast(df_filtered)

# Derive calendar features from the timestamp, then drop the timestamp itself.
# The guard keeps the cell re-runnable: after a first run timestamp_rounded is
# gone and the derived columns already exist.
if 'timestamp_rounded' in df_filtered.columns:
    rounded_ts = df_filtered['timestamp_rounded']
    df_filtered = df_filtered.assign(
        hour=rounded_ts.dt.hour,
        day=rounded_ts.dt.day,
        month=rounded_ts.dt.month,
    ).drop(columns=['timestamp_rounded'])
#############
##TO REMOVE##
#############

In [13]:
# Split the filtered frame into the target column (y) and everything else (X).
feature_target = 'stop_arrival_delay'
y = df_filtered[feature_target]
X = df_filtered.drop(columns=[feature_target])

X.head()

Unnamed: 0,route_short_name,route_type,stop_id,stop_sequence,timestamp_rounded,cloud_cover,day_of_week,dew_point_2m,est_jour_ferie,est_weekend,...,risque_gel_neige,risque_gel_pluie,snowfall,soleil_leve,temperature_2m,vacances_scolaires,weather_code,wind_direction_10m,wind_gusts_10m,wind_speed_10m
122,18,401,9022001001951002,1,2025-03-03 21:00:00,100,0,1.0,0,0,...,0,0,0.0,0,5.1,0,3,262,47.2,22.6
123,18,401,9022001001951001,2,2025-03-03 21:00:00,100,0,1.0,0,0,...,0,0,0.0,0,5.1,0,3,262,47.2,22.6
124,18,401,9022001001941001,3,2025-03-03 21:00:00,100,0,1.0,0,0,...,0,0,0.0,0,5.1,0,3,262,47.2,22.6
125,18,401,9022001001941002,4,2025-03-03 21:00:00,100,0,1.0,0,0,...,0,0,0.0,0,5.1,0,3,262,47.2,22.6
126,18,401,9022001001931001,5,2025-03-03 21:00:00,100,0,1.0,0,0,...,0,0,0.0,0,5.1,0,3,262,47.2,22.6


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

# Column to predict.
target = 'stop_arrival_delay'

# Identifier-like columns are one-hot encoded; every remaining column except
# the target is treated as numeric (original column order is preserved).
categorical_cols = ['route_short_name', 'route_type', 'stop_id']
non_numeric = {target, *categorical_cols}
numeric_cols = [column for column in df_filtered.columns if column not in non_numeric]

In [21]:
# Per-column preprocessing: scale the numeric columns, one-hot encode the
# categorical ones (handle_unknown='ignore' is passed to the encoder).
numeric_transformer = ('num', StandardScaler(), numeric_cols)
categorical_transformer = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(
    transformers=[numeric_transformer, categorical_transformer]
)

# Preprocessing followed by a Ridge regressor (alpha=1.0), as one estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', Ridge(alpha=1.0)),
]
ridge_model = Pipeline(steps=pipeline_steps)


In [22]:
# Order the rows in time before an unshuffled 80/20 split, so that the test set
# is the latest slice of the data.
# The sort key uses `day` (day of month) rather than `day_of_week`: sorting on
# day_of_week groups all Mondays of a month together, then all Tuesdays, etc.,
# which is not a chronological order. kind='stable' keeps ties in their
# original order so the split is deterministic.
# NOTE(review): the `year` column is dropped when df_merged is built, so data
# spanning several years would still interleave — confirm the data range.
df = df_filtered.sort_values(['month', 'day', 'hour'], kind='stable')

X = df.drop(columns=[target])
y = df[target]

# shuffle=False: the first 80% of the ordered rows train, the last 20% test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


In [1]:
# `text` is imported here because this cell was run on its own.
from sqlalchemy import text

# NOTE(review): this cell depends on `engine` being created by an earlier cell;
# its recorded output is "NameError: name 'engine' is not defined". It also
# repeats the archive TRUNCATE of an earlier cell — confirm it is still wanted.
with engine.begin() as conn:
    # Destructive: removes every row from stg_transport_archive.
    conn.execute(text("TRUNCATE TABLE stg_transport_archive"))


NameError: name 'engine' is not defined