---
# Predicting SNCF wait times
---
In this notebook, we try to build a pipeline and framework in order to predict wait times for trains. 

We start by importing the data and splitting it. We then remove outliers and encode categorical variables.

Then, we perform hyperparameter optimization and train a simple XGBoost model accordingly. After this, we can export the results to make a submission on the challenge data website. 

Keep in mind that this notebook is only the tip of the iceberg: we have built libraries to preprocess features and test different models (they are accessible in the `models.py` and `features.py` modules). After trial and error, this is the best model that we could come up with.

## Importing packages and data

In [16]:
import numpy as np
import polars as pl
import polars.selectors as cs
from utils import PathsData, import_data, split_data, plot_correlation
from features import FeaturesFrame
from models import Model, ModelEnum, ParamGridEnum

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor


# Load the three challenge files in a single pass, in the order
# X_TEST, X_TRAIN, Y_TRAIN.
# `x_validation` (from PathsData.X_TEST) is only used at the very end of the
# notebook, to build the submission file.
x_validation, x, y = (
    import_data(data_path.value)
    for data_path in (PathsData.X_TEST, PathsData.X_TRAIN, PathsData.Y_TRAIN)
)


## Removing outliers
After testing different methods, we found that the one yielding the best results is a simple quantile-based exclusion.
We also tested different cut-offs to find the best quantile to exclude.

We start by visualizing the outliers with boxplots.

arret,p2q0,p3q0,p4q0,p0q2,p0q3,p0q4,p0q0
i64,f64,f64,f64,f64,f64,f64,f64
8,0.0,0.0,1.0,-3.0,-1.0,-2.0,-1.0
9,0.0,0.0,0.0,1.0,0.0,1.0,-1.0
10,-1.0,0.0,0.0,-1.0,0.0,0.0,-1.0
11,-1.0,-1.0,0.0,2.0,-2.0,0.0,1.0
12,-1.0,-1.0,-1.0,-1.0,3.0,2.0,3.0
…,…,…,…,…,…,…,…
22,0.0,-1.0,-6.0,-1.0,0.0,-2.0,1.0
23,0.0,0.0,-1.0,1.0,0.0,0.0,2.0
24,1.0,0.0,0.0,-1.0,-3.0,-1.0,2.0
26,2.0,2.0,1.0,0.0,0.0,1.0,1.0


In [17]:
# Preview the values of the first numeric column as a NumPy array.
# The numeric frame (features plus target) is built here rather than read from
# `to_plot`, because that name is only assigned in a later cell and is
# therefore undefined when the notebook is run top to bottom on a fresh kernel.
numeric_preview = x.with_columns(y).select(cs.numeric())
np.array(numeric_preview.select(numeric_preview.columns[0]).to_series().to_list())

array([ 8,  9, 10, ..., 24, 26, 28])

In [20]:


# Numeric feature columns plus the target: one box plot per column.
to_plot = x.with_columns(y).select(cs.numeric())

# Size the subplot grid from the actual number of columns so the cell keeps
# working if the set of numeric columns changes (three plots per row).
grid_cols = 3
grid_rows = max(1, -(-len(to_plot.columns) // grid_cols))  # ceiling division

fig = make_subplots(
    rows=grid_rows,
    cols=grid_cols,
    subplot_titles=list(to_plot.columns),
)

for position, column_name in enumerate(to_plot.columns):
    # Only the first column overlays every individual point on its box;
    # the remaining columns use plotly's default box rendering.
    box_options = {"boxpoints": "all"} if position == 0 else {}
    # `add_trace` replaces the deprecated `append_trace`.
    fig.add_trace(
        go.Box(y=np.array(to_plot.get_column(column_name).to_list()), **box_options),
        row=position // grid_cols + 1,
        col=position % grid_cols + 1,
    )

# Last expression of the cell: returns the figure so it is rendered.
fig.update_layout(height=1000, width=1000, title_text="Outliers")



NameError: name 'μ' is not defined

In [None]:

# Remove outliers: keep only the rows whose target `p0q0` lies between the
# 1% and 99% quantiles of `p0q0` (both bounds inclusive).
# NOTE(review): this cell rebinds `x` and `y`, so running it a second time
# trims the already-trimmed data again.
target = pl.col('p0q0')
within_quantiles = target.ge(target.quantile(0.01)) & target.le(target.quantile(0.99))

x = x.with_columns(y).filter(within_quantiles)
# Split the filtered frame back into target and features.
y = x.select('p0q0')
x = x.drop('p0q0')

x_train, x_test, y_train, y_test = split_data(x=x, y=y)

In [None]:
# Apply FeaturesFrame's 'frequency' label encoding to each split, then keep
# only the numeric columns as model inputs.
# NOTE(review): the train and test splits are encoded independently here —
# confirm that the encoder does not need to be fitted on train and reused on test.
test_train = (
    FeaturesFrame(x_train)
    .encode_label(encoder='frequency')
    .select(cs.numeric())
)
test_test = (
    FeaturesFrame(x_test)
    .encode_label(encoder='frequency')
    .select(cs.numeric())
)

In [None]:
# Baseline: fit the project's XGBoost wrapper without passing any
# hyperparameters, then evaluate it on the held-out split.
model = Model(model_enum=ModelEnum.XGBoost)
model_fit = model.fit(
    test_train.to_numpy(),
    y_train=y_train.to_numpy(),
)
# Last expression of the cell: the evaluation result is displayed.
model.evaluate(
    X_test=test_test.to_numpy(),
    y_test=y_test.to_numpy(),
)

In [None]:
# Hyperparameter search with the wrapper's Optuna-based optimizer, run on the
# training split converted to pandas.
# NOTE(review): the result is stored under `params_opti_mlp` although the
# model being tuned is XGBoost.
model = Model(model_enum=ModelEnum.XGBoost)
params_opti_mlp = model.optimize_hyperparams_optuna(
    test_train.to_pandas(),
    y_train=y_train.to_pandas(),
)

In [None]:
# Hyperparameter search over the predefined XGBoost parameter grid.
# NOTE(review): this rebinds `params_opti_mlp`, replacing any value assigned
# by an earlier search cell.
model = Model(model_enum=ModelEnum.XGBoost)
xgb_param_grid = ParamGridEnum.XGBoost.value
params_opti_mlp = model.grid_search(
    test_train.to_pandas(),
    y_train=y_train.to_pandas(),
    param_grid=xgb_param_grid,
)

In [None]:
# Train a plain XGBRegressor with a fixed set of hyperparameters and report
# its error metrics on the held-out split.
xgb_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 10,
    'n_estimators': 200,
    'subsample': 0.8,
}
mod = XGBRegressor().set_params(**xgb_params)
mod_fit = mod.fit(test_train.to_numpy(), y_train.to_numpy())
preds = mod.predict(X=test_test.to_numpy())

# Metrics are computed in the order MSE, R2, MAE.
mse, r2, mae = (
    metric(y_test, preds)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
print(f"Model Evaluation - MSE: {mse:.4f}, R2: {r2:.4f}, MAE: {mae:.4f}")

In [None]:
def make_submission(
    number: int,
    model: XGBRegressor,
    submissions_dir: str = r"C:\Users\faune\challenge-data-203\submissions",
) -> None:
    """Predict on the validation set and write the result as a submission CSV.

    The module-level `x_validation` frame is frequency-encoded with
    `FeaturesFrame`, the 'train', 'gare' and 'date' columns are dropped, and
    the remaining columns are passed to `model.predict`. The predictions are
    written to `<submissions_dir>/submission<number>.csv` with a row-index
    column followed by a `y_test` column.

    Args:
        number: Submission number, used in the output file name.
        model: Fitted regressor exposing `predict`.
        submissions_dir: Directory the CSV is written to. It is created if it
            does not exist. The default is the original author's local folder;
            pass another directory to run the notebook on a different machine.

    Returns:
        None. The function writes a file and prints a confirmation message.
    """
    from pathlib import Path

    # NOTE(review): the training cells select numeric columns with
    # `cs.numeric()`, whereas this drops three columns by name — confirm both
    # produce the same feature set in the same order.
    features_validation = (
        FeaturesFrame(x_validation)
        .encode_label(encoder='frequency')
        .drop('train', 'gare', 'date')
        .to_numpy()
    )
    predictions_validation = model.predict(features_validation)

    output_path = Path(submissions_dir) / f"submission{number}.csv"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    (
        pl.DataFrame(predictions_validation)
        .rename({"column_0": "y_test"})
        .with_row_index()
        .write_csv(output_path, separator=",")
    )
    print(f"Successfully saved submission number {number}")

In [None]:
# Write submission file number 6 using the XGBRegressor trained above.
make_submission(number=6, model=mod)