# Import

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import helper

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor

# Load Dataset

In [18]:
df_regression = pd.read_csv("../processed_files/df_for_ml.csv", index_col=[0])

In [19]:
df_regression.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250637 entries, 0 to 250636
Data columns (total 31 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   trip_id                250637 non-null  int64  
 1   arrival_time           250637 non-null  int64  
 2   departure_time         250637 non-null  int64  
 3   line_index             250637 non-null  int64  
 4   shape_dist_traveled    250637 non-null  float64
 5   time_diff              250637 non-null  float64
 6   speed                  250637 non-null  float64
 7   dist_diff              250637 non-null  float64
 8   code                   250637 non-null  int64  
 9   direction_id           250637 non-null  int64  
 10  day_date               250637 non-null  int64  
 11  stop_code_d            250637 non-null  int64  
 12  stop_code_a            250637 non-null  int64  
 13  departure_delay        250637 non-null  int64  
 14  arrival_delay          250637 non-null  i

In [20]:
# Drop column on wich i don't want to train
df_regression.drop(columns=["trip_id",
                 "line_index",
                 "code",
                 "day_date"], inplace=True)
df_regression.shape

(250637, 27)

In [21]:
# We only need to train on delayed trains on line
df_regression = df_regression[df_regression['arrival_delay'] > 0]
df_regression.reset_index(inplace=True, drop=True)


In [22]:
X_train, X_test, y_train, y_test = train_test_split(df_regression.loc[:, df_regression.columns != "arrival_delay"].values , df_regression["arrival_delay"].values , test_size=0.20, random_state=10)
# To avoid excessive memory usage we delete the dataset
del df_regression

In [23]:
print(f"X_train: ",{X_train.shape})
print(f"X_test: ",{X_test.shape})
print(f"y_train: ",{y_train.shape})
print(f"y_test: ",{y_test.shape})

X_train:  {(156210, 26)}
X_test:  {(39053, 26)}
y_train:  {(156210,)}
y_test:  {(39053,)}


In [8]:
# We initialize a dataset to easily call for see statistics
statistics = pd.DataFrame(columns=['regressor','mse','rmse','mae','r2'])


# Regression

## Linear Regression

In [24]:
model = LinearRegression(n_jobs=-1)
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/LinearRegression.joblib")
# model = joblib.load("./models/regressions/LinearRegression.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "Linear", statistics)

del model
del y_pred

  regressor       mse      rmse       mae        r2
0    Linear  2.393458  1.547081  0.811484  0.901024


## Decision Tree Regressor

In [10]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/DecisionTreeRegressor.joblib")
#model = joblib.load("./models/regressions/DecisionTreeRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "DecisionTree", statistics)
del model
del y_pred

      regressor       mse      rmse       mae        r2
0  DecisionTree  5.015856  2.239611  0.678614  0.779608


## Random Forest

In [25]:
model = RandomForestRegressor(n_jobs=-1)
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/RandomForestRegressor.joblib")
#model = joblib.load("./models/regressions/RandomForestRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "RandomForest", statistics)
del model
del y_pred

      regressor      mse      rmse       mae        r2
0  RandomForest  2.46109  1.568786  0.529998  0.898227


## Extra Trees Regressor

In [12]:
model = ExtraTreesRegressor(n_jobs=-1)
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/ExtraTreesRegressor.joblib")
#model = joblib.load("./models/regressions/ExtraTreesRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "ExtraTrees", statistics)
copy_X = y_pred.copy()
del model
del y_pred

    regressor       mse      rmse       mae        r2
0  ExtraTrees  2.304844  1.518171  0.545726  0.898727


## Gradient Boosting Regressor

In [13]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/GradientBoostingRegressor.joblib")
#model = joblib.load("./models/regressions/GradientBoostingRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "GradientBoosting", statistics)
del model
del y_pred

          regressor       mse      rmse       mae        r2
0  GradientBoosting  2.417967  1.554981  0.659275  0.893757


## Bagging Regressor


In [14]:
model = BaggingRegressor()
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/BaggingRegressor.joblib")
#model = joblib.load("./models/regressions/BaggingRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "Bagging", statistics)
del model
del y_pred

  regressor       mse      rmse       mae        r2
0   Bagging  2.724607  1.650638  0.569013  0.880283


## AdaBoostRegressor


In [15]:
model = AdaBoostRegressor()
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/AdaBoostRegressor.joblib")
#model = joblib.load("./models/regressions/AdaBoostRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "AdaBoost", statistics)
del model
del y_pred

  regressor        mse      rmse       mae        r2
0  AdaBoost  35.931468  5.994286  3.997358 -0.578798


# Result

In [16]:
statistics.head(10)

Unnamed: 0,regressor,mse,rmse,mae,r2
0,Linear,3.574039,1.890513,0.825628,0.84296
1,DecisionTree,5.015856,2.239611,0.678614,0.779608
2,RandomForest,2.474592,1.573084,0.541915,0.891269
3,ExtraTrees,2.304844,1.518171,0.545726,0.898727
4,GradientBoosting,2.417967,1.554981,0.659275,0.893757
5,Bagging,2.724607,1.650638,0.569013,0.880283
6,AdaBoost,35.931468,5.994286,3.997358,-0.578798
