# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import helper

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor

# Load Dataset

In [2]:
df_regression = pd.read_csv("../processed_files/df_for_ml.csv", index_col=[0])

In [3]:
df_regression.info()

<class 'pandas.core.frame.DataFrame'>
Index: 251059 entries, 0 to 251058
Data columns (total 31 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   trip_id                 251059 non-null  int64  
 1   arrival_time            251059 non-null  int64  
 2   departure_time          251059 non-null  int64  
 3   indice_tratta           251059 non-null  int64  
 4   shape_dist_traveled     251059 non-null  float64
 5   time_diff               251059 non-null  float64
 6   speed                   251059 non-null  float64
 7   dist_diff               251059 non-null  float64
 8   codice                  251059 non-null  int64  
 9   direction_id            251059 non-null  int64  
 10  data_giorno             251059 non-null  int64  
 11  stop_code_p             251059 non-null  int64  
 12  stop_code_a             251059 non-null  int64  
 13  ritardo_partenza        251059 non-null  int64  
 14  ritardo_arrivo          2

In [4]:
# Drop column on wich i don't want to train
df_regression.drop(columns=["trip_id",
                 "indice_tratta",
                 "codice",
                 "data_giorno"], inplace=True)
df_regression.shape

(251059, 27)

In [5]:
# We only need to train on delayed trains on line
df_regression = df_regression[df_regression['ritardo_arrivo'] > 0]
df_regression.reset_index(inplace=True, drop=True)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_regression.loc[:, df_regression.columns != "ritardo_arrivo"].values , df_regression["ritardo_arrivo"].values , test_size=0.20, random_state=1)
# To avoid excessive memory usage we delete the dataset
del df_regression

In [7]:
print(f"X_train: ",{X_train.shape})
print(f"X_train: ",{X_test.shape})
print(f"X_train: ",{y_train.shape})
print(f"X_train: ",{y_test.shape})

X_train:  {(156459, 26)}
X_train:  {(39115, 26)}
X_train:  {(156459,)}
X_train:  {(39115,)}


In [8]:
# We initialize a dataset to easily call for see statistics
statistics = pd.DataFrame(columns=['regressor','mse','rmse','mae','r2'])


# Regression

## Linear Regression

In [9]:
model = LinearRegression(n_jobs=-1)
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/LinearRegression.joblib")
# model = joblib.load("./models/regressions/LinearRegression.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "Linear", statistics)

del model
del y_pred

  regressor      mse     rmse       mae        r2
0    Linear  2.63267  1.62255  0.730399  0.892425


## Decision Tree Regressor

In [10]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/DecisionTreeRegressor.joblib")
#model = joblib.load("./models/regressions/DecisionTreeRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "DecisionTree", statistics)
del model
del y_pred

      regressor       mse     rmse       mae        r2
0  DecisionTree  3.854506  1.96329  0.510418  0.842499


## Random Forest

In [11]:
model = RandomForestRegressor(n_jobs=-1)
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/RandomForestRegressor.joblib")
#model = joblib.load("./models/regressions/RandomForestRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "RandomForest", statistics)
del model
del y_pred

      regressor       mse      rmse       mae        r2
0  RandomForest  1.830179  1.352841  0.415898  0.925216


## Extra Trees Regressor

In [12]:
model = ExtraTreesRegressor(n_jobs=-1)
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/ExtraTreesRegressor.joblib")
#model = joblib.load("./models/regressions/ExtraTreesRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "ExtraTrees", statistics)
del model
del y_pred

    regressor       mse      rmse       mae        r2
0  ExtraTrees  1.695376  1.302066  0.418718  0.930724


## Gradient Boosting Regressor

In [13]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/GradientBoostingRegressor.joblib")
#model = joblib.load("./models/regressions/GradientBoostingRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "GradientBoosting", statistics)
del model
del y_pred

          regressor       mse      rmse     mae        r2
0  GradientBoosting  1.779241  1.333882  0.5391  0.927297


## Bagging Regressor


In [14]:
model = BaggingRegressor()
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/BaggingRegressor.joblib")
#model = joblib.load("./models/regressions/BaggingRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "Bagging", statistics)
del model
del y_pred

  regressor       mse      rmse       mae        r2
0   Bagging  1.840564  1.356674  0.435751  0.924792


## AdaBoostRegressor


In [15]:
model = AdaBoostRegressor()
model.fit(X_train, y_train)
joblib.dump(model, "./models/regressions/AdaBoostRegressor.joblib")
#model = joblib.load("./models/regressions/AdaBoostRegressor.joblib")
y_pred = model.predict(X_test)
statistics = helper.regressor_metrics(y_test, y_pred, "AdaBoost", statistics)
del model
del y_pred

  regressor        mse      rmse       mae        r2
0  AdaBoost  53.627486  7.323079  5.748684 -1.191304


In [16]:
statistics.head(10)

Unnamed: 0,regressor,mse,rmse,mae,r2
0,Linear,2.63267,1.62255,0.730399,0.892425
1,DecisionTree,3.854506,1.96329,0.510418,0.842499
2,RandomForest,1.830179,1.352841,0.415898,0.925216
3,ExtraTrees,1.695376,1.302066,0.418718,0.930724
4,GradientBoosting,1.779241,1.333882,0.5391,0.927297
5,Bagging,1.840564,1.356674,0.435751,0.924792
6,AdaBoost,53.627486,7.323079,5.748684,-1.191304


In [17]:
model = joblib.load("./models/regressions/RandomForestRegressor.joblib")
predetta = model.predict(X_test)


In [18]:
predetta

array([ 6.79, 10.91,  2.  , ...,  3.  ,  1.  ,  4.63])

In [19]:
y_test

array([ 8, 11,  2, ...,  3,  1,  4])

In [20]:
# GRAFICO RISULTATO PREDIZIONE

In [21]:
diff = predetta - y_test 

In [22]:
diff

array([-1.21, -0.09,  0.  , ...,  0.  ,  0.  ,  0.63])

In [23]:
diff.mean()

0.01646018151604244

In [24]:
X = np.sort(diff)

In [25]:
df = pd.DataFrame(X, columns = ['Value'])
df.to_csv("x.csv")