In [3]:
#Import package pandas for data analysis
import pandas as pd
# Import package numpy for numeric computing
import numpy as np
import seaborn as sns
# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

import pymysql
import xgboost as xgb

import matplotlib.pyplot as plt

#For showing plots directly in the notebook run the command below
%matplotlib inline

# For saving multiple plots into a single pdf file
from matplotlib.backends.backend_pdf import PdfPages

import scipy.stats as ss

# ignore warnings
# NOTE(review): called with only the action, this installs a blanket 'ignore'
# filter, so every warning category is silenced for the rest of the session --
# including deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

In [45]:
# Load the segments CSV into `df`; the comma separator and the default NA
# markers are spelled out explicitly.
df = pd.read_csv('line_65_segments.csv', sep=',', keep_default_na=True)


In [46]:
# Preview the first rows, then list the column names.
# Only the last expression of a cell is rendered automatically, so the preview
# is passed to IPython's built-in display(); as a bare expression in the middle
# of the cell its result was silently discarded.
display(df.head())
df.columns

Index(['Unnamed: 0', 'DAYOFSERVICE', 'TRIPID', 'LINEID', 'PROGRNUMBER',
       'STOPPOINTID', 'DIRECTION', 'ACTUALTIME_DEP', 'ACTUALTIME_ARR', 'hour',
       'dayofweek', 'journey_time', 'dwell_time', 'prev_stop_id',
       'prev_progrnumber', 'prev_dept_time', 'segment_id', 'rain', 'temp',
       'pressure', 'humidity', 'wind_speed', 'wind_dir', 'sun', 'visibility',
       'cloud_height', 'cloud_cover', 'holiday'],
      dtype='object')

In [47]:
# Drop the four columns the analysis treats as unrelated and store DIRECTION
# as a pandas categorical, in one chained expression.
df = (
    df
    .drop(columns=['Unnamed: 0', "TRIPID", "LINEID", 'segment_id'])
    .astype({"DIRECTION": 'category'})
)

In [48]:
# Change the datatypes of some features.
#
# DAYOFSERVICE is parsed to datetime and then replaced by its ordinal (an
# integer day number) so that it is numeric.
# pd.to_datetime is used instead of astype('datetime64'): pandas >= 2.0 rejects
# the unit-less 'datetime64' dtype with a TypeError, while pd.to_datetime works
# on both old and new versions.
df['DAYOFSERVICE'] = pd.to_datetime(df['DAYOFSERVICE'])
df['DAYOFSERVICE'] = df['DAYOFSERVICE'].apply(lambda x: x.toordinal())

# These columns are stored as categoricals; get_dummies later one-hot encodes
# categorical columns.
df['dayofweek'] = df['dayofweek'].astype('category')
df['hour'] = df['hour'].astype('category')
df["STOPPOINTID"] = df["STOPPOINTID"].astype('category')

# Show the resulting dtype of every column.
df.dtypes


DAYOFSERVICE           int64
PROGRNUMBER            int64
STOPPOINTID         category
DIRECTION           category
ACTUALTIME_DEP         int64
ACTUALTIME_ARR         int64
hour                category
dayofweek           category
journey_time           int64
dwell_time             int64
prev_stop_id           int64
prev_progrnumber       int64
prev_dept_time         int64
rain                 float64
temp                 float64
pressure             float64
humidity               int64
wind_speed             int64
wind_dir               int64
sun                  float64
visibility             int64
cloud_height           int64
cloud_cover            int64
holiday                int64
dtype: object

In [49]:
# Columns excluded from modelling. The list name records the author's judgement
# that these features carry little information; it is reused by later cells.
low_information_gain = ['rain',
                        'wind_dir',
                        'cloud_cover',
                        'wind_speed',
                        'pressure',
                        'humidity',
                        'sun',
                        "PROGRNUMBER",
                        "prev_dept_time",
                        "ACTUALTIME_DEP",
                        "prev_stop_id",
                        "prev_progrnumber",
                        "DAYOFSERVICE",
                        "DIRECTION"]

# Working frame without the low-value features. DataFrame.drop returns a new
# frame, so `df` itself is left untouched.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas >= 2.0 no longer accepts.
df_rev1 = df.drop(columns=low_information_gain)

In [29]:
# Show the first ten rows of the reduced frame.
df_rev1.iloc[:10]

Unnamed: 0,STOPPOINTID,ACTUALTIME_ARR,hour,dayofweek,journey_time,dwell_time,temp,visibility,cloud_height,holiday
0,4521,34797,9,0,130,13,4.6,30000,999,1
1,1283,34887,9,0,77,0,4.6,30000,999,1
2,4456,34926,9,0,39,0,4.6,30000,999,1
3,1284,34948,9,0,22,9,4.6,30000,999,1
4,1285,35009,9,0,52,0,4.6,30000,999,1
5,1016,35089,9,0,80,0,4.6,30000,999,1
6,1017,35155,9,0,66,0,4.6,30000,999,1
7,1018,35172,9,0,17,19,4.6,30000,999,1
8,1019,35205,9,0,14,0,4.6,30000,999,1
9,1020,35220,9,0,15,19,4.6,30000,999,1


In [32]:

# Per-stop models: for every STOPPOINTID, fit an XGBoost regressor that predicts
# dwell_time from the remaining features, and record its scores.
# `xgb` (xgboost) is imported with the other dependencies at the top of the
# notebook rather than in the middle of it.

# Number of folds used by cross_validate; a stop needs at least this many rows.
N_CV_FOLDS = 5

# Working frame without the low-value features (`df` itself is left untouched).
# `columns=` is used because pandas >= 2.0 no longer accepts `axis` positionally.
df_rev2 = df.drop(columns=low_information_gain)

metrics_list = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
# MAE / RMSE / R2 are computed on the rows the model was fitted on (in-sample);
# the cv_* entries are the means over the cross-validation test folds.
results_dict = {"MAE": [],
                "RMSE": [],
                "R2": [],
                "cv_neg_mean_absolute_error": [],
                "cv_neg_mean_squared_error": [],
                "cv_r2": []}

stops = df_rev2["STOPPOINTID"].unique()

for i, seg in enumerate(stops):
    print(i, seg)
    # The boolean filter already yields a new frame, so the whole dataset does
    # not have to be copied on every iteration.
    seg_df = df_rev2[df_rev2["STOPPOINTID"] == seg].drop(columns=["STOPPOINTID"])
    if len(seg_df) < N_CV_FOLDS:
        # Cross-validation cannot be run with fewer rows than folds; skip the
        # stop instead of letting the error abort the whole loop.
        print("skipping stop", seg, "- only", len(seg_df), "rows")
        continue
    # One-hot encode the remaining categorical columns.
    seg_df = pd.get_dummies(seg_df)

    # y is the target
    y = seg_df["dwell_time"]
    # X is everything else
    X = seg_df.drop(columns=["dwell_time"])
    # Split the dataset into two datasets: 70% training and 30% test.
    # NOTE(review): the test part is not used for scoring in this cell.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3)

    # Reset the indexes so that y_train lines up row-by-row with the 0..n-1
    # index of the predictions frame in the concat below.
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Train aka fit, a model using all continuous and categorical features.
    multiple_linreg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42).fit(X_train, y_train)
    multiple_linreg_predictions_train = multiple_linreg.predict(X_train)

    # In-sample scores (predictions on the training rows vs. their targets).
    results_dict["MAE"].append(metrics.mean_absolute_error(y_train, multiple_linreg_predictions_train))
    results_dict["RMSE"].append(metrics.mean_squared_error(y_train, multiple_linreg_predictions_train)**0.5)
    results_dict["R2"].append(metrics.r2_score(y_train, multiple_linreg_predictions_train))

    actual_vs_predicted_multiplelinreg = pd.concat([y_train, pd.DataFrame(multiple_linreg_predictions_train, columns=['Predicted'])], axis=1)
    print(actual_vs_predicted_multiplelinreg.head(10))

    # Cross-validate a fresh, identically configured model on all rows of this stop.
    scores = cross_validate(xgb.XGBRegressor(objective="reg:squarederror", random_state=42), X, y, scoring=metrics_list, cv=N_CV_FOLDS)
    for metric in metrics_list:
        print( metric, np.average(scores["test_" + metric]) )
        results_dict["cv_" + metric].append(np.average(scores["test_" + metric]))
    print("================================================")

0 4521
   dwell_time  Predicted
0          19  11.985628
1          62  58.643070
2          37  18.940422
3          66  52.782867
4           0   4.638111
5           8  11.085460
6          64  60.716572
7          13  31.058226
8          10  13.021700
9          19  16.446367
neg_mean_absolute_error -22.48814085154715
neg_mean_squared_error -1344.4266938109886
r2 0.2647440697644482
1 1283
   dwell_time  Predicted
0          64  44.833366
1          15  17.244402
2          65  65.383125
3           0   3.408988
4          45  38.813984
5          41  42.184082
6          16   4.963711
7          56  55.066254
8          52  42.817352
9          19  19.244898
neg_mean_absolute_error -16.397639758485685
neg_mean_squared_error -964.2107934613844
r2 0.1839184338940483
2 4456
   dwell_time  Predicted
0           0   0.827250
1          20  20.214449
2           0  -0.144262
3           0   1.722318
4          64  36.226627
5           0   1.477304
6           0   5.305242
7          31

neg_mean_absolute_error -9.171174897859805
neg_mean_squared_error -162.34432084052116
r2 0.06175986497631225
19 1121
   dwell_time  Predicted
0          17  17.097275
1           0   9.410659
2          10   7.792371
3           0   4.751667
4           0   6.822461
5          13  10.697974
6          13  10.522409
7          18  12.362494
8          23  13.478311
9           0  12.549709
neg_mean_absolute_error -8.142555443313057
neg_mean_squared_error -136.84229329016404
r2 0.0010847090557372318
20 1122
   dwell_time  Predicted
0           0   0.101894
1           0   0.108877
2           0   0.039206
3           0  -0.157965
4           0   0.794313
5           0   0.039393
6           0   1.931656
7           0  -0.593841
8           0  -0.166379
9           0   0.038165
neg_mean_absolute_error -2.725208556260454
neg_mean_squared_error -24.152100606940028
r2 0.1119091126798687
21 1123
   dwell_time  Predicted
0           0  -0.311999
1           0  -0.305171
2           0  -0.26749

neg_mean_absolute_error -1.079792255057702
neg_mean_squared_error -11.773400521780204
r2 0.4295280982074858
38 4059
   dwell_time  Predicted
0           0  -0.003345
1           0   0.423809
2           0   0.216934
3           0  -0.046187
4           0  -0.081198
5           0   0.009952
6           9   8.457979
7           7   6.407725
8           0   0.053574
9          15  14.749403
neg_mean_absolute_error -1.4793003164670684
neg_mean_squared_error -14.6171791016663
r2 0.08258455392647665
39 4060
   dwell_time  Predicted
0           0  -0.044413
1           0   0.034339
2          14  12.678363
3           0  -0.026232
4           0   0.058287
5           0  -0.050888
6           0   0.749068
7           0   0.732430
8           0   1.039219
9           0  -0.055372
neg_mean_absolute_error -2.75946904015903
neg_mean_squared_error -212.82102198088756
r2 -6.186691781606209
40 4061
   dwell_time  Predicted
0           0   0.105071
1          14  10.502799
2           0   0.360591
3  

neg_mean_absolute_error -0.42455767127433336
neg_mean_squared_error -5.49118145349823
r2 -1.6111683412883937
57 2585
   dwell_time  Predicted
0           0  -0.057867
1           0   0.024450
2           0   0.079845
3           0  -0.055739
4           0  -0.076026
5           0   0.187232
6           0  -0.022509
7           0  -0.039530
8           0  -0.050061
9           0   0.004115
neg_mean_absolute_error -0.8034175081189618
neg_mean_squared_error -7.337940134683694
r2 -0.30435558524343126
58 2586
   dwell_time  Predicted
0           0  -0.034464
1           0  -0.943594
2           8   6.467253
3           0   0.052716
4           0   0.363478
5           0  -0.049126
6          10   6.112803
7           0   0.070566
8           0  -0.040998
9           0  -0.073103
neg_mean_absolute_error -2.0164996749749116
neg_mean_squared_error -21.982345945258764
r2 0.23992200405734784
59 2587
   dwell_time  Predicted
0           0  -0.045787
1           0   0.679334
2          15  14.6777

neg_mean_absolute_error -4.937587740591267
neg_mean_squared_error -53.46397075929921
r2 0.12092220898249588
76 2555
   dwell_time  Predicted
0          13  11.057341
1          12  12.194022
2          13  10.006370
3           0   1.814830
4          12  11.363626
5           0   0.450092
6           0   5.167846
7          12  11.531418
8          10  12.457413
9           0   4.191406
neg_mean_absolute_error -6.448915906724326
neg_mean_squared_error -72.27671159794048
r2 -0.027465293019723513
77 2556
   dwell_time  Predicted
0          16  18.316067
1          45  36.741467
2          14  12.296688
3          18  12.560248
4          13  10.099240
5           0   3.503910
6          18  17.631325
7          14  12.500084
8          33  32.529964
9           0   6.401228
neg_mean_absolute_error -8.732166313913684
neg_mean_squared_error -220.68398699594732
r2 -0.06356319395170532
78 2557
   dwell_time  Predicted
0          21  16.316128
1          15  16.371532
2          11  13.97497

neg_mean_absolute_error -3.5489131797218945
neg_mean_squared_error -186.53639677783303
r2 -4.186875832743742
95 4004
   dwell_time  Predicted
0           0   0.219498
1           0  -0.173319
2           0  -0.506404
3           0   0.044338
4           0   1.115553
5           0  -0.177984
6          11   6.744929
7           0   3.105542
8           0   0.155003
9           0   0.711937
neg_mean_absolute_error -2.3732577359377034
neg_mean_squared_error -15.859025364152279
r2 0.08125167253446738
96 4005
   dwell_time  Predicted
0           0   1.411938
1           9   8.068599
2           0   0.399254
3           0   1.274914
4           0  -0.222554
5          10  10.565848
6           0  -0.151238
7          11   8.288244
8           0   0.648678
9           0  -0.052692
neg_mean_absolute_error -1.5836875621438578
neg_mean_squared_error -10.268796449691186
r2 0.2277939276751348
97 4006
   dwell_time  Predicted
0           0  -0.301858
1          10   7.930977
2           0   0.84895

neg_mean_absolute_error -0.33110417494820626
neg_mean_squared_error -9.57539554144129
r2 -6.615610204602177
114 4021
   dwell_time  Predicted
0           0   0.038177
1           0  -0.067998
2           0   3.382888
3          10  10.791851
4           0  -0.046174
5           0   0.075732
6           0  -0.020579
7          12  12.640432
8           0  -0.082421
9           0   0.548004
neg_mean_absolute_error -1.7308642407829762
neg_mean_squared_error -25.45923614831821
r2 0.21293215289812037
115 4022
   dwell_time  Predicted
0           0   0.052446
1           0  -0.017086
2           0   0.370179
3           0   0.204347
4           0  -0.039422
5           0  -0.035416
6           9   9.224475
7           0  -0.833378
8           0  -0.136950
9           0  -0.007135
neg_mean_absolute_error -1.0611124154257572
neg_mean_squared_error -6.054008585522236
r2 0.3952293995077744
116 4023
   dwell_time  Predicted
0           0  -0.067676
1           0  -0.037558
2           0   0.09280

neg_mean_absolute_error -6.740384736733224
neg_mean_squared_error -89.45254673027196
r2 -0.13354770938078558
133 1157
   dwell_time  Predicted
0          10   7.966262
1          11  11.750759
2          64  43.110527
3          19  20.369507
4           0   2.476481
5          12  10.532814
6          25  19.671829
7           0   4.261415
8          12  14.262747
9           8  10.814214
neg_mean_absolute_error -9.55714940382054
neg_mean_squared_error -170.51847027151945
r2 -0.03794623536833932
134 1158
   dwell_time  Predicted
0          36  35.121132
1           0   2.518744
2           0  -0.179280
3           0   0.141582
4          12   4.796889
5           0  -0.522085
6          11   7.872243
7           0   0.375499
8           0   0.498087
9           0   2.754212
neg_mean_absolute_error -5.955694729499042
neg_mean_squared_error -71.878460201441
r2 0.1063684777600565
135 1159
   dwell_time  Predicted
0           0   0.497880
1           0   2.448954
2           0   0.926056


neg_mean_absolute_error -10.267669508429979
neg_mean_squared_error -264.96187185966585
r2 0.16973196838014035
152 1352
   dwell_time  Predicted
0          27  11.219386
1           0   2.952876
2          11  14.434750
3          13  14.872032
4           0   1.080304
5           0   9.914262
6          17  16.396729
7          22  19.439470
8          32  29.724365
9          17  16.709009
neg_mean_absolute_error -7.706019615514931
neg_mean_squared_error -154.18873790371225
r2 -0.23051789835464084
153 1353
   dwell_time  Predicted
0          15  11.551431
1          21  18.612150
2           0   0.958766
3          17   9.282383
4          29  15.942592
5           0   5.763946
6           0   6.298308
7          12  12.244824
8          20  14.763133
9          19  17.573599
neg_mean_absolute_error -8.609194431986912
neg_mean_squared_error -361.2172334465801
r2 -0.04424413148482986
154 1355
   dwell_time  Predicted
0          11  10.398706
1          18  15.355017
2          24  23.7

neg_mean_absolute_error -2.769996325566331
neg_mean_squared_error -24.77563535673936
r2 -0.5935926962663288
171 7263
   dwell_time  Predicted
0           0   0.015589
1          11  10.983869
2           0  -0.002432
3          13  12.966723
4           0   0.023126
5           0  -0.003615
6           9   9.004711
7           0  -0.020803
8           0   0.022110
9          17  16.936298
neg_mean_absolute_error -3.412033908091974
neg_mean_squared_error -41.36764055560674
r2 -0.33785121471523605
172 7264
   dwell_time  Predicted
0           0   0.050445
1           7   6.977931
2           0  -0.001116
3           0   0.050534
4           0   0.037249
5           0  -0.016330
6           7   6.980910
7           0  -0.028133
8          15  14.996543
9           0   0.016196
neg_mean_absolute_error -3.4393677174970017
neg_mean_squared_error -61.46006942085351
r2 -0.33243349297309777
173 7268
   dwell_time  Predicted
0           0   0.039146
1          10   9.900401
2           0   0.003

In [34]:
# Collapse each metric's list of scores to its arithmetic mean, updating
# results_dict in place, then show the summary.
results_dict.update({name: np.average(values) for name, values in results_dict.items()})
results_dict

{'MAE': 1.6724124293623337,
 'RMSE': 2.4576358198304513,
 'R2': 0.9275958392762017,
 'cv_neg_mean_absolute_error': -6.197141023493632,
 'cv_neg_mean_squared_error': -1311.4497385389773,
 'cv_r2': -0.9494869623152908}

In [50]:
# Rebuild the working frame from `df` without the low-value features.
# DataFrame.drop returns a new frame, so `df` itself is left untouched.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas >= 2.0 no longer accepts.
df_rev1 = df.drop(columns=low_information_gain)

In [54]:

# One-hot encode df_rev1 with pandas' default column selection, replacing the
# frame with its encoded version.
df_rev1 = pd.get_dummies(data=df_rev1)
# Count how many columns there are of each dtype after encoding.
df_rev1.dtypes.value_counts()

uint8      207
int64        6
float64      1
dtype: int64

In [55]:
# Single model over the whole of df_rev1: fit on a 70% split, report in-sample
# scores, and cross-validate on all rows.
# NOTE(review): assumes df_rev1 has already been one-hot encoded by the
# preceding cell.
metrics_list = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
# MAE / RMSE / R2 are computed on the rows the model was fitted on (in-sample);
# the cv_* entries are the means over the cross-validation test folds.
results_dict = {"MAE": [],
                "RMSE": [],
                "R2": [],
                "cv_neg_mean_absolute_error": [],
                "cv_neg_mean_squared_error": [],
                "cv_r2": []}

# y is the target
y = df_rev1["dwell_time"]
# X is everything else.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
X = df_rev1.drop(columns=["dwell_time"])
# Split the dataset into two datasets: 70% training and 30% test.
# NOTE(review): the test part is not used for scoring in this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3)

# Reset the indexes so that y_train lines up row-by-row with the 0..n-1 index
# of the predictions frame in the concat below.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Train aka fit, a model using all continuous and categorical features.
multiple_linreg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42).fit(X_train, y_train)
multiple_linreg_predictions_train = multiple_linreg.predict(X_train)

# In-sample scores (predictions on the training rows vs. their targets).
results_dict["MAE"].append(metrics.mean_absolute_error(y_train, multiple_linreg_predictions_train))
results_dict["RMSE"].append(metrics.mean_squared_error(y_train, multiple_linreg_predictions_train)**0.5)
results_dict["R2"].append(metrics.r2_score(y_train, multiple_linreg_predictions_train))

actual_vs_predicted_multiplelinreg = pd.concat([y_train, pd.DataFrame(multiple_linreg_predictions_train, columns=['Predicted'])], axis=1)
print(actual_vs_predicted_multiplelinreg.head(10))

# Cross-validate a fresh, identically configured model on all rows.
scores = cross_validate(xgb.XGBRegressor(objective="reg:squarederror", random_state=42), X, y, scoring=metrics_list, cv=5)
for metric in metrics_list:
    print( metric, np.average(scores["test_" + metric]) )
    results_dict["cv_" + metric].append(np.average(scores["test_" + metric]))
print("================================================")

   dwell_time  Predicted
0          11   7.071722
1           0   1.751993
2           0   2.765282
3          10   9.346604
4           0   3.014282
5          26  17.010691
6           0   6.686565
7          11   9.804345
8           0   1.984495
9          65  38.277916
neg_mean_absolute_error -5.79888501924283
neg_mean_squared_error -343.3852848994117
r2 0.49445618476673214


In [56]:
# Fit the all-stops model and predict dwell_time for every row of df_rev1.
# NOTE(review): metrics_list / results_dict are reset here but not filled in
# by this cell.
metrics_list = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
results_dict = {"MAE": [],
                "RMSE": [],
                "R2": [],
                "cv_neg_mean_absolute_error": [],
                "cv_neg_mean_squared_error": [],
                "cv_r2": []}

# y is the target
y = df_rev1["dwell_time"]
# X is everything else.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
X = df_rev1.drop(columns=["dwell_time"])

# Re-create the 70% training split inside this cell instead of relying on
# X_train / y_train left in the kernel by another cell. random_state and
# test_size match the split used for the all-stops model, so the same rows are
# selected.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y, random_state=1, test_size=0.3)

# Train aka fit, a model using all continuous and categorical features.
multiple_linreg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42).fit(X_fit, y_fit)
# Predict for every row of X: despite the variable name, this covers both the
# rows the model was fitted on and the held-out rows.
multiple_linreg_predictions_train = multiple_linreg.predict(X)



In [57]:
# Inspect the prediction array; as the cell's only (bare) expression it is
# rendered as the cell output.
multiple_linreg_predictions_train

array([22.881056 , 11.180227 ,  4.5320354, ...,  2.5123992, 36.831997 ,
        0.7453068], dtype=float32)

In [60]:
# Attach the predictions to the original frame as a new column. The array is
# assigned by position, so it must hold one value per row of df, in df's order.
# NOTE(review): df is modified in place. The cells that build df_rev1 copy df
# and drop only `low_information_gain`, so re-running them after this point
# would carry dwell_predictions into the feature matrix.
df["dwell_predictions"] = multiple_linreg_predictions_train

In [64]:
# Write the frame, including the new prediction column, to out.csv.
# index=True (the default) is spelled out: the row index is written as the
# first column of the file.
df.to_csv("out.csv", index=True)