In [1]:
#Import package pandas for data analysis
import pandas as pd
# Import package numpy for numeric computing
import numpy as np
import seaborn as sns
# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

import pymysql

import matplotlib.pyplot as plt

#For showing plots directly in the notebook run the command below
%matplotlib inline

# For saving multiple plots into a single pdf file
from matplotlib.backends.backend_pdf import PdfPages

import scipy.stats as ss

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

import pickle
import random

In [2]:
df = pd.read_csv('line_65_segments.csv', keep_default_na=True, delimiter=',')
segs_to_drop = list((df["segment_id"].value_counts()[df["segment_id"].value_counts() < 100]).index)

df = df.query(f'segment_id not in {segs_to_drop}')
df = df.drop(columns=['Unnamed: 0', "LINEID"])
df["DIRECTION"] = df["DIRECTION"].astype('category')
#change datatypes of some features
df['DAYOFSERVICE'] = df['DAYOFSERVICE'].astype('datetime64') #convert DAYOFSERVICE to datetime
df['DAYOFSERVICE']=df['DAYOFSERVICE'].apply(lambda x: x.toordinal()) #then convert it to numeric
df['dayofweek'] = df['dayofweek'].astype('category')
df['hour'] = df['hour'].astype('category')

# Drop trips that are not full journeys
grouped_df = df.groupby(["DAYOFSERVICE","TRIPID"])
trips_to_keep1 = grouped_df.first()[grouped_df["PROGRNUMBER"].first() == 2 ].index.to_list()
trips_to_keep2 = grouped_df.last()[(grouped_df["PROGRNUMBER"].last() == 77) |( grouped_df["PROGRNUMBER"].last() == 74)].index.to_list()
trips_to_keep = trips_to_keep1 + trips_to_keep2
df = df.reset_index(drop=True)
df = df[pd.Series(list(zip(df['DAYOFSERVICE'], df['TRIPID']))).isin(trips_to_keep)]


In [3]:
df = df.reset_index(drop=True)
trips = list(grouped_df.first().index)
k = len(trips) // 3
random.seed(17)
trips_to_test = random.sample(trips, k)
train_df = df[~pd.Series(list(zip(df['DAYOFSERVICE'], df['TRIPID']))).isin(trips_to_test)]
test_df = df[pd.Series(list(zip(df['DAYOFSERVICE'], df['TRIPID']))).isin(trips_to_test)]

In [4]:
import xgboost as xgb
low_information_gain = ['rain',
                        'wind_dir',
                        'cloud_cover',
                        'wind_speed',
                        'pressure',
                        'humidity',
                        'sun',
                        "PROGRNUMBER",
                        "ACTUALTIME_DEP",
                        "ACTUALTIME_ARR",
                        "dwell_time",
                        "prev_stop_id",
                        "prev_progrnumber",
                        "DAYOFSERVICE",
                        "DIRECTION",
                        "TRIPID",
                        "STOPPOINTID"]
df_rev1 = train_df.copy()
# drop low value features
df_rev1.drop(low_information_gain, 1, inplace=True)

segment_models = {}

segments = df_rev1.segment_id.unique()

for i, seg in enumerate(segments):
    print(i, seg)
    seg_df = df_rev1.copy()
    seg_df = seg_df[seg_df["segment_id"] == seg]
    seg_df.drop(["segment_id"], 1, inplace=True)
    seg_df = pd.get_dummies(seg_df)

    # y is the target
    y = seg_df["journey_time"]
    # X is everything else
    X = seg_df.drop(["journey_time"],1)

    # Train aka fit, a model using all continuous and categorical features.
    segment_models[seg] = xgb.XGBRegressor(objective="reg:squarederror", random_state=42).fit(X, y)

0 7564-4521
1 4521-1283
2 1283-4456
3 4456-1284
4 1284-1285
5 1285-1016
6 1016-1017
7 1017-1018
8 1018-1019
9 1019-1020
10 1020-1076
11 1076-1077
12 1077-1078
13 1078-1079
14 1079-1080
15 1080-1081
16 1081-1082
17 1082-1083
18 1083-1085
19 1085-1121
20 1121-1122
21 1122-1123
22 1123-1124
23 1124-1125
24 1125-1127
25 1127-1130
26 1130-2550
27 2550-2551
28 2551-2553
29 2553-2554
30 5111-4051
31 4051-4052
32 4052-4054
33 4054-4055
34 4055-4057
35 4057-4101
36 4101-4102
37 4102-4058
38 4058-4059
39 4059-4060
40 4060-4061
41 4061-4062
42 4062-4063
43 4063-4064
44 4064-4056
45 4056-4133
46 4133-4065
47 4065-4066
48 4066-4067
49 4067-4068
50 4068-4069
51 4069-4070
52 4070-4071
53 4071-4072
54 4072-4073
55 4073-2583
56 2583-2584
57 2584-2585
58 2585-2586
59 2586-2587
60 2587-2588
61 2588-2589
62 2589-2590
63 2590-2591
64 2591-2592
65 2592-2357
66 2357-2358
67 2358-2359
68 2359-2360
69 2360-4348
70 4348-4646
71 4646-4647
72 4647-2346
73 2346-4435
74 4435-2594
75 2594-2595
76 2554-2555
77 2555-2

In [5]:
def get_prediction(segment, x):
    try:
        model = segment_models[segment]
        return model.predict(x)
    except:
        print("No model for", segment)
        return -999
    
df_rev1 = test_df.copy()
df_rev1.drop(low_information_gain, 1, inplace=True)
predictions = {}
for seg in df_rev1.segment_id.unique():
    print(".", end="")
    seg_df = df_rev1.copy()
    seg_df = seg_df[seg_df["segment_id"] == seg]
    seg_df.drop(["segment_id"], 1, inplace=True)
    seg_df = pd.get_dummies(seg_df)

    # y is the target
    y = seg_df["journey_time"]
    # X is everything else
    X = seg_df.drop(["journey_time"],1)
    seg_df["predicted_journey"] = get_prediction(seg, X)
    predictions[seg] = seg_df["predicted_journey"]

result = pd.concat(list(predictions.values()))

..........................................................................................................................................................................................

In [6]:
test_df = test_df.join(result) 
test_df

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,DIRECTION,ACTUALTIME_DEP,ACTUALTIME_ARR,hour,dayofweek,journey_time,...,pressure,humidity,wind_speed,wind_dir,sun,visibility,cloud_height,cloud_cover,holiday,predicted_journey
158,736695,5956288,2,7280,2,41414,41414,11,0,28,...,7.7,7,22,250,0.5,30000,200,4,1,36.249516
159,736695,5956288,3,7281,2,41476,41476,11,0,62,...,7.7,7,22,250,0.5,30000,200,4,1,60.610733
160,736695,5956288,4,7284,2,41529,41529,11,0,53,...,7.7,7,22,250,0.5,30000,200,4,1,57.915367
161,736695,5956288,5,7287,2,41611,41611,11,0,82,...,7.7,7,22,250,0.5,30000,200,4,1,78.468460
162,736695,5956288,6,7208,2,41708,41708,11,0,97,...,7.7,7,22,250,0.5,30000,200,4,1,99.200272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633023,737059,8589898,71,4022,1,28180,28180,7,0,23,...,9.1,9,6,230,0.0,30000,25,7,0,23.421541
633024,737059,8589898,72,4023,1,28214,28214,7,0,34,...,9.1,9,6,230,0.0,30000,25,7,0,38.804291
633025,737059,8589898,73,4024,1,28233,28233,7,0,19,...,9.1,9,6,230,0.0,30000,25,7,0,22.674963
633026,737059,8589898,74,4027,1,28276,28276,7,0,43,...,9.1,9,6,230,0.0,30000,25,7,0,48.276489


In [7]:
low_information_gain = ['rain',
                        'wind_dir',
                        'cloud_cover',
                        'wind_speed',
                        'pressure',
                        'humidity',
                        'sun',
                        "PROGRNUMBER",
                        "prev_dept_time",
                        "ACTUALTIME_DEP",
                        "prev_stop_id",
                        "prev_progrnumber",
                        "DAYOFSERVICE",
                        "DIRECTION",
                        "segment_id"]
# drop the useless column
df_rev2 = train_df.copy()
# drop low value features
df_rev2.drop(low_information_gain, 1, inplace=True)
df_rev2 = pd.get_dummies(df_rev2)
# y is the target
y = df_rev2["dwell_time"]
# X is everything else
X = df_rev2.drop(["dwell_time"],1)


# Train aka fit, a model using all continuous and categorical features.
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42).fit(X, y)



In [8]:
# drop the useless column
df_rev3 = test_df.copy()
# drop low value features
df_rev3.drop(low_information_gain, 1, inplace=True)
df_rev3 = pd.get_dummies(df_rev3)

X = df_rev3.drop(["dwell_time","predicted_journey"],1)

xgb_reg_dwell_predictions = xgb_reg.predict(X)

In [9]:
test_df["predicted_dwell"] = xgb_reg_dwell_predictions

In [10]:
df_predictions = test_df.copy()

In [11]:
grouped_df = df_predictions.groupby(["DAYOFSERVICE","TRIPID"])

df_predictions['total_journey'] = grouped_df["journey_time"].transform('sum')
df_predictions['total_predicted_journey'] = grouped_df["predicted_journey"].transform('sum')

df_predictions['total_dwell'] = grouped_df["dwell_time"].transform('sum')
df_predictions['total_predicted_dwell'] = grouped_df["predicted_dwell"].transform('sum')

df_predictions["overall_time"] = df_predictions["total_journey"] + df_predictions["total_dwell"]

df_predictions["overall_predicted_time"] = df_predictions["total_predicted_journey"] + df_predictions["total_predicted_dwell"]

In [12]:
df_predictions["difference"] = df_predictions["overall_predicted_time"] - df_predictions["overall_time"] 

In [13]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

mae = mean_absolute_error(df_predictions["overall_time"], df_predictions["overall_predicted_time"])
rmse = mean_squared_error(df_predictions["overall_time"], df_predictions["overall_predicted_time"], squared=False)
r2 = r2_score(df_predictions["overall_time"], df_predictions["overall_predicted_time"])

In [14]:
print("MAE:", mae)
print("RMSE:", rmse)
print("R2:", r2)

MAE: 329.9186482082103
RMSE: 494.79555712393346
R2: 0.839342279358107


In [15]:
rmse

494.79555712393346