In [87]:
#Import package pandas for data analysis
import pandas as pd
# Import package numpy for numeric computing
import numpy as np
import seaborn as sns
# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

import pymysql

import matplotlib.pyplot as plt

#For showing plots directly in the notebook run the command below
%matplotlib inline

# For saving multiple plots into a single pdf file
from matplotlib.backends.backend_pdf import PdfPages

import scipy.stats as ss

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

import pickle

In [61]:
# Load the per-segment records from the CSV export.
df = pd.read_csv('line_65_segments.csv', keep_default_na=True, delimiter=',')

# Segments with fewer than 100 rows are removed; count once and reuse.
segment_counts = df["segment_id"].value_counts()
segs_to_drop = list(segment_counts[segment_counts < 100].index)

# Boolean mask instead of interpolating the list repr into a query string:
# works for an empty list and for ids containing quote characters.
df = df[~df["segment_id"].isin(segs_to_drop)]
df = df.drop(columns=['Unnamed: 0', "LINEID"])
df["DIRECTION"] = df["DIRECTION"].astype('category')
#change datatypes of some features
# pd.to_datetime replaces astype('datetime64'): the unit-less dtype string
# is rejected by pandas >= 2.0.
df['DAYOFSERVICE'] = pd.to_datetime(df['DAYOFSERVICE']) #convert DAYOFSERVICE to datetime
df['DAYOFSERVICE'] = df['DAYOFSERVICE'].apply(lambda x: x.toordinal()) #then convert it to numeric
df['dayofweek'] = df['dayofweek'].astype('category')
df['hour'] = df['hour'].astype('category')


In [63]:
# One group per trip, keyed by (DAYOFSERVICE, TRIPID).
grouped_df = df.groupby(["DAYOFSERVICE", "TRIPID"])

# Only PROGRNUMBER is needed, so aggregate that column alone instead of
# running first()/last() over every column just to borrow the index.
# NOTE(review): first()/last() take the first/last non-null value in row
# order — this assumes rows within a trip are ordered by PROGRNUMBER; confirm.
first_progrnumber = grouped_df["PROGRNUMBER"].first()
last_progrnumber = grouped_df["PROGRNUMBER"].last()

# Trips whose first recorded PROGRNUMBER is 2.
trips_to_keep1 = first_progrnumber[first_progrnumber == 2].index.to_list()
# Trips whose last recorded PROGRNUMBER is 77 or 74.
trips_to_keep2 = last_progrnumber[last_progrnumber.isin([77, 74])].index.to_list()

In [64]:
# Combine both selections into one list of (DAYOFSERVICE, TRIPID) keys.
# NOTE(review): concatenation is a union — a trip is kept if it matches
# EITHER list. If only trips matching both were intended, this needs an
# intersection instead; confirm.
trips_to_keep = [*trips_to_keep1, *trips_to_keep2]

# Renumber rows 0..n-1 so the key Series built below lines up with df.
df = df.reset_index(drop=True)
trip_keys = pd.Series(list(zip(df["DAYOFSERVICE"], df["TRIPID"])))
keep_mask = trip_keys.isin(trips_to_keep)
df = df[keep_mask]

In [81]:
# Per-trip sums, broadcast back onto every row of the trip via transform.
grouped_df = df.groupby(["DAYOFSERVICE", "TRIPID"])
for total_col, source_col in (
    ("total_journey", "journey_time"),
    ("total_dwell", "dwell_time"),
):
    df[total_col] = grouped_df[source_col].transform("sum")

# Overall trip time is the sum of the journey and dwell totals.
df["overall_time"] = df["total_journey"].add(df["total_dwell"])

In [91]:
# Columns removed from the feature frame before the per-segment models are
# applied. Order is kept as originally written.
low_information_gain = [
    # weather-related columns (judging by name)
    "rain",
    "wind_dir",
    "cloud_cover",
    "wind_speed",
    "pressure",
    "humidity",
    "sun",
    # stop / trip identifiers and raw timing columns
    "PROGRNUMBER",
    "ACTUALTIME_DEP",
    "ACTUALTIME_ARR",
    "dwell_time",
    "prev_stop_id",
    "prev_progrnumber",
    "DAYOFSERVICE",
    "DIRECTION",
    "TRIPID",
    "STOPPOINTID",
    # per-trip aggregates derived from the target / dwell columns
    "overall_time",
    "total_dwell",
    "total_journey",
]

In [93]:
def get_prediction(segment, x, model_dir='C:/Users/cls15/Google Drive/Comp Sci/Research Practicum/Code/dublin-bus-app/DataAnalytics/Conor/pickels'):
    """Load the pickled model for one segment and return its predictions.

    Parameters
    ----------
    segment
        Segment identifier; the model is read from
        ``<model_dir>/<segment>.pickle``.
    x
        Feature matrix, passed unchanged to the model's ``predict`` method.
    model_dir
        Directory holding the per-segment pickles. Defaults to the location
        that was previously hard-coded, so existing two-argument calls
        behave as before; pass another directory to run on a different
        machine.

    Returns
    -------
    Whatever ``model.predict(x)`` returns.

    Raises
    ------
    FileNotFoundError
        Raised by ``open`` when no pickle exists for ``segment``.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point model_dir at pickles produced by a trusted source.
    with open(f'{model_dir}/{segment}.pickle', 'rb') as f:
        model = pickle.load(f)
    # Predict after the file handle is closed; the model is fully loaded.
    return model.predict(x)



# Feature frame: df without the columns listed in low_information_gain.
# Keyword `columns=` is used because the positional axis argument
# (`drop(labels, 1)`) was removed in pandas 2.0.
df_rev1 = df.drop(columns=low_information_gain)

# Maps segment_id -> Series of predicted journey times, indexed like df.
predictions = {}

# groupby hands over each segment's rows directly, instead of copying the
# whole frame once per segment and filtering it. sort=False visits segments
# in order of first appearance, matching df.segment_id.unique().
for seg, seg_rows in df_rev1.groupby("segment_id", sort=False):
    # One-hot encode the remaining categorical columns for this segment.
    seg_df = pd.get_dummies(seg_rows.drop(columns=["segment_id"]))

    # X is every column except the target (journey_time).
    X = seg_df.drop(columns=["journey_time"])
    seg_df["predicted_journey"] = get_prediction(seg, X)
    predictions[seg] = seg_df["predicted_journey"]

# Stack the per-segment Series; the original row index is preserved so the
# result can be joined back onto df.
result = pd.concat(list(predictions.values()))


Unnamed: 0,hour,dayofweek,journey_time,prev_dept_time,segment_id,temp,visibility,cloud_height,holiday,predicted_journey,DAYOFSERVICE,PROGRNUMBER
0,9,0,130,34667,7564-4521,4.6,30000,999,1,145.713287,736695,2
1,9,0,77,34810,4521-1283,4.6,30000,999,1,109.635406,736695,3
2,9,0,39,34887,1283-4456,4.6,30000,999,1,43.710220,736695,4
3,9,0,22,34926,4456-1284,4.6,30000,999,1,31.118746,736695,5
4,9,0,52,34957,1284-1285,4.6,30000,999,1,64.406311,736695,6
...,...,...,...,...,...,...,...,...,...,...,...,...
738182,7,0,23,28157,4021-4022,9.6,30000,25,0,25.403379,737059,71
738183,7,0,34,28180,4022-4023,9.6,30000,25,0,36.691921,737059,72
738184,7,0,19,28214,4023-4024,9.6,30000,25,0,23.014582,737059,73
738185,7,0,43,28233,4024-4027,9.6,30000,25,0,51.870636,737059,74


In [96]:

# Attach the predicted journey times to the full frame; join aligns on the
# row index and keeps every row of df (left join is the default).
df_predictions = df.join(result, how="left")

df_predictions

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,DIRECTION,ACTUALTIME_DEP,ACTUALTIME_ARR,hour,dayofweek,journey_time,...,wind_dir,sun,visibility,cloud_height,cloud_cover,holiday,total_journey,total_dwell,overall_time,predicted_journey
0,736695,5956287,2,4521,1,34810,34797,9,0,130,...,240,0.2,30000,999,3,1,3458,224,3682,145.713287
1,736695,5956287,3,1283,1,34887,34887,9,0,77,...,240,0.2,30000,999,3,1,3458,224,3682,109.635406
2,736695,5956287,4,4456,1,34926,34926,9,0,39,...,240,0.2,30000,999,3,1,3458,224,3682,43.710220
3,736695,5956287,5,1284,1,34957,34948,9,0,22,...,240,0.2,30000,999,3,1,3458,224,3682,31.118746
4,736695,5956287,6,1285,1,35009,35009,9,0,52,...,240,0.2,30000,999,3,1,3458,224,3682,64.406311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738182,737059,8589898,71,4022,1,28180,28180,7,0,23,...,230,0.0,30000,25,7,0,2750,272,3022,25.403379
738183,737059,8589898,72,4023,1,28214,28214,7,0,34,...,230,0.0,30000,25,7,0,2750,272,3022,36.691921
738184,737059,8589898,73,4024,1,28233,28233,7,0,19,...,230,0.0,30000,25,7,0,2750,272,3022,23.014582
738185,737059,8589898,74,4027,1,28276,28276,7,0,43,...,230,0.0,30000,25,7,0,2750,272,3022,51.870636


In [98]:
# Per-trip sums of the actual and predicted columns, broadcast to each row.
grouped_df = df_predictions.groupby(["DAYOFSERVICE", "TRIPID"])
for total_col, source_col in (
    ("total_journey", "journey_time"),
    ("total_predicted_journey", "predicted_journey"),
    ("total_dwell", "dwell_time"),
):
    df_predictions[total_col] = grouped_df[source_col].transform("sum")

# Actual overall trip time: journey total plus dwell total.
df_predictions["overall_time"] = df_predictions["total_journey"].add(
    df_predictions["total_dwell"]
)


In [99]:
# Display the frame after the per-trip totals were added (notebook output).
df_predictions

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,DIRECTION,ACTUALTIME_DEP,ACTUALTIME_ARR,hour,dayofweek,journey_time,...,sun,visibility,cloud_height,cloud_cover,holiday,total_journey,total_dwell,overall_time,predicted_journey,total_predicted_journey
0,736695,5956287,2,4521,1,34810,34797,9,0,130,...,0.2,30000,999,3,1,3458,224,3682,145.713287,3672.398438
1,736695,5956287,3,1283,1,34887,34887,9,0,77,...,0.2,30000,999,3,1,3458,224,3682,109.635406,3672.398438
2,736695,5956287,4,4456,1,34926,34926,9,0,39,...,0.2,30000,999,3,1,3458,224,3682,43.710220,3672.398438
3,736695,5956287,5,1284,1,34957,34948,9,0,22,...,0.2,30000,999,3,1,3458,224,3682,31.118746,3672.398438
4,736695,5956287,6,1285,1,35009,35009,9,0,52,...,0.2,30000,999,3,1,3458,224,3682,64.406311,3672.398438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738182,737059,8589898,71,4022,1,28180,28180,7,0,23,...,0.0,30000,25,7,0,2750,272,3022,25.403379,2953.987549
738183,737059,8589898,72,4023,1,28214,28214,7,0,34,...,0.0,30000,25,7,0,2750,272,3022,36.691921,2953.987549
738184,737059,8589898,73,4024,1,28233,28233,7,0,19,...,0.0,30000,25,7,0,2750,272,3022,23.014582,2953.987549
738185,737059,8589898,74,4027,1,28276,28276,7,0,43,...,0.0,30000,25,7,0,2750,272,3022,51.870636,2953.987549


In [102]:
# Load the frame stored in out.csv.
# NOTE(review): this rebinds `df`, which until here held the segment data
# loaded from line_65_segments.csv; a distinct name would avoid confusion.
df = pd.read_csv("out.csv", sep=",", keep_default_na=True)

In [104]:
# Remove the column "Unnamed: 0". The keyword form is used because the
# positional axis argument (`drop(labels, 1)`) was removed in pandas 2.0.
df = df.drop(columns=["Unnamed: 0"])

In [110]:
# Show the dwell_predictions column of the frame loaded from out.csv.
df.loc[:, "dwell_predictions"]

0         22.881056
1         11.180227
2          4.532035
3          9.873302
4         10.247228
            ...    
738657     3.246735
738658    -0.046151
738659     2.512399
738660    36.831997
738661     0.745307
Name: dwell_predictions, Length: 738662, dtype: float64

In [113]:
# Attach the dwell predictions by row index.
# Any existing dwell_predictions column is dropped first so the cell can be
# re-run: joining the same column twice raises on the overlapping name.
# NOTE(review): alignment is purely positional via the index — this assumes
# the row labels of `df` correspond to the same rows in df_predictions; confirm.
df_predictions = (
    df_predictions
    .drop(columns=["dwell_predictions"], errors="ignore")
    .join(df["dwell_predictions"])
)

In [114]:
# Display the frame after dwell_predictions was joined on (notebook output).
df_predictions

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,DIRECTION,ACTUALTIME_DEP,ACTUALTIME_ARR,hour,dayofweek,journey_time,...,visibility,cloud_height,cloud_cover,holiday,total_journey,total_dwell,overall_time,predicted_journey,total_predicted_journey,dwell_predictions
0,736695,5956287,2,4521,1,34810,34797,9,0,130,...,30000,999,3,1,3458,224,3682,145.713287,3672.398438,22.881056
1,736695,5956287,3,1283,1,34887,34887,9,0,77,...,30000,999,3,1,3458,224,3682,109.635406,3672.398438,11.180227
2,736695,5956287,4,4456,1,34926,34926,9,0,39,...,30000,999,3,1,3458,224,3682,43.710220,3672.398438,4.532035
3,736695,5956287,5,1284,1,34957,34948,9,0,22,...,30000,999,3,1,3458,224,3682,31.118746,3672.398438,9.873302
4,736695,5956287,6,1285,1,35009,35009,9,0,52,...,30000,999,3,1,3458,224,3682,64.406311,3672.398438,10.247228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738182,737059,8589898,71,4022,1,28180,28180,7,0,23,...,30000,25,7,0,2750,272,3022,25.403379,2953.987549,8.070208
738183,737059,8589898,72,4023,1,28214,28214,7,0,34,...,30000,25,7,0,2750,272,3022,36.691921,2953.987549,4.837013
738184,737059,8589898,73,4024,1,28233,28233,7,0,19,...,30000,25,7,0,2750,272,3022,23.014582,2953.987549,4.803165
738185,737059,8589898,74,4027,1,28276,28276,7,0,43,...,30000,25,7,0,2750,272,3022,51.870636,2953.987549,0.574702


In [116]:
# Per-trip sums of actual and predicted journey/dwell columns, broadcast to
# every row of the trip. dwell_predictions values are summed as-is.
grouped_df = df_predictions.groupby(["DAYOFSERVICE", "TRIPID"])
for total_col, source_col in (
    ("total_journey", "journey_time"),
    ("total_predicted_journey", "predicted_journey"),
    ("total_dwell", "dwell_time"),
    ("total_predicted_dwell", "dwell_predictions"),
):
    df_predictions[total_col] = grouped_df[source_col].transform("sum")

# Actual overall time: journey total plus dwell total.
df_predictions["overall_time"] = df_predictions["total_journey"].add(
    df_predictions["total_dwell"]
)

# Predicted overall time: predicted journey total plus predicted dwell total.
df_predictions["overall_predicted_time"] = df_predictions["total_predicted_journey"].add(
    df_predictions["total_predicted_dwell"]
)

In [119]:
# Show the trip keys next to the actual and predicted totals.
df_predictions[[
    "DAYOFSERVICE",
    "TRIPID",
    "PROGRNUMBER",
    "total_journey",
    "total_dwell",
    "overall_time",
    "predicted_journey",
    "total_predicted_journey",
    "dwell_predictions",
    "overall_predicted_time",
    "total_predicted_dwell",
]]

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,total_journey,total_dwell,overall_time,predicted_journey,total_predicted_journey,dwell_predictions,overall_predicted_time,total_predicted_dwell
0,736695,5956287,2,3458,224,3682,145.713287,3672.398438,22.881056,4127.814614,455.416176
1,736695,5956287,3,3458,224,3682,109.635406,3672.398438,11.180227,4127.814614,455.416176
2,736695,5956287,4,3458,224,3682,43.710220,3672.398438,4.532035,4127.814614,455.416176
3,736695,5956287,5,3458,224,3682,31.118746,3672.398438,9.873302,4127.814614,455.416176
4,736695,5956287,6,3458,224,3682,64.406311,3672.398438,10.247228,4127.814614,455.416176
...,...,...,...,...,...,...,...,...,...,...,...
738182,737059,8589898,71,2750,272,3022,25.403379,2953.987549,8.070208,3263.919481,309.931933
738183,737059,8589898,72,2750,272,3022,36.691921,2953.987549,4.837013,3263.919481,309.931933
738184,737059,8589898,73,2750,272,3022,23.014582,2953.987549,4.803165,3263.919481,309.931933
738185,737059,8589898,74,2750,272,3022,51.870636,2953.987549,0.574702,3263.919481,309.931933


In [120]:
# Signed error per row: predicted overall time minus actual overall time
# (positive means the prediction is larger than the actual value).
df_predictions["difference"] = df_predictions["overall_predicted_time"].sub(
    df_predictions["overall_time"]
)

In [134]:
# One row per (DAYOFSERVICE, TRIPID) group — the first non-null value of each
# column — limited to the first 40 groups.
(
    df_predictions
    .groupby(["DAYOFSERVICE", "TRIPID"])
    .first()
    .head(40)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,PROGRNUMBER,STOPPOINTID,DIRECTION,ACTUALTIME_DEP,ACTUALTIME_ARR,hour,dayofweek,journey_time,dwell_time,prev_stop_id,...,holiday,total_journey,total_dwell,overall_time,predicted_journey,total_predicted_journey,dwell_predictions,overall_predicted_time,total_predicted_dwell,difference
DAYOFSERVICE,TRIPID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
736695,5956287,2,4521,1,34810,34797,9,0,130,13,7564,...,1,3458,224,3682,145.713287,3672.398438,22.881056,4127.814614,455.416176,445.814614
736695,5956288,2,7280,2,41414,41414,11,0,28,0,7289,...,1,6149,494,6643,29.26334,6513.944824,1.707897,7179.540667,665.595843,536.540667
736695,5956289,2,4521,1,48781,48732,13,0,181,49,7564,...,1,3276,483,3759,219.890427,3377.932861,52.598495,3981.23633,603.303468,222.23633
736695,5956290,2,4051,2,55888,55821,15,0,64,67,5111,...,1,6645,384,7029,78.779793,6669.36084,88.6451,7313.598625,644.237785,284.598625
736695,5956291,2,4521,1,63188,63123,17,0,188,65,7564,...,1,4051,469,4520,274.785522,4252.936035,79.62354,5130.041277,877.105241,610.041277
736695,5956292,2,7280,2,68449,68449,19,0,30,0,7289,...,1,5021,215,5236,31.258791,4879.57666,1.650497,5290.647931,411.071271,54.647931
736695,5956293,2,4521,1,74048,74006,20,0,284,42,7564,...,1,3964,402,4366,275.356262,3914.331055,62.139896,4494.610941,580.279887,128.610941
736695,5956294,2,7280,2,79245,79245,22,0,32,0,7289,...,1,3369,273,3642,30.854471,3427.838135,2.067424,3704.268723,276.430589,62.268723
736695,5966066,2,4521,1,83857,83814,23,0,195,43,7564,...,1,2491,200,2691,188.983719,2659.438721,57.24484,3026.887082,367.448361,335.887082
736695,5966067,2,4051,2,87654,87638,0,0,51,16,5111,...,1,2606,148,2754,51.06749,2614.208008,17.83711,2738.996923,124.788915,-15.003077
