In [1]:
from sshtunnel import SSHTunnelForwarder
import pymysql
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import numpy as np
import matplotlib.patches as mpatches

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import r2_score #R square
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgbm
from sklearn.svm import SVR
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
import pca as pca

# ignore warnings
import warnings
warnings.filterwarnings('ignore')
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')

## code used to clean and create useful columns for each line

In [1747]:
df=pd.read_csv('routeWithPN/route67_WP.csv')

In [1748]:
#   add extra features: 'date', 'month', 'weekday', 'dep_hour', 'arr_hour'
df.DAYOFSERVICE = pd.to_datetime(df.DAYOFSERVICE) 
df['date'] = df['DAYOFSERVICE'].dt.day
df['month'] = df['DAYOFSERVICE'].dt.month
df['weekday'] = df['DAYOFSERVICE'].dt.weekday
df['dep_hour'] = (df['ACTUALTIME_DEP'] // 3600) - 1
df['arr_hour'] = (df['ACTUALTIME_ARR'] // 3600) - 1
    
#   drop the rows which arr_hour < 25
df = df.loc[df.arr_hour <= 24]


# Sort and reindex the df so that each trip is in order
df = df.sort_values(['DIRECTION', 'DAYOFSERVICE','TRIPID', 'PROGRNUMBER'], ascending=[True, True, True, True])
df = df.reset_index(drop=True)


df.loc[(df.PROGRNUMBER > df.PROGRNUMBER.shift(1)) &
(df.TRIPID == df.TRIPID.shift(1)) & 
(df.DIRECTION == df.DIRECTION.shift(1)) &
(df.DAYOFSERVICE == df.DAYOFSERVICE.shift(1))
, 'journey_time'] = (df.ACTUALTIME_ARR - df.ACTUALTIME_ARR.shift(1))

    
df.loc[(df.PROGRNUMBER > df.PROGRNUMBER.shift(1)) &
          (df.TRIPID == df.TRIPID.shift(1)) & 
          (df.DIRECTION == df.DIRECTION.shift(1)) &
          (df.DAYOFSERVICE == df.DAYOFSERVICE.shift(1))
          , 'planned_journey_time'] = (df.PLANNEDTIME_ARR - df.PLANNEDTIME_ARR.shift(1))
    

    
    # Calc dwell time
    # df_rev1_1['dwell_time'] = df_rev1_1.ACTUALTIME_DEP - df_rev1_1.ACTUALTIME_ARR

    # Get the previous stop's id
df.loc[(df.PROGRNUMBER > df.PROGRNUMBER.shift(1)) &
              (df.TRIPID == df.TRIPID.shift(1)) & 
              (df.DIRECTION == df.DIRECTION.shift(1)) &
              (df.DAYOFSERVICE == df.DAYOFSERVICE.shift(1))
              , 'prev_stop_id'] = df.STOPPOINTID.shift(1)

    # Create a segment id by concatenating prev stop id and currnet stop id
df["segment_id"] = df['prev_stop_id'].map(str) + "-" + df['STOPPOINTID'].map(str)

# If the progrnumber is 1, that is the first stop, ie there is no previous stop
# set the journey time, seg id and prev stop to NaN
df.loc[df['PROGRNUMBER'] == 1, "journey_time"] = np.nan
df.loc[df['PROGRNUMBER'] == 1, "prev_stop_id"] = np.nan
df.loc[df['PROGRNUMBER'] == 1, "segment_id"] = np.nan
    
    
    #  add isPeaktime feature
    #  value 1: 6:30 - 9:30, 15:30 - 18:30 
    #  value: otherwise
df['isPeaktime'] = 0
df['isPeaktime'] = np.where(((df.ACTUALTIME_DEP >= 23400) & (df.ACTUALTIME_DEP <= 34200)) |
                               ((df.ACTUALTIME_DEP >= 55800) & (df.ACTUALTIME_DEP <= 66600))
                               , 1, df['isPeaktime'])

In [1749]:
df['difference']=df['planned_journey_time'] - df['journey_time']

In [1750]:
df["line_mean_difference"]=df["difference"].mean()

In [1751]:
df["abs_line_mean_difference"]=df["difference"].abs().mean()

In [1752]:
df['absDiff']=df['difference'].abs()

In [1753]:
df["trip_mean_difference"]=df.groupby(df['TRIPID']).difference.transform('mean')

In [1754]:
df["abs_trip_mean_difference"]=df.groupby(df['TRIPID']).absDiff.transform('mean')

In [1755]:
df["abs_trip_sum_difference"]=df.groupby(df['TRIPID']).absDiff.transform('sum')

In [1756]:
df["trip_sum_difference"]=df.groupby(df['TRIPID']).difference.transform('sum')

In [1757]:
df = df.drop(columns=['Unnamed: 0','STOPPOINTID','PROGRNUMBER',
                      'ACTUALTIME_DEP','ACTUALTIME_ARR',
                      'prev_stop_id','dep_hour',
                      'PLANNEDTIME_ARR','PLANNEDTIME_DEP'
                           
                     ])

In [1758]:
def delete_null_row(df):
    lis = ['journey_time', 'planned_journey_time','segment_id']
    for i in lis:
        df.drop(df[df[i].isnull().values==True].index,inplace = True)
delete_null_row(df)

In [1759]:
df.to_csv('baselinemodel/line67_BASELINE_M.csv')

## code used to get the mean and absolute mean between journey time difference(df['planned_journey_time'] - df['journey_time']) for all routes

In [2]:
df=pd.read_csv('baselinemodel/line67_BASELINE_M.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,DIRECTION,date,month,weekday,arr_hour,journey_time,...,segment_id,isPeaktime,difference,line_mean_difference,abs_line_mean_difference,absDiff,trip_mean_difference,abs_trip_mean_difference,abs_trip_sum_difference,trip_sum_difference
0,1,2018-01-01,5956326,67,1,1,1,0,11,106.0,...,7391.0-493,0,12.0,-5.821722,22.355927,12.0,1.705882,17.852941,1214.0,116.0
1,2,2018-01-01,5956326,67,1,1,1,0,11,79.0,...,493.0-494,0,12.0,-5.821722,22.355927,12.0,1.705882,17.852941,1214.0,116.0
2,3,2018-01-01,5956326,67,1,1,1,0,11,87.0,...,494.0-495,0,23.0,-5.821722,22.355927,23.0,1.705882,17.852941,1214.0,116.0
3,4,2018-01-01,5956326,67,1,1,1,0,11,99.0,...,495.0-400,0,-10.0,-5.821722,22.355927,10.0,1.705882,17.852941,1214.0,116.0
4,5,2018-01-01,5956326,67,1,1,1,0,11,41.0,...,400.0-346,0,23.0,-5.821722,22.355927,23.0,1.705882,17.852941,1214.0,116.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518401,1571155,2018-12-31,8589775,67,2,31,12,0,12,28.0,...,1478.0-1479,0,26.0,-5.821722,22.355927,26.0,-12.907348,25.929712,8116.0,-4040.0
1518402,1571156,2018-12-31,8589775,67,2,31,12,0,12,158.0,...,1479.0-315,0,-17.0,-5.821722,22.355927,17.0,-12.907348,25.929712,8116.0,-4040.0
1518403,1571157,2018-12-31,8589775,67,2,31,12,0,13,352.0,...,315.0-406,0,-159.0,-5.821722,22.355927,159.0,-12.907348,25.929712,8116.0,-4040.0
1518404,1571158,2018-12-31,8589775,67,2,31,12,0,13,118.0,...,406.0-2810,0,-14.0,-5.821722,22.355927,14.0,-12.907348,25.929712,8116.0,-4040.0


In [2]:
import glob

path = r'baselinemodel' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

In [None]:
frame

In [6]:
frame["difference"].mean()

-6.309524004291475

In [7]:
frame["difference"].abs().mean()

26.07248421811801

In [None]:
frame["abs_trip_sum_difference"].mean()