In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split, cross_validate

from data import *
from params import *

In [2]:
# SQL sent to BigQuery: pulls every column of the cleaned train-delays table.
query = """
SELECT * FROM `train-delays-406412.train_delays.train_delays_cleaned`
"""

In [3]:
# Run the query against the configured GCP project and keep the result as `df`.
df = get_data_from_bq(
    query=query,
    gcp_project=GCP_PROJECT,
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head()

Unnamed: 0,TRUST_TRAIN_ID_AFFECTED,PLANNED_ORIG_WTT_DATETIME_AFF,PLANNED_DEST_WTT_DATETIME_AFF,TRAIN_SERVICE_CODE_AFFECTED,SERVICE_GROUP_CODE_AFFECTED,ENGLISH_DAY_TYPE,APP_TIMETABLE_FLAG_AFF,TRACTION_TYPE_AFFECTED,UNIT_CLASS_AFFECTED,RESPONSIBLE_MANAGER,INCIDENT_REASON,REACTIONARY_REASON_CODE,PERFORMANCE_EVENT_CODE,PFPI_MINUTES,Lat_OR,Lon_OR,STATION_OR,Lat_DES,Lon_DES,STATION_DES
0,522N48MJ08,2018-12-08 11:45:00,2018-12-08 12:47:00,22214000,EK01,SA,Y,EMU,375.0,MEKJ,MD,YI,M,9.0,51.54343,-0.02447,HACKNEY WICK,51.54343,-0.02447,HACKNEY WICK
1,522Y72MF19,2022-12-19 09:36:00,2022-12-19 10:39:00,22214000,EK01,WD,Y,EMU,378.0,MEKJ,MD,YI,M,12.0,51.54343,-0.02447,HACKNEY WICK,51.54343,-0.02447,HACKNEY WICK
2,522N20MZ30,2023-01-30 20:17:00,2023-01-30 21:19:00,22214000,EK01,WD,Y,EMU,378.0,XQHM,X8,YI,M,8.0,51.54343,-0.02447,HACKNEY WICK,51.54343,-0.02447,HACKNEY WICK
3,522N02MX24,2018-11-24 18:33:00,2018-11-24 19:32:00,22214000,EK01,SA,Y,EMU,375.0,TEKA,TG,YI,M,10.0,51.54343,-0.02447,HACKNEY WICK,51.54343,-0.02447,HACKNEY WICK
4,522Y08MX24,2018-11-24 18:53:00,2018-11-24 19:55:00,22214000,EK01,SA,Y,EMU,375.0,TEKA,TG,YI,M,11.0,51.54343,-0.02447,HACKNEY WICK,51.54343,-0.02447,HACKNEY WICK


In [5]:
# Split the planned origin timestamp into month / day / hour / minute features
# and drop the raw timestamp column.
#
# The timestamp is parsed into a local Series instead of being written back
# with `df.loc[:, col] = ...`: under pandas >= 2.0 a full-column `.loc`
# assignment sets values in place and can leave the column with object dtype
# (if it arrived as strings), which makes the `.dt` accessor raise.
# `assign`/`drop` return a new frame, so no `inplace=True` mutation is needed.
orig_planned = pd.to_datetime(df['PLANNED_ORIG_WTT_DATETIME_AFF'])
df = df.assign(
    ORIG_MONTH=orig_planned.dt.month,
    ORIG_DAY=orig_planned.dt.day,
    ORIG_HOUR=orig_planned.dt.hour,
    ORIG_MINUTE=orig_planned.dt.minute,
).drop(columns='PLANNED_ORIG_WTT_DATETIME_AFF')

In [6]:
# Check that the ORIG_* columns were appended and the raw origin timestamp is gone.
df.head()

Unnamed: 0,TRUST_TRAIN_ID_AFFECTED,PLANNED_DEST_WTT_DATETIME_AFF,TRAIN_SERVICE_CODE_AFFECTED,SERVICE_GROUP_CODE_AFFECTED,ENGLISH_DAY_TYPE,APP_TIMETABLE_FLAG_AFF,TRACTION_TYPE_AFFECTED,UNIT_CLASS_AFFECTED,RESPONSIBLE_MANAGER,INCIDENT_REASON,...,Lat_OR,Lon_OR,STATION_OR,Lat_DES,Lon_DES,STATION_DES,ORIG_MONTH,ORIG_DAY,ORIG_HOUR,ORIG_MINUTE
0,522N48MJ08,2018-12-08 12:47:00,22214000,EK01,SA,Y,EMU,375.0,MEKJ,MD,...,51.54343,-0.02447,HACKNEY WICK,51.54343,-0.02447,HACKNEY WICK,12,8,11,45
1,522Y72MF19,2022-12-19 10:39:00,22214000,EK01,WD,Y,EMU,378.0,MEKJ,MD,...,51.54343,-0.02447,HACKNEY WICK,51.54343,-0.02447,HACKNEY WICK,12,19,9,36
2,522N20MZ30,2023-01-30 21:19:00,22214000,EK01,WD,Y,EMU,378.0,XQHM,X8,...,51.54343,-0.02447,HACKNEY WICK,51.54343,-0.02447,HACKNEY WICK,1,30,20,17
3,522N02MX24,2018-11-24 19:32:00,22214000,EK01,SA,Y,EMU,375.0,TEKA,TG,...,51.54343,-0.02447,HACKNEY WICK,51.54343,-0.02447,HACKNEY WICK,11,24,18,33
4,522Y08MX24,2018-11-24 19:55:00,22214000,EK01,SA,Y,EMU,375.0,TEKA,TG,...,51.54343,-0.02447,HACKNEY WICK,51.54343,-0.02447,HACKNEY WICK,11,24,18,53


In [7]:
# Split the planned destination timestamp into month / day / hour / minute
# features and drop the raw timestamp column.
#
# The timestamp is parsed into a local Series instead of being written back
# with `df.loc[:, col] = ...`: under pandas >= 2.0 a full-column `.loc`
# assignment sets values in place and can leave the column with object dtype
# (if it arrived as strings), which makes the `.dt` accessor raise.
# `assign`/`drop` return a new frame, so no `inplace=True` mutation is needed.
dest_planned = pd.to_datetime(df['PLANNED_DEST_WTT_DATETIME_AFF'])
df = df.assign(
    DEST_MONTH=dest_planned.dt.month,
    DEST_DAY=dest_planned.dt.day,
    DEST_HOUR=dest_planned.dt.hour,
    DEST_MINUTE=dest_planned.dt.minute,
).drop(columns='PLANNED_DEST_WTT_DATETIME_AFF')

In [8]:
# Check that the DEST_* columns were appended and the raw destination timestamp is gone.
df.head()

Unnamed: 0,TRUST_TRAIN_ID_AFFECTED,TRAIN_SERVICE_CODE_AFFECTED,SERVICE_GROUP_CODE_AFFECTED,ENGLISH_DAY_TYPE,APP_TIMETABLE_FLAG_AFF,TRACTION_TYPE_AFFECTED,UNIT_CLASS_AFFECTED,RESPONSIBLE_MANAGER,INCIDENT_REASON,REACTIONARY_REASON_CODE,...,Lon_DES,STATION_DES,ORIG_MONTH,ORIG_DAY,ORIG_HOUR,ORIG_MINUTE,DEST_MONTH,DEST_DAY,DEST_HOUR,DEST_MINUTE
0,522N48MJ08,22214000,EK01,SA,Y,EMU,375.0,MEKJ,MD,YI,...,-0.02447,HACKNEY WICK,12,8,11,45,12,8,12,47
1,522Y72MF19,22214000,EK01,WD,Y,EMU,378.0,MEKJ,MD,YI,...,-0.02447,HACKNEY WICK,12,19,9,36,12,19,10,39
2,522N20MZ30,22214000,EK01,WD,Y,EMU,378.0,XQHM,X8,YI,...,-0.02447,HACKNEY WICK,1,30,20,17,1,30,21,19
3,522N02MX24,22214000,EK01,SA,Y,EMU,375.0,TEKA,TG,YI,...,-0.02447,HACKNEY WICK,11,24,18,33,11,24,19,32
4,522Y08MX24,22214000,EK01,SA,Y,EMU,375.0,TEKA,TG,YI,...,-0.02447,HACKNEY WICK,11,24,18,53,11,24,19,55


In [12]:
# NOTE(review): this repeats the previous preview and no change to `df` is
# visible in between — looks like a stale re-run; consider removing the cell.
df.head()

Unnamed: 0,TRUST_TRAIN_ID_AFFECTED,TRAIN_SERVICE_CODE_AFFECTED,SERVICE_GROUP_CODE_AFFECTED,ENGLISH_DAY_TYPE,APP_TIMETABLE_FLAG_AFF,TRACTION_TYPE_AFFECTED,UNIT_CLASS_AFFECTED,RESPONSIBLE_MANAGER,INCIDENT_REASON,REACTIONARY_REASON_CODE,...,Lon_DES,STATION_DES,ORIG_MONTH,ORIG_DAY,ORIG_HOUR,ORIG_MINUTE,DEST_MONTH,DEST_DAY,DEST_HOUR,DEST_MINUTE
0,522N48MJ08,22214000,EK01,SA,Y,EMU,375.0,MEKJ,MD,YI,...,-0.02447,HACKNEY WICK,12,8,11,45,12,8,12,47
1,522Y72MF19,22214000,EK01,WD,Y,EMU,378.0,MEKJ,MD,YI,...,-0.02447,HACKNEY WICK,12,19,9,36,12,19,10,39
2,522N20MZ30,22214000,EK01,WD,Y,EMU,378.0,XQHM,X8,YI,...,-0.02447,HACKNEY WICK,1,30,20,17,1,30,21,19
3,522N02MX24,22214000,EK01,SA,Y,EMU,375.0,TEKA,TG,YI,...,-0.02447,HACKNEY WICK,11,24,18,33,11,24,19,32
4,522Y08MX24,22214000,EK01,SA,Y,EMU,375.0,TEKA,TG,YI,...,-0.02447,HACKNEY WICK,11,24,18,53,11,24,19,55


In [22]:
# List the distinct values of TRAIN_SERVICE_CODE_AFFECTED present in the data.
df['TRAIN_SERVICE_CODE_AFFECTED'].unique()

array([22214000, 22204000, 21921000, 22216000, 25234001, 21234001,
       21252001, 21235001, 25235001, 22206000, 22215003, 22218000,
       21237001, 22215002])

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor

In [15]:
# Target is the PFPI_MINUTES column; every remaining column is kept as a predictor.
y = df['PFPI_MINUTES']
X = df.drop(columns='PFPI_MINUTES')

In [16]:
# Hold out 20% of the rows for testing. The split is seeded so that
# Restart & Run All reproduces the same train/test partition (and therefore
# comparable metrics) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [18]:
# Preview the first five rows of the training predictors after the split.
X_train.head()

Unnamed: 0,TRUST_TRAIN_ID_AFFECTED,TRAIN_SERVICE_CODE_AFFECTED,SERVICE_GROUP_CODE_AFFECTED,ENGLISH_DAY_TYPE,APP_TIMETABLE_FLAG_AFF,TRACTION_TYPE_AFFECTED,UNIT_CLASS_AFFECTED,RESPONSIBLE_MANAGER,INCIDENT_REASON,REACTIONARY_REASON_CODE,...,Lon_DES,STATION_DES,ORIG_MONTH,ORIG_DAY,ORIG_HOUR,ORIG_MINUTE,DEST_MONTH,DEST_DAY,DEST_HOUR,DEST_MINUTE
165480,529E11MC09,22218000,EK99,WD,Y,EMU,375.0,ZQBG,ZS,YD,...,-0.05975,,4,9,8,29,4,9,8,50
48643,879D46MT15,22215003,EK03,WD,Y,EMU,378.0,IQM1,IR,,...,-0.04083,NEW CROSS GATE,6,15,16,46,6,15,17,36
439286,872N97MS17,22214000,EK01,WD,Y,EMU,375.0,QQHP,QN,YB,...,-0.24454,,10,17,16,9,10,17,17,11
158268,879B12MC22,22218000,EK99,WD,Y,EMU,375.0,SETG,RB,YD,...,-0.05975,,3,22,8,17,3,22,8,59
235038,722J25M707,21921000,EK01,WD,Y,EMU,710.0,IQH2,JS,,...,0.04445,Woodgrange Park Rail Station,7,10,21,51,7,10,22,26


In [19]:
# SGDRegressor does not accept loss='mean_absolute_error'; its valid losses are
# 'squared_error', 'huber', 'epsilon_insensitive' and
# 'squared_epsilon_insensitive', and an unknown name is rejected by parameter
# validation. The epsilon-insensitive loss with epsilon=0 is the plain absolute
# error |y - y_hat|, which is what was intended here.
# random_state is fixed because SGD shuffles the training data each epoch.
model = SGDRegressor(
    max_iter=1000,
    loss='epsilon_insensitive',
    epsilon=0.0,
    random_state=42,
)

In [None]:
# Apply the project's preprocessing step to the training predictors.
# NOTE(review): `preprocess` is not defined in this notebook — presumably it
# comes from the star imports of `data` / `params`; confirm and import it by name.
X_train_processed = preprocess(X_train)