# CLEAN DATA

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

In [6]:
# Load the pre-cleaned dataset from the working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a row
# index written out when the CSV was saved — confirm whether it should be read
# as the index (index_col=0) instead of being kept as a regular column.
df = pd.read_csv(filepath_or_buffer='clean_data.csv')

# Preview the first five rows (5 is the default for head()).
df.head()

Unnamed: 0,TRAIN_SERVICE_CODE_AFFECTED,SERVICE_GROUP_CODE_AFFECTED,ENGLISH_DAY_TYPE,APP_TIMETABLE_FLAG_AFF,UNIT_CLASS_AFFECTED,INCIDENT_REASON,PERFORMANCE_EVENT_CODE,PFPI_MINUTES,Lat_OR,Lon_OR,...,ORIG_MINUTE_SIN,ORIG_MINUTE_COS,DEST_MONTH_SIN,DEST_MONTH_COS,DEST_DAY_SIN,DEST_DAY_COS,DEST_HOUR_SIN,DEST_HOUR_COS,DEST_MINUTE_SIN,DEST_MINUTE_COS
0,22214000,EK01,SA,Y,375.0,MD,M,9.0,51.54343,-0.02447,...,-1.0,-1.83697e-16,0.951057,0.309017,0.7431448,0.669131,0.951057,0.309017,-0.978148,0.207912
1,22214000,EK01,WD,Y,378.0,MD,M,12.0,51.54343,-0.02447,...,-0.587785,-0.809017,0.951057,0.309017,0.9135455,-0.406737,0.866025,0.5,-0.809017,-0.587785
2,22214000,EK01,WD,Y,378.0,X8,M,8.0,51.54343,-0.02447,...,0.978148,-0.2079117,0.104528,0.994522,5.665539e-16,-1.0,0.809017,-0.587785,0.913545,-0.406737
3,22214000,EK01,SA,Y,375.0,TG,M,10.0,51.54343,-0.02447,...,-0.309017,-0.9510565,0.913545,0.406737,0.5877853,-0.809017,0.913545,-0.406737,-0.207912,-0.978148
4,22214000,EK01,SA,Y,375.0,TG,M,11.0,51.54343,-0.02447,...,-0.669131,0.7431448,0.913545,0.406737,0.5877853,-0.809017,0.913545,-0.406737,-0.5,0.866025


In [9]:
# Separate the regression target from the feature columns.
target_column = 'PFPI_MINUTES'

y = df[target_column]
# NOTE(review): every other column stays in X, including 'Unnamed: 0' from the
# CSV preview — confirm that column is meant to be used as a feature.
X = df.drop(columns=target_column)

In [10]:
# train_test_split returns the pieces grouped per input array, i.e.
# (X_train, X_test, y_train, y_test). Unpacking them in any other order silently
# swaps feature frames and target series, so keep this exact order.
# random_state pins the shuffle so a Restart & Run All reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, shuffle=True, random_state=42
)

# Sanity check: X_* should be 2-D (rows, features) and y_* 1-D with matching row counts.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((345275, 27), (147976, 27), (345275,), (147976,))

In [11]:
# Columns routed to each preprocessing step; list order determines output column order.
scaled_columns = ['Lat_OR', 'Lon_OR', 'Lat_DES', 'Lon_DES']
encoded_columns = [
    'ENGLISH_DAY_TYPE',
    'SERVICE_GROUP_CODE_AFFECTED',
    'INCIDENT_REASON',
    'UNIT_CLASS_AFFECTED',
    'TRAIN_SERVICE_CODE_AFFECTED',
    'PERFORMANCE_EVENT_CODE',
    'APP_TIMETABLE_FLAG_AFF',
]

# Min-max scaling for the origin/destination coordinate columns.
num_transformer = MinMaxScaler()

# One-hot encoding returned as a dense array; handle_unknown='ignore' means
# categories not seen during fit do not raise at transform time.
cat_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Columns not listed above are forwarded unchanged (remainder='passthrough').
transformer = make_column_transformer(
    (num_transformer, scaled_columns),
    (cat_transformer, encoded_columns),
    remainder='passthrough',
)

# Single-step pipeline wrapping the column transformer.
pipe = Pipeline(steps=[('transformer', transformer)])

pipe

In [12]:
# Fit the preprocessing pipeline on X_train only, so the scaler and encoder
# learn their parameters from this data.
pipe.fit(X_train)

In [13]:
# Apply the fitted pipeline to the same X_train it was fitted on and display the result.
X_processed = pipe.transform(X_train)
X_processed

array([[ 0.67484368,  0.17080839,  0.78949728, ..., -0.5       ,
         0.9945219 , -0.10452846],
       [ 0.60873068,  0.20944053,  0.60873068, ..., -0.30901699,
         0.9781476 , -0.20791169],
       [ 0.71911082,  0.27553217,  0.71911082, ...,  0.74314483,
        -0.58778525,  0.80901699],
       ...,
       [ 0.86796578,  0.45677839,  0.86796578, ..., -0.40673664,
         0.58778525,  0.80901699],
       [ 0.70115846,  0.36121527,  0.65004411, ..., -0.66913061,
        -0.10452846,  0.9945219 ],
       [ 0.67484368,  0.17080839,  0.78949728, ...,  0.20791169,
         0.95105652, -0.30901699]])