# Features preprocessing

## 1. MinMax scale

In [1]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from datetime import datetime
import pygeohash as pgh

In [2]:
pwd

'/home/catherine/code/elsebasmar/london-bss/londonbss/notebooks/data_collection'

In [3]:
data = pd.read_csv("../../../raw_data/final_features_df.csv")

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,timestamp,temperature,precipitation,rainfall,snowfall,cloudcover,wind_speed,wind_direction,date,...,event_latitude,event_longitude,London_zone_Central,London_zone_North,London_zone_South_West,London_zone_West,elisabeth_line,lockdown,school_holidays,strike
0,0,2022-06-21 00:00:00,12.5,0.0,0.0,0.0,11,8.8,348,2022-06-21,...,,,,,,,True,False,,tube
1,1,2022-06-21 00:00:00,12.5,0.0,0.0,0.0,11,8.8,348,2022-06-21,...,,,,,,,True,False,,train
2,2,2022-06-21 01:00:00,11.5,0.0,0.0,0.0,0,7.6,355,2022-06-21,...,,,,,,,True,False,,tube
3,3,2022-06-21 01:00:00,11.5,0.0,0.0,0.0,0,7.6,355,2022-06-21,...,,,,,,,True,False,,train
4,4,2022-06-21 02:00:00,10.6,0.0,0.0,0.0,1,8.0,333,2022-06-21,...,,,,,,,True,False,,tube


In [5]:
scaler = MinMaxScaler()
data["temperature"] = scaler.fit_transform(data[['temperature']])
data["rainfall"] = scaler.fit_transform(data[['rainfall']])
data["snowfall"] = scaler.fit_transform(data[['snowfall']])
data["cloudcover"] = scaler.fit_transform(data[['cloudcover']])
data["wind_speed"] = scaler.fit_transform(data[['wind_speed']])
data["wind_direction"] = scaler.fit_transform(data[['wind_direction']])

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,timestamp,temperature,precipitation,rainfall,snowfall,cloudcover,wind_speed,wind_direction,date,...,event_latitude,event_longitude,London_zone_Central,London_zone_North,London_zone_South_West,London_zone_West,elisabeth_line,lockdown,school_holidays,strike
0,0,2022-06-21 00:00:00,0.202703,0.0,0.0,0.0,0.11,0.310484,0.966574,2022-06-21,...,,,,,,,True,False,,tube
1,1,2022-06-21 00:00:00,0.202703,0.0,0.0,0.0,0.11,0.310484,0.966574,2022-06-21,...,,,,,,,True,False,,train
2,2,2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.0,0.262097,0.986072,2022-06-21,...,,,,,,,True,False,,tube
3,3,2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.0,0.262097,0.986072,2022-06-21,...,,,,,,,True,False,,train
4,4,2022-06-21 02:00:00,0.138514,0.0,0.0,0.0,0.01,0.278226,0.924791,2022-06-21,...,,,,,,,True,False,,tube


In [7]:
scaled_data = data

## 2. Add dates details columns

In [8]:
scaled_data["year"] = scaled_data["timestamp"].str[:4]
scaled_data["month"] = scaled_data["timestamp"].str[6:7]
scaled_data["day"] = scaled_data["timestamp"].str[9:10]
scaled_data["hour"] = scaled_data["timestamp"].str[12:13]
scaled_data["minute"] = scaled_data["timestamp"].str[15:16]
scaled_data["second"] = scaled_data["timestamp"].str[18:19]

In [9]:
scaled_data.head()

Unnamed: 0.1,Unnamed: 0,timestamp,temperature,precipitation,rainfall,snowfall,cloudcover,wind_speed,wind_direction,date,...,elisabeth_line,lockdown,school_holidays,strike,year,month,day,hour,minute,second
0,0,2022-06-21 00:00:00,0.202703,0.0,0.0,0.0,0.11,0.310484,0.966574,2022-06-21,...,True,False,,tube,2022,6,1,0,0,0
1,1,2022-06-21 00:00:00,0.202703,0.0,0.0,0.0,0.11,0.310484,0.966574,2022-06-21,...,True,False,,train,2022,6,1,0,0,0
2,2,2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.0,0.262097,0.986072,2022-06-21,...,True,False,,tube,2022,6,1,1,0,0
3,3,2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.0,0.262097,0.986072,2022-06-21,...,True,False,,train,2022,6,1,1,0,0
4,4,2022-06-21 02:00:00,0.138514,0.0,0.0,0.0,0.01,0.278226,0.924791,2022-06-21,...,True,False,,tube,2022,6,1,2,0,0


In [10]:
scaled_data.drop(columns="Unnamed: 0", inplace=True)

In [11]:
scaled_data["timestamp"] = pd.to_datetime(scaled_data["timestamp"])

In [12]:
scaled_data.dtypes

timestamp                 datetime64[ns]
temperature                      float64
precipitation                    float64
rainfall                         float64
snowfall                         float64
cloudcover                       float64
wind_speed                       float64
wind_direction                   float64
date                              object
sunrise_datetime                  object
sunset_datetime                   object
encoding                          object
event_title                       object
event_location                    object
event_latitude                    object
event_longitude                   object
London_zone_Central              float64
London_zone_North                float64
London_zone_South_West           float64
London_zone_West                 float64
elisabeth_line                      bool
lockdown                            bool
school_holidays                   object
strike                            object
year            

In [13]:
scaled_data["weekday"] = scaled_data["timestamp"].dt.weekday

In [14]:
scaled_data.set_index("timestamp", inplace=True)

In [15]:
scaled_data.head()

Unnamed: 0_level_0,temperature,precipitation,rainfall,snowfall,cloudcover,wind_speed,wind_direction,date,sunrise_datetime,sunset_datetime,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.0,0.11,0.310484,0.966574,2022-06-21,2022-06-21 04:43:00,2022-06-21 21:21:00,...,False,,tube,2022,6,1,0,0,0,1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.0,0.11,0.310484,0.966574,2022-06-21,2022-06-21 04:43:00,2022-06-21 21:21:00,...,False,,train,2022,6,1,0,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.0,0.262097,0.986072,2022-06-21,2022-06-21 04:43:00,2022-06-21 21:21:00,...,False,,tube,2022,6,1,1,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.0,0.262097,0.986072,2022-06-21,2022-06-21 04:43:00,2022-06-21 21:21:00,...,False,,train,2022,6,1,1,0,0,1
2022-06-21 02:00:00,0.138514,0.0,0.0,0.0,0.01,0.278226,0.924791,2022-06-21,2022-06-21 04:43:00,2022-06-21 21:21:00,...,False,,tube,2022,6,1,2,0,0,1


## 3. Drop columns

In [16]:
droped_data = scaled_data.drop(columns=['precipitation','date','sunrise_datetime','sunset_datetime','event_location'])

In [17]:
droped_data.shape

(2376, 25)

In [18]:
droped_data.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,encoding,event_title,event_latitude,event_longitude,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,nighttime,,,,...,False,,tube,2022,6,1,0,0,0,1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,nighttime,,,,...,False,,train,2022,6,1,0,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,nighttime,,,,...,False,,tube,2022,6,1,1,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,nighttime,,,,...,False,,train,2022,6,1,1,0,0,1
2022-06-21 02:00:00,0.138514,0.0,0.0,0.01,0.278226,0.924791,nighttime,,,,...,False,,tube,2022,6,1,2,0,0,1


## 4.Day / night time encoding

In [19]:
dnte_data = droped_data.copy()
dnte_data.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,encoding,event_title,event_latitude,event_longitude,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,nighttime,,,,...,False,,tube,2022,6,1,0,0,0,1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,nighttime,,,,...,False,,train,2022,6,1,0,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,nighttime,,,,...,False,,tube,2022,6,1,1,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,nighttime,,,,...,False,,train,2022,6,1,1,0,0,1
2022-06-21 02:00:00,0.138514,0.0,0.0,0.01,0.278226,0.924791,nighttime,,,,...,False,,tube,2022,6,1,2,0,0,1


In [20]:
dnte_data.rename(columns={"encoding": "daytime"}, inplace=True)

In [21]:
dnte_data.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,daytime,event_title,event_latitude,event_longitude,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,nighttime,,,,...,False,,tube,2022,6,1,0,0,0,1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,nighttime,,,,...,False,,train,2022,6,1,0,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,nighttime,,,,...,False,,tube,2022,6,1,1,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,nighttime,,,,...,False,,train,2022,6,1,1,0,0,1
2022-06-21 02:00:00,0.138514,0.0,0.0,0.01,0.278226,0.924791,nighttime,,,,...,False,,tube,2022,6,1,2,0,0,1


In [22]:
dnte_data["daytime"].value_counts()

daytime
daytime      1484
nighttime     892
Name: count, dtype: int64

In [23]:
dnte_data["daytime"] = dnte_data["daytime"].replace("daytime", "1")
dnte_data["daytime"] = dnte_data["daytime"].replace("nighttime", "0")

In [24]:
dnte_data["daytime"].astype("int")

timestamp
2022-06-21 00:00:00    0
2022-06-21 00:00:00    0
2022-06-21 01:00:00    0
2022-06-21 01:00:00    0
2022-06-21 02:00:00    0
                      ..
2022-09-23 19:00:00    0
2022-09-23 20:00:00    0
2022-09-23 21:00:00    0
2022-09-23 22:00:00    0
2022-09-23 23:00:00    0
Name: daytime, Length: 2376, dtype: int64

In [25]:
dnte_data.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,daytime,event_title,event_latitude,event_longitude,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,0,,,,...,False,,tube,2022,6,1,0,0,0,1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,0,,,,...,False,,train,2022,6,1,0,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,0,,,,...,False,,tube,2022,6,1,1,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,0,,,,...,False,,train,2022,6,1,1,0,0,1
2022-06-21 02:00:00,0.138514,0.0,0.0,0.01,0.278226,0.924791,0,,,,...,False,,tube,2022,6,1,2,0,0,1


## 5. Event encoding

In [26]:
event_encoding = dnte_data.copy()

In [27]:
dnte_data.columns

Index(['temperature', 'rainfall', 'snowfall', 'cloudcover', 'wind_speed',
       'wind_direction', 'daytime', 'event_title', 'event_latitude',
       'event_longitude', 'London_zone_Central', 'London_zone_North',
       'London_zone_South_West', 'London_zone_West', 'elisabeth_line',
       'lockdown', 'school_holidays', 'strike', 'year', 'month', 'day', 'hour',
       'minute', 'second', 'weekday'],
      dtype='object')

In [28]:
event_encoding['London_zone_Central'].value_counts()

London_zone_Central
0.0    528
1.0     72
Name: count, dtype: int64

In [29]:
event_encoding['event_title'] = event_encoding['event_title'].fillna(0)

In [30]:
event_encoding['event_title'].value_counts()

event_title
0                                                           1776
Wimbledon Tennis Championships                               336
Taste of London                                              120
Notting Hill Carnival                                         48
Pride in London                                               24
UEFA Women Euro Final                                         24
Summer bank holiday                                           24
Bank Holiday for the State Funeral of Queen Elizabeth II      24
Name: count, dtype: int64

In [31]:
def encoding_strings(text):
    if text == 0:
        pass
    else:
        text = 1
    return text

In [32]:
event_encoding['event_title'] = event_encoding['event_title'].apply(encoding_strings)
event_encoding['event_title'].value_counts()

event_title
0    1776
1     600
Name: count, dtype: int64

In [33]:
event_encoding['event_latitude'].value_counts()

event_latitude
51.43542262    336
51.53142398    120
London-wide     72
51.5132348      48
51.55616476     24
Name: count, dtype: int64

In [34]:
event_encoding['event_latitude'].isna

<bound method Series.isna of timestamp
2022-06-21 00:00:00    NaN
2022-06-21 00:00:00    NaN
2022-06-21 01:00:00    NaN
2022-06-21 01:00:00    NaN
2022-06-21 02:00:00    NaN
                      ... 
2022-09-23 19:00:00    NaN
2022-09-23 20:00:00    NaN
2022-09-23 21:00:00    NaN
2022-09-23 22:00:00    NaN
2022-09-23 23:00:00    NaN
Name: event_latitude, Length: 2376, dtype: object>

In [35]:
def make_floats(x):
    print(x)
    if x == "London-wide":
        x = "London-wide"
    elif type(x) == str:
        x = float(x)
    else:
        x = 0
    return x

In [36]:
event_encoding['event_latitude'] = event_encoding['event_latitude'].apply(make_floats)
event_encoding['event_longitude'] = event_encoding['event_longitude'].apply(make_floats)

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.43542262
51.4

In [37]:
event_encoding['event_latitude'].isna

<bound method Series.isna of timestamp
2022-06-21 00:00:00    0
2022-06-21 00:00:00    0
2022-06-21 01:00:00    0
2022-06-21 01:00:00    0
2022-06-21 02:00:00    0
                      ..
2022-09-23 19:00:00    0
2022-09-23 20:00:00    0
2022-09-23 21:00:00    0
2022-09-23 22:00:00    0
2022-09-23 23:00:00    0
Name: event_latitude, Length: 2376, dtype: object>

In [38]:
event_encoding.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,daytime,event_title,event_latitude,event_longitude,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,0,0,0,0,...,False,,tube,2022,6,1,0,0,0,1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,0,0,0,0,...,False,,train,2022,6,1,0,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,0,0,0,0,...,False,,tube,2022,6,1,1,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,0,0,0,0,...,False,,train,2022,6,1,1,0,0,1
2022-06-21 02:00:00,0.138514,0.0,0.0,0.01,0.278226,0.924791,0,0,0,0,...,False,,tube,2022,6,1,2,0,0,1


## 6. Bool to int encoding

In [39]:
booltoint = event_encoding.copy()

In [40]:
booltoint['elisabeth_line'] = np.where(booltoint['elisabeth_line'], '1', '0')

In [41]:
booltoint['lockdown'] = np.where(booltoint['lockdown'], '1', '0')

In [42]:
booltoint.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,daytime,event_title,event_latitude,event_longitude,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,0,0,0,0,...,0,,tube,2022,6,1,0,0,0,1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,0,0,0,0,...,0,,train,2022,6,1,0,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,0,0,0,0,...,0,,tube,2022,6,1,1,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,0,0,0,0,...,0,,train,2022,6,1,1,0,0,1
2022-06-21 02:00:00,0.138514,0.0,0.0,0.01,0.278226,0.924791,0,0,0,0,...,0,,tube,2022,6,1,2,0,0,1


## 7. Strikes and holidays encoding

In [43]:
strikes_holidays = booltoint.copy()

In [44]:
strikes_holidays['strike'] = strikes_holidays['strike'].fillna(0)

In [45]:
strikes_holidays['strike'].value_counts()

strike
0        2016
train     264
tube       96
Name: count, dtype: int64

In [46]:
strikes_holidays['strike'] = strikes_holidays['strike'].apply(encoding_strings)

In [47]:
strikes_holidays['strike'].value_counts()

strike
0    2016
1     360
Name: count, dtype: int64

In [48]:
strikes_holidays['school_holidays'] = strikes_holidays['school_holidays'].fillna(0)
strikes_holidays['school_holidays'] = strikes_holidays['school_holidays'].apply(encoding_strings)

In [49]:
strikes_holidays['school_holidays'].value_counts()

school_holidays
0    1320
1    1056
Name: count, dtype: int64

In [50]:
strikes_holidays.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,daytime,event_title,event_latitude,event_longitude,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,0,0,0,0,...,0,0,1,2022,6,1,0,0,0,1
2022-06-21 00:00:00,0.202703,0.0,0.0,0.11,0.310484,0.966574,0,0,0,0,...,0,0,1,2022,6,1,0,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,0,0,0,0,...,0,0,1,2022,6,1,1,0,0,1
2022-06-21 01:00:00,0.168919,0.0,0.0,0.0,0.262097,0.986072,0,0,0,0,...,0,0,1,2022,6,1,1,0,0,1
2022-06-21 02:00:00,0.138514,0.0,0.0,0.01,0.278226,0.924791,0,0,0,0,...,0,0,1,2022,6,1,2,0,0,1


In [52]:
strikes_holidays.to_csv('../../../raw_data/final_features_preproc.csv')