# Features preprocessing

## 1. MinMax scale

In [31]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from datetime import datetime
import pygeohash as pgh

In [32]:
pwd

'/home/catherine/code/elsebasmar/london-bss/londonbss/notebooks/data_collection'

In [33]:
data = pd.read_csv("../../../raw_data/final_features_df.csv")

In [34]:
data.head()

Unnamed: 0.1,Unnamed: 0,timestamp,temperature,precipitation,rainfall,snowfall,cloudcover,wind_speed,wind_direction,date,...,London_zone_South_East,London_zone_East,London_zone_London_all,London_zone_North,London_zone_South_West,London_zone_West,elisabeth_line,lockdown,school_holidays,strike
0,0,2022-01-02 00:00:00,12.5,0.1,0.1,0.0,100,6.9,189,2022-01-02,...,,,,,,,False,False,christmas,
1,1,2022-01-02 01:00:00,12.4,0.1,0.1,0.0,100,20.2,239,2022-01-02,...,,,,,,,False,False,christmas,
2,2,2022-01-02 02:00:00,12.4,0.0,0.0,0.0,100,19.0,241,2022-01-02,...,,,,,,,False,False,christmas,
3,3,2022-01-02 03:00:00,12.4,0.0,0.0,0.0,100,17.4,248,2022-01-02,...,,,,,,,False,False,christmas,
4,4,2022-01-02 04:00:00,12.4,0.0,0.0,0.0,100,17.4,248,2022-01-02,...,,,,,,,False,False,christmas,


In [35]:
scaler = MinMaxScaler()
data["temperature"] = scaler.fit(data[['temperature']])
data["rainfall"] = scaler.fit(data[['rainfall']])
data["snowfall"] = scaler.fit(data[['snowfall']])
data["cloudcover"] = scaler.fit(data[['cloudcover']])
data["wind_speed"] = scaler.fit(data[['wind_speed']])
data["wind_direction"] = scaler.fit(data[['wind_direction']])

In [36]:
data.head()

Unnamed: 0.1,Unnamed: 0,timestamp,temperature,precipitation,rainfall,snowfall,cloudcover,wind_speed,wind_direction,date,...,London_zone_South_East,London_zone_East,London_zone_London_all,London_zone_North,London_zone_South_West,London_zone_West,elisabeth_line,lockdown,school_holidays,strike
0,0,2022-01-02 00:00:00,MinMaxScaler(),0.1,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,...,,,,,,,False,False,christmas,
1,1,2022-01-02 01:00:00,MinMaxScaler(),0.1,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,...,,,,,,,False,False,christmas,
2,2,2022-01-02 02:00:00,MinMaxScaler(),0.0,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,...,,,,,,,False,False,christmas,
3,3,2022-01-02 03:00:00,MinMaxScaler(),0.0,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,...,,,,,,,False,False,christmas,
4,4,2022-01-02 04:00:00,MinMaxScaler(),0.0,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,...,,,,,,,False,False,christmas,


In [37]:
scaled_data = data

## 2. Add dates details columns

In [38]:
scaled_data["year"] = scaled_data["timestamp"].str[:4]
scaled_data["month"] = scaled_data["timestamp"].str[6:7]
scaled_data["day"] = scaled_data["timestamp"].str[9:10]
scaled_data["hour"] = scaled_data["timestamp"].str[12:13]
scaled_data["minute"] = scaled_data["timestamp"].str[15:16]
scaled_data["second"] = scaled_data["timestamp"].str[18:19]

In [39]:
scaled_data.head()

Unnamed: 0.1,Unnamed: 0,timestamp,temperature,precipitation,rainfall,snowfall,cloudcover,wind_speed,wind_direction,date,...,elisabeth_line,lockdown,school_holidays,strike,year,month,day,hour,minute,second
0,0,2022-01-02 00:00:00,MinMaxScaler(),0.1,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,...,False,False,christmas,,2022,1,2,0,0,0
1,1,2022-01-02 01:00:00,MinMaxScaler(),0.1,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,...,False,False,christmas,,2022,1,2,1,0,0
2,2,2022-01-02 02:00:00,MinMaxScaler(),0.0,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,...,False,False,christmas,,2022,1,2,2,0,0
3,3,2022-01-02 03:00:00,MinMaxScaler(),0.0,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,...,False,False,christmas,,2022,1,2,3,0,0
4,4,2022-01-02 04:00:00,MinMaxScaler(),0.0,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,...,False,False,christmas,,2022,1,2,4,0,0


In [40]:
scaled_data.drop(columns="Unnamed: 0", inplace=True)

In [41]:
scaled_data["timestamp"] = pd.to_datetime(scaled_data["timestamp"])

In [42]:
scaled_data.dtypes

timestamp                 datetime64[ns]
temperature                       object
precipitation                    float64
rainfall                          object
snowfall                          object
cloudcover                        object
wind_speed                        object
wind_direction                    object
date                              object
sunrise_datetime                  object
sunset_datetime                   object
encoding                          object
event_title                       object
event_start_date                  object
event_end_date                    object
event_location                    object
event_latitude                    object
event_longitude                   object
London_zone_Central              float64
London_zone_South_East           float64
London_zone_East                 float64
London_zone_London_all           float64
London_zone_North                float64
London_zone_South_West           float64
London_zone_West

In [43]:
scaled_data["weekday"] = scaled_data["timestamp"].dt.weekday

In [44]:
scaled_data.set_index("timestamp", inplace=True)

In [45]:
scaled_data.head()

Unnamed: 0_level_0,temperature,precipitation,rainfall,snowfall,cloudcover,wind_speed,wind_direction,date,sunrise_datetime,sunset_datetime,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-02 00:00:00,MinMaxScaler(),0.1,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,2022-01-02 09:06:00,2022-01-02 17:03:00,...,False,christmas,,2022,1,2,0,0,0,6
2022-01-02 01:00:00,MinMaxScaler(),0.1,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,2022-01-02 09:06:00,2022-01-02 17:03:00,...,False,christmas,,2022,1,2,1,0,0,6
2022-01-02 02:00:00,MinMaxScaler(),0.0,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,2022-01-02 09:06:00,2022-01-02 17:03:00,...,False,christmas,,2022,1,2,2,0,0,6
2022-01-02 03:00:00,MinMaxScaler(),0.0,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,2022-01-02 09:06:00,2022-01-02 17:03:00,...,False,christmas,,2022,1,2,3,0,0,6
2022-01-02 04:00:00,MinMaxScaler(),0.0,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),2022-01-02,2022-01-02 09:06:00,2022-01-02 17:03:00,...,False,christmas,,2022,1,2,4,0,0,6


## 3. Drop columns

In [46]:
droped_data = scaled_data.drop(columns=['precipitation','date','sunrise_datetime','sunset_datetime','event_start_date','event_end_date','event_start_date','event_location'])

In [47]:
droped_data.shape

(9192, 28)

In [48]:
droped_data.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,encoding,event_title,event_latitude,event_longitude,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-02 00:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,0,0,0,6
2022-01-02 01:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,1,0,0,6
2022-01-02 02:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,2,0,0,6
2022-01-02 03:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,3,0,0,6
2022-01-02 04:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,4,0,0,6


## 4.Day / night time encoding

In [49]:
dnte_data = droped_data.copy()
dnte_data.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,encoding,event_title,event_latitude,event_longitude,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-02 00:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,0,0,0,6
2022-01-02 01:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,1,0,0,6
2022-01-02 02:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,2,0,0,6
2022-01-02 03:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,3,0,0,6
2022-01-02 04:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,4,0,0,6


In [50]:
dnte_data.rename(columns={"encoding": "daytime"}, inplace=True)

In [51]:
dnte_data.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,daytime,event_title,event_latitude,event_longitude,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-02 00:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,0,0,0,6
2022-01-02 01:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,1,0,0,6
2022-01-02 02:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,2,0,0,6
2022-01-02 03:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,3,0,0,6
2022-01-02 04:00:00,MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),MinMaxScaler(),nighttime,,,,...,False,christmas,,2022,1,2,4,0,0,6


In [52]:
dnte_data["daytime"].value_counts()

daytime
daytime      4698
nighttime    4494
Name: count, dtype: int64

In [53]:
dnte_data["daytime"].value_counts()

daytime
daytime      4698
nighttime    4494
Name: count, dtype: int64

In [54]:
dnte_data["daytime"] = dnte_data["daytime"].replace("daytime", "1")
dnte_data["daytime"] = dnte_data["daytime"].replace("nighttime", "0")

In [55]:
dnte_data["daytime"].astype("int")

timestamp
2022-01-02 00:00:00    0
2022-01-02 01:00:00    0
2022-01-02 02:00:00    0
2022-01-02 03:00:00    0
2022-01-02 04:00:00    0
                      ..
2023-01-01 21:00:00    0
2023-01-01 22:00:00    0
2023-01-01 22:00:00    0
2023-01-01 23:00:00    0
2023-01-01 23:00:00    0
Name: daytime, Length: 9192, dtype: int64

In [56]:
dnte_data["daytime"].value_counts()

daytime
1    4698
0    4494
Name: count, dtype: int64

In [25]:
dnte_data.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,daytime,event_title,event_latitude,event_longitude,...,lockdown,school_holidays,strike,year,month,day,hour,minute,second,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-02 00:00:00,0.429952,0.010204,0.0,1.0,0.108333,0.523677,0,,,,...,False,christmas,,2022,1,2,0,0,0,6
2022-01-02 01:00:00,0.427536,0.010204,0.0,1.0,0.33,0.662953,0,,,,...,False,christmas,,2022,1,2,1,0,0,6
2022-01-02 02:00:00,0.427536,0.0,0.0,1.0,0.31,0.668524,0,,,,...,False,christmas,,2022,1,2,2,0,0,6
2022-01-02 03:00:00,0.427536,0.0,0.0,1.0,0.283333,0.688022,0,,,,...,False,christmas,,2022,1,2,3,0,0,6
2022-01-02 04:00:00,0.427536,0.0,0.0,1.0,0.283333,0.688022,0,,,,...,False,christmas,,2022,1,2,4,0,0,6


## 5. Event encoding

In [26]:
event_encoding = dnte_data.copy()

In [27]:
event_encoding.columns

Index(['temperature', 'rainfall', 'snowfall', 'cloudcover', 'wind_speed',
       'wind_direction', 'daytime', 'event_title', 'event_latitude',
       'event_longitude', 'London_zone_Central', 'London_zone_South_East',
       'London_zone_East', 'London_zone_London_all', 'London_zone_North',
       'London_zone_South_West', 'London_zone_West', 'elisabeth_line',
       'lockdown', 'school_holidays', 'strike', 'year', 'month', 'day', 'hour',
       'minute', 'second', 'weekday'],
      dtype='object')

In [28]:
#event_encoding['event_title'] = event_encoding['event_title'].fillna(0)

In [29]:
event_encoding['event_title'].value_counts()

event_title
Winter Wonderland                                           1104
Wimbledon Tennis Championships                               336
London Film Festival                                         288
Frieze Art Fair                                              120
Chelsea Flower Show                                          120
Taste of London                                              120
New Year’s Day                                                48
Notting Hill Carnival                                         48
Trooping the Colour                                           24
Platinum Jubilee bank holiday                                 24
Christmas Day                                                 24
Boxing Day                                                    24
World Cup 2022 1/4 final                                      24
Easter Monday                                                 24
Guy Fawkes Night (Bonfire Night)                              24
Halloween in 

In [30]:
ohe = OneHotEncoder(sparse = False) 
ohe.fit(event_encoding[['event_title']]) 
event_encoding[ohe.get_feature_names_out()] = ohe.transform(event_encoding[['event_title']])
event_encoding.drop(columns = ["event_title"], inplace = True)



In [31]:
event_encoding.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,daytime,event_latitude,event_longitude,London_zone_Central,...,event_title_Spring bank holiday,event_title_Summer bank holiday,event_title_Survival Sunday (Premier League),event_title_Taste of London,event_title_Trooping the Colour,event_title_UEFA Women Euro Final,event_title_Wimbledon Tennis Championships,event_title_Winter Wonderland,event_title_World Cup 2022 1/4 final,event_title_nan
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-02 00:00:00,0.429952,0.010204,0.0,1.0,0.108333,0.523677,0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 01:00:00,0.427536,0.010204,0.0,1.0,0.33,0.662953,0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 02:00:00,0.427536,0.0,0.0,1.0,0.31,0.668524,0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 03:00:00,0.427536,0.0,0.0,1.0,0.283333,0.688022,0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 04:00:00,0.427536,0.0,0.0,1.0,0.283333,0.688022,0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [32]:
def encoding_strings(text):
    if text == 0:
        pass
    else:
        text = 1
    return text

In [33]:
# event_encoding['event_title'] = event_encoding['event_title'].apply(encoding_strings)
# event_encoding['event_title'].value_counts()

In [34]:
event_encoding.shape

(9192, 56)

In [35]:
event_encoding = event_encoding.groupby('timestamp').sum()

In [36]:
event_encoding.shape

(8760, 56)

In [37]:
event_encoding.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,daytime,event_latitude,event_longitude,London_zone_Central,...,event_title_Spring bank holiday,event_title_Summer bank holiday,event_title_Survival Sunday (Premier League),event_title_Taste of London,event_title_Trooping the Colour,event_title_UEFA Women Euro Final,event_title_Wimbledon Tennis Championships,event_title_Winter Wonderland,event_title_World Cup 2022 1/4 final,event_title_nan
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-02 00:00:00,0.429952,0.010204,0.0,1.0,0.108333,0.523677,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 01:00:00,0.427536,0.010204,0.0,1.0,0.33,0.662953,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 02:00:00,0.427536,0.0,0.0,1.0,0.31,0.668524,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 03:00:00,0.427536,0.0,0.0,1.0,0.283333,0.688022,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 04:00:00,0.427536,0.0,0.0,1.0,0.283333,0.688022,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## 6. Bool to int encoding

In [38]:
booltoint = event_encoding.copy()

In [39]:
booltoint['elisabeth_line'] = np.where(booltoint['elisabeth_line'], '1', '0')

In [40]:
booltoint['lockdown'] = np.where(booltoint['lockdown'], '1', '0')

In [41]:
booltoint.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,daytime,event_latitude,event_longitude,London_zone_Central,...,event_title_Spring bank holiday,event_title_Summer bank holiday,event_title_Survival Sunday (Premier League),event_title_Taste of London,event_title_Trooping the Colour,event_title_UEFA Women Euro Final,event_title_Wimbledon Tennis Championships,event_title_Winter Wonderland,event_title_World Cup 2022 1/4 final,event_title_nan
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-02 00:00:00,0.429952,0.010204,0.0,1.0,0.108333,0.523677,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 01:00:00,0.427536,0.010204,0.0,1.0,0.33,0.662953,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 02:00:00,0.427536,0.0,0.0,1.0,0.31,0.668524,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 03:00:00,0.427536,0.0,0.0,1.0,0.283333,0.688022,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 04:00:00,0.427536,0.0,0.0,1.0,0.283333,0.688022,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## 7. Strikes and holidays encoding

In [42]:
strikes_holidays = booltoint.copy()

In [43]:
strikes_holidays['strike'] = strikes_holidays['strike'].fillna(0)

In [44]:
strikes_holidays['strike'].value_counts()

strike
0             7752
train          744
tube           144
tubetrain       72
traintrain      48
Name: count, dtype: int64

In [45]:
strikes_holidays['strike'] = strikes_holidays['strike'].apply(encoding_strings)

In [46]:
strikes_holidays['strike'].value_counts()

strike
0    7752
1    1008
Name: count, dtype: int64

In [47]:
strikes_holidays['school_holidays'] = strikes_holidays['school_holidays'].fillna(0)
strikes_holidays['school_holidays'] = strikes_holidays['school_holidays'].apply(encoding_strings)

In [48]:
strikes_holidays['school_holidays'].value_counts()

school_holidays
0    6288
1    2472
Name: count, dtype: int64

In [49]:
strikes_holidays.head()

Unnamed: 0_level_0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,daytime,event_latitude,event_longitude,London_zone_Central,...,event_title_Spring bank holiday,event_title_Summer bank holiday,event_title_Survival Sunday (Premier League),event_title_Taste of London,event_title_Trooping the Colour,event_title_UEFA Women Euro Final,event_title_Wimbledon Tennis Championships,event_title_Winter Wonderland,event_title_World Cup 2022 1/4 final,event_title_nan
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-02 00:00:00,0.429952,0.010204,0.0,1.0,0.108333,0.523677,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 01:00:00,0.427536,0.010204,0.0,1.0,0.33,0.662953,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 02:00:00,0.427536,0.0,0.0,1.0,0.31,0.668524,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 03:00:00,0.427536,0.0,0.0,1.0,0.283333,0.688022,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2022-01-02 04:00:00,0.427536,0.0,0.0,1.0,0.283333,0.688022,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [50]:
strikes_holidays.shape

(8760, 56)

In [51]:
strikes_holidays.columns

Index(['temperature', 'rainfall', 'snowfall', 'cloudcover', 'wind_speed',
       'wind_direction', 'daytime', 'event_latitude', 'event_longitude',
       'London_zone_Central', 'London_zone_South_East', 'London_zone_East',
       'London_zone_London_all', 'London_zone_North', 'London_zone_South_West',
       'London_zone_West', 'elisabeth_line', 'lockdown', 'school_holidays',
       'strike', 'year', 'month', 'day', 'hour', 'minute', 'second', 'weekday',
       'event_title_Bank Holiday for the State Funeral of Queen Elizabeth II',
       'event_title_Boxing Day', 'event_title_Champions League Final',
       'event_title_Chelsea Flower Show', 'event_title_Christmas Day',
       'event_title_Diwali on Trafalgar Square',
       'event_title_Early May bank holiday', 'event_title_Easter Monday',
       'event_title_Frieze Art Fair', 'event_title_Good Friday',
       'event_title_Guy Fawkes Night (Bonfire Night)',
       'event_title_Halloween in London', 'event_title_London Film Festival',

In [52]:
strikes_holidays.describe()

Unnamed: 0,temperature,rainfall,snowfall,cloudcover,wind_speed,wind_direction,London_zone_Central,London_zone_South_East,London_zone_East,London_zone_London_all,...,event_title_Spring bank holiday,event_title_Summer bank holiday,event_title_Survival Sunday (Premier League),event_title_Taste of London,event_title_Trooping the Colour,event_title_UEFA Women Euro Final,event_title_Wimbledon Tennis Championships,event_title_Winter Wonderland,event_title_World Cup 2022 1/4 final,event_title_nan
count,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,...,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0
mean,0.433749,0.00802,0.000875,0.569807,0.240084,0.545353,0.216438,0.052055,0.079452,0.10411,...,0.00274,0.00274,0.00274,0.013699,0.00274,0.00274,0.038356,0.126027,0.00274,0.745205
std,0.184441,0.032277,0.025914,0.40234,0.134634,0.306444,0.462009,0.22215,0.270458,0.322865,...,0.052274,0.052274,0.052274,0.116243,0.052274,0.052274,0.192066,0.340055,0.052274,0.44817
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.311594,0.0,0.0,0.22,0.146667,0.328691,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.417874,0.0,0.0,0.56,0.211667,0.56546,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.538647,0.0,0.0,0.94,0.306667,0.710306,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.318841,1.0,1.0,2.0,1.133333,2.0,2.0,1.0,1.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0


In [53]:
strikes_holidays.drop(columns="event_latitude", inplace=True)

In [54]:
strikes_holidays.drop(columns="event_longitude", inplace=True)

In [55]:
strikes_holidays.shape

(8760, 54)

In [56]:
strikes_holidays.to_csv('../../../raw_data/final_features_preproc_12m.csv')  