In [1]:
"""
You need to run this cell for the code in following cells to work.
"""

# Enable module reloading
%load_ext autoreload
%autoreload 2

import os
# Move from the notebook's folder up to the directory that holds `data/`,
# which the relative read_csv paths below depend on. The guard makes the cell
# safe to re-run: an unconditional chdir("..") would climb one level further
# on every execution and break those relative paths.
if not os.path.isdir("data"):
    os.chdir("..")

import pandas as pd
from src.data.pipelines import primary_use_pipeline, square_feet_pipeline, air_temperature_pipeline, \
    dew_temperature_pipeline, sea_level_pressure_pipeline, wind_speed_pipeline, wind_direction_pipeline

In [2]:
def check_mean_and_variance(df_column):
    """Print the mean and variance of a column, each rounded to two decimals.

    Parameters
    ----------
    df_column
        Object exposing ``mean()`` and ``var()`` (the notebook passes a
        single DataFrame column).

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    # A tiny negative mean rounds to IEEE-754 negative zero and would print as
    # "-0.0"; adding 0.0 turns it into a plain 0.0 without affecting other values.
    mean = round(df_column.mean(), 2) + 0.0
    var = round(df_column.var(), 2)
    print(f'mean = {mean}, variance = {var}')

# Data preprocessing

In [3]:
# Read the building metadata table and preview its first five rows.
building_metadata = pd.read_csv(filepath_or_buffer='data/building_metadata.csv')
building_metadata.head(n=5)

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


## Buildings data

### Primary use

In this preprocessing step we merge the less frequent primary-use categories into a single `Other` category and encode the values using one-hot encoding.

In [4]:
# Fit the primary-use pipeline on the building metadata, wrap the transformed
# output in a DataFrame and preview the first five rows.
primary_use_feature = pd.DataFrame(
    data=primary_use_pipeline.fit_transform(building_metadata)
)
primary_use_feature.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0


We can see that there are only 6 categories instead of the original 16, and that the values are properly encoded.

### Square feet

In this preprocessing step we scale the values to zero mean and unit variance.

In [5]:
# Fit the square-feet pipeline on the building metadata, wrap the transformed
# output in a DataFrame and preview the first five rows.
square_feet_feature = pd.DataFrame(
    data=square_feet_pipeline.fit_transform(building_metadata)
)
square_feet_feature.head(n=5)

Unnamed: 0,0
0,-0.764729
1,-0.807282
2,-0.783297
3,-0.617951
4,0.221212


In [6]:
# Report the mean and variance of the scaled square-feet column (label 0).
check_mean_and_variance(square_feet_feature.loc[:, 0])

mean = 0.0, variance = 1.0


We can see that the values are properly scaled.

## Weather data

In [7]:
# Read the training weather observations and display the resulting frame.
train_weather = pd.read_csv(filepath_or_buffer='data/weather_train.csv')
train_weather

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6
...,...,...,...,...,...,...,...,...,...
139768,15,2016-12-31 19:00:00,3.0,,-8.0,,,180.0,5.7
139769,15,2016-12-31 20:00:00,2.8,2.0,-8.9,,1007.4,180.0,7.7
139770,15,2016-12-31 21:00:00,2.8,,-7.2,,1007.5,180.0,5.1
139771,15,2016-12-31 22:00:00,2.2,,-6.7,,1008.0,170.0,4.6


In [8]:
# NOTE(review): this cell repeats the square-feet transformation from the
# "Square feet" section verbatim and does not use the weather data loaded
# above; it looks like a copy-paste leftover. Confirm and remove.
square_feet_feature = pd.DataFrame(square_feet_pipeline.fit_transform(building_metadata))
square_feet_feature.head()

Unnamed: 0,0
0,-0.764729
1,-0.807282
2,-0.783297
3,-0.617951
4,0.221212


### Air temperature

In this preprocessing step we fill in missing values using a rolling average and scale the values to zero mean and unit variance.

In [9]:
# Fit the air-temperature pipeline on the weather data, wrap the transformed
# output in a DataFrame and preview the first five rows.
air_temperature_feature = pd.DataFrame(
    data=air_temperature_pipeline.fit_transform(train_weather)
)
air_temperature_feature.head(n=5)

Unnamed: 0,0
0,0.995738
1,0.939274
2,0.788704
3,0.628723
4,0.525206


In [10]:
# Count the NaN entries left in the transformed column (label 0).
air_temperature_feature.loc[:, 0].isna().sum()

0

In [11]:
# Report the mean and variance of the transformed air-temperature column (label 0).
check_mean_and_variance(air_temperature_feature.loc[:, 0])

mean = 0.0, variance = 1.0


We can see that there are no missing values and that the values are properly scaled.

### Dew temperature

In this preprocessing step we fill in missing values using a rolling average and scale the values to zero mean and unit variance.

In [12]:
# Fit the dew-temperature pipeline on the weather data, wrap the transformed
# output in a DataFrame and preview the first five rows.
dew_temperature_feature = pd.DataFrame(
    data=dew_temperature_pipeline.fit_transform(train_weather)
)
dew_temperature_feature.head(n=5)

Unnamed: 0,0
0,1.292475
1,1.404847
2,1.404847
3,1.353769
4,1.292475


In [13]:
# Count the NaN entries left in the transformed column (label 0).
dew_temperature_feature.loc[:, 0].isna().sum()

0

In [14]:
# Report the mean and variance of the transformed dew-temperature column (label 0).
check_mean_and_variance(dew_temperature_feature.loc[:, 0])

mean = 0.0, variance = 1.0


We can see that there are no missing values and that the values are properly scaled.

### Sea level pressure

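In this preprocessing step we fill in missing values using a rolling average and scale the values to zero mean and unit variance. Note that the cells below are currently commented out, so this feature is not verified in this notebook.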

In [15]:
# sea_level_pressure_feature = pd.DataFrame(sea_level_pressure_pipeline.fit_transform(train_weather))
# sea_level_pressure_feature.head()

In [16]:
# sea_level_pressure_feature[0].isna().sum()

In [17]:
# check_mean_and_variance(sea_level_pressure_feature[0])

### Wind speed

In this preprocessing step we fill in missing values using a rolling average and scale the values to zero mean and unit variance.

In [18]:
# Fit the wind-speed pipeline on the weather data, wrap the transformed
# output in a DataFrame and preview the first five rows.
wind_speed_feature = pd.DataFrame(
    data=wind_speed_pipeline.fit_transform(train_weather)
)
wind_speed_feature.head(n=5)

Unnamed: 0,0
0,-1.524278
1,-0.881804
2,-1.524278
3,-1.524278
4,-0.410656


In [19]:
# Count the NaN entries left in the transformed column (label 0).
wind_speed_feature.loc[:, 0].isna().sum()

0

In [20]:
# Report the mean and variance of the transformed wind-speed column (label 0).
check_mean_and_variance(wind_speed_feature.loc[:, 0])

mean = -0.0, variance = 1.0


We can see that there are no missing values and that the values are properly scaled.

### Wind direction

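In this preprocessing step we fill in missing values using a rolling average and scale the values to zero mean and unit variance. Note that the cells below are currently commented out, so this feature is not verified in this notebook.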

In [21]:
# wind_direction_feature = pd.DataFrame(wind_direction_pipeline.fit_transform(train_weather))
# wind_direction_feature.head()

In [22]:
# wind_direction_feature = pd.DataFrame(wind_direction_pipeline.fit_transform(train_weather))
# wind_direction_feature.head()

In [23]:
# wind_direction_feature[0].isna().sum()

In [24]:
# check_mean_and_variance(wind_direction_feature[0])