# Exploratory Data Analysis

In [1]:
# Imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()
sns.set()

## Importing data using Pandas

In [2]:
df_deli = pd.read_csv('Data/1.Data Exploration/delivery.csv', index_col='timestamp')
df_cons = pd.read_csv('Data/1.Data Exploration/consumption.csv', index_col='timestamp')
df_info = pd.read_csv('Data/1.Data Exploration/houses_info.csv', index_col='ID-nummer')

display(df_deli.head())
display(df_cons.head())
display(df_info.head())

Unnamed: 0_level_0,H01,H02,H03,H04,H05,H06,H07,H08,H09,H10,...,H24,H25,H26,H27,H28,H29,H30,H31,H32,H33
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-11 00:15:00,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-11 00:30:00,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-11 00:45:00,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-11 01:00:00,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-11 01:15:00,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,H01,H02,H03,H04,H05,H06,H07,H08,H09,H10,...,H24,H25,H26,H27,H28,H29,H30,H31,H32,H33
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-11 00:15:00,0.038,0.039,0.044,0.024,,0.039,0.025,0.006,0.294,0.036,...,0.053,0.018,0.076,0.07,0.036,0.014,0.044,0.416,0.023,0.053
2017-07-11 00:30:00,0.018,0.027,0.034,0.024,,0.051,0.017,0.007,0.165,0.025,...,0.049,0.019,0.083,0.061,0.027,0.026,0.034,0.17,0.031,0.029
2017-07-11 00:45:00,0.028,0.014,0.026,0.018,,0.076,0.018,0.006,0.107,0.03,...,0.043,0.018,0.292,0.054,0.018,0.011,0.038,0.075,0.025,0.028
2017-07-11 01:00:00,0.026,0.014,0.474,0.021,,0.08,0.016,0.006,0.105,0.036,...,0.058,0.019,0.049,0.034,0.022,0.026,0.032,0.099,0.024,0.041
2017-07-11 01:15:00,0.018,0.014,0.084,0.117,,0.438,0.023,0.006,0.103,0.015,...,0.045,0.018,0.048,0.208,0.015,0.017,0.043,0.054,0.025,0.032


Unnamed: 0_level_0,concept,PV-aantal,personen
ID-nummer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
H01,E,17,4
H02,E,14,2
H03,WP,9,4
H04,WP,11,1
H05,WP,12,4


## Data Cleaning

During the data exploration process, we found out the following problems:
1. There are houses with quite less data than the others.
2. There is missing data and outliers because of the malfunction of the smart meter.

We will handle these problems in this notebook.

### Handling outliers

First of all, we will set the outliers of our dataset to NaN values, which will be deleted later.

In order to find those outliers, we will follow some instructions:
* **Energy Delivery:** These solar panels are able to produce a maximum of 250 watts/hour (0.25 kwh) each one. Since the smart meter sums up all the energy delivery every 15 minutes, we will compute what is the maximum production for each dwelling taking into account the number of solar panels. Then, every value over the maximum will be set as an outlier.

\begin{align}
outlier > 0.25kwh * 0.25h * num\_solar\_panels
\end{align}

* **Energy Consumption:** STILL TO CHECK

\begin{align}
outlier > \frac{75A * 230V}{4000}
\end{align}

In order to check how many outliers will be dropped by the cleaning process, let's save first the number of NaN values on each dwelling.

In [3]:
deli_nan = df_deli.isna().sum()
cons_nan = df_cons.isna().sum()

#### Energy Delivery

In [4]:
for house in range(1,34):
    if house < 10:
        house_no = 'H0'+ str(house)
    else:
        house_no = 'H'+ str(house)
            
    df_deli.loc[df_deli[house_no] > (0.25 * 0.25 * df_info.loc['H01','PV-aantal']), house_no] = np.nan

#### Energy Consumption

In [5]:
for house in range(1,34):
    if house < 10:
        house_no = 'H0'+ str(house)
    else:
        house_no = 'H'+ str(house)
            
    df_cons.loc[df_cons[house_no] > (0.25 * 0.25 * df_info.loc['H01','PV-aantal']), house_no] = np.nan

The number of outliers on each dwelling are the following:

In [6]:
# Energy delivery
df_deli.isna().sum() - deli_nan

H01    6
H02    6
H03    5
H04    5
H05    4
H06    5
H07    8
H08    1
H09    6
H10    3
H11    5
H12    5
H13    5
H14    5
H15    7
H16    5
H17    4
H18    6
H19    8
H20    4
H21    6
H22    8
H23    5
H24    2
H25    6
H26    4
H27    4
H28    4
H29    9
H30    6
H31    6
H32    6
H33    6
dtype: int64

In [7]:
# Energy consumption
df_cons.isna().sum() - cons_nan

H01     957
H02     787
H03      28
H04       7
H05       8
H06     168
H07     919
H08     913
H09     891
H10    2127
H11      34
H12     341
H13     166
H14     670
H15      15
H16     114
H17      15
H18      25
H19      11
H20    2024
H21    2045
H22     944
H23       7
H24      42
H25    1700
H26      39
H27      10
H28    3894
H29     635
H30    1663
H31      12
H32    2743
H33       9
dtype: int64

Once we have set the outliers to NaN values, we will check how many outliers each dwelling has in both datasets.

In [8]:
df_deli.isna().sum() + df_cons.isna().sum()

H01     4731
H02     5107
H03     4233
H04     4308
H05    21270
H06     4317
H07     5249
H08     4172
H09     5019
H10    38790
H11    13173
H12     4300
H13     4201
H14    32290
H15     4444
H16     4367
H17     3803
H18     4317
H19     9543
H20     6218
H21     6173
H22     6234
H23     3948
H24     3464
H25     5286
H26     3625
H27    15720
H28     7508
H29     5156
H30    34963
H31     3802
H32     6533
H33     3977
dtype: int64

#### Dropping dwellings

As we can see above, there are some dweillings which have quite less data than the other ones. As we are going to delete every  record which contains at least one NaN, we will delete these dwellings from the dataset in order to not lose too much data. 

These dweillings will be the following:
* House 5
* House 10
* House 11
* House 14
* House 27
* House 30

In [9]:
df_deli = df_deli.drop(columns=['H05', 'H10', 'H11', 'H14', 'H27', 'H30'])
df_cons = df_cons.drop(columns=['H05', 'H10', 'H11', 'H14', 'H27', 'H30'])
df_info = df_info.drop(['H05', 'H10', 'H11', 'H14', 'H27', 'H30'])

In [10]:
display(df_deli.head())
display(df_cons.head())
display(df_info.head())

Unnamed: 0_level_0,H01,H02,H03,H04,H06,H07,H08,H09,H12,H13,...,H22,H23,H24,H25,H26,H28,H29,H31,H32,H33
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-11 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-11 00:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-11 00:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-11 01:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-11 01:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,H01,H02,H03,H04,H06,H07,H08,H09,H12,H13,...,H22,H23,H24,H25,H26,H28,H29,H31,H32,H33
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-11 00:15:00,0.038,0.039,0.044,0.024,0.039,0.025,0.006,0.294,0.137,0.058,...,0.024,0.115,0.053,0.018,0.076,0.036,0.014,0.416,0.023,0.053
2017-07-11 00:30:00,0.018,0.027,0.034,0.024,0.051,0.017,0.007,0.165,0.402,0.06,...,0.013,0.101,0.049,0.019,0.083,0.027,0.026,0.17,0.031,0.029
2017-07-11 00:45:00,0.028,0.014,0.026,0.018,0.076,0.018,0.006,0.107,0.019,0.363,...,0.025,0.108,0.043,0.018,0.292,0.018,0.011,0.075,0.025,0.028
2017-07-11 01:00:00,0.026,0.014,0.474,0.021,0.08,0.016,0.006,0.105,0.024,0.049,...,0.024,0.09,0.058,0.019,0.049,0.022,0.026,0.099,0.024,0.041
2017-07-11 01:15:00,0.018,0.014,0.084,0.117,0.438,0.023,0.006,0.103,0.013,0.198,...,0.014,0.246,0.045,0.018,0.048,0.015,0.017,0.054,0.025,0.032


Unnamed: 0_level_0,concept,PV-aantal,personen
ID-nummer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
H01,E,17,4
H02,E,14,2
H03,WP,9,4
H04,WP,11,1
H06,E,14,4


#### Deleting NaN values

In order to be able to compare the dwellings, we will delete those records (rows) where there is at least one NaN value.

In [11]:
display(df_deli.shape)
display(df_cons.shape)

(66240, 27)

(66240, 27)

In [12]:
df_deli.columns = df_deli.columns.map(lambda x: str(x) + '_d')
df_cons.columns = df_cons.columns.map(lambda x: str(x) + '_c')

df_full = df_deli.join(df_cons, how='outer')
df_full.dropna(inplace=True)

df_deli = df_full.loc[:, :'H33_d']
df_cons = df_full.loc[:, 'H01_c':]

df_deli.columns = df_deli.columns.map(lambda x: x[:3])
df_cons.columns = df_cons.columns.map(lambda x: x[:3])

In [13]:
display(df_deli.shape)
display(df_cons.shape)

(47709, 27)

(47709, 27)

### Exporting the datasets

Now that we have cleaned the datasets, we will export them to be used in the following steps.

In [14]:
df_deli.to_csv('Data/2.Data Cleaning/delivery.csv')
df_cons.to_csv('Data/2.Data Cleaning/consumption.csv')
df_info.to_csv('Data/2.Data Cleaning/houses_info.csv')