In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm

# Look at how 'clean' each CSV file downloaded from Kaggle is

# 1) 'daily_dataset'

In [3]:
# Load the per-household daily energy summary into a DataFrame.
df_daily = pd.read_csv(filepath_or_buffer="../Data/daily_dataset.csv")

In [4]:
# Preview the first five rows to check the column layout.
df_daily.head(n=5)

Unnamed: 0,LCLid,day,energy_median,energy_mean,energy_max,energy_count,energy_std,energy_sum,energy_min
0,MAC000131,2011-12-15,0.485,0.432045,0.868,22,0.239146,9.505,0.072
1,MAC000131,2011-12-16,0.1415,0.296167,1.116,48,0.281471,14.216,0.031
2,MAC000131,2011-12-17,0.1015,0.189812,0.685,48,0.188405,9.111,0.064
3,MAC000131,2011-12-18,0.114,0.218979,0.676,48,0.202919,10.511,0.065
4,MAC000131,2011-12-19,0.191,0.325979,0.788,48,0.259205,15.647,0.066


Look at null values

In [5]:
# Count missing values per column.
df_daily.isna().sum()

LCLid                0
day                  0
energy_median       30
energy_mean         30
energy_max          30
energy_count         0
energy_std       11331
energy_sum          30
energy_min          30
dtype: int64

In [6]:
# Show the rows where 'energy_median' is missing.
df_daily.loc[df_daily["energy_median"].isna()]

Unnamed: 0,LCLid,day,energy_median,energy_mean,energy_max,energy_count,energy_std,energy_sum,energy_min
37231,MAC000410,2012-12-18,,,,0,,,
129012,MAC005560,2012-12-19,,,,0,,,
138461,MAC002110,2012-12-18,,,,0,,,
205939,MAC001065,2012-12-18,,,,0,,,
293243,MAC001229,2012-12-18,,,,0,,,
295223,MAC001278,2012-12-18,,,,0,,,
692446,MAC001478,2013-03-08,,,,0,,,
867057,MAC005558,2012-12-19,,,,0,,,
902532,MAC000393,2012-12-18,,,,0,,,
913986,MAC002014,2012-12-18,,,,0,,,


In [7]:
# Number of rows with a missing 'energy_median' (counted straight from the mask).
int(df_daily["energy_median"].isna().sum())

30

It appears all the null values for 'energy_median', 'energy_mean', 'energy_max', 'energy_sum', 'energy_min' occur in the same 30 rows. It looks like no measurements were taken on these occasions, so we will drop these rows.

(It also looks like many of those occurred on the same dates (Dec 18/19 2012), so this could be worth looking into further.)

In [8]:
# Keep only the rows that have an 'energy_median' value.
df_daily = df_daily[df_daily["energy_median"].notna()]

In [9]:
# Re-count missing values per column after dropping the empty rows.
df_daily.isna().sum()

LCLid                0
day                  0
energy_median        0
energy_mean          0
energy_max           0
energy_count         0
energy_std       11301
energy_sum           0
energy_min           0
dtype: int64

That leaves us with only the 'energy_std' column that contains null values.

In [10]:
# Show the rows where 'energy_std' is missing.
df_daily.loc[df_daily["energy_std"].isna()]

Unnamed: 0,LCLid,day,energy_median,energy_mean,energy_max,energy_count,energy_std,energy_sum,energy_min
806,MAC000131,2014-02-28,0.075,0.075,0.075,1,,0.075,0.075
1613,MAC000132,2014-02-28,0.049,0.049,0.049,1,,0.049,0.049
2434,MAC000221,2014-02-28,0.592,0.592,0.592,1,,0.592,0.592
3255,MAC000228,2014-02-28,0.039,0.039,0.039,1,,0.039,0.039
4076,MAC000234,2014-02-28,0.071,0.071,0.071,1,,0.071,0.071
...,...,...,...,...,...,...,...,...,...
3507356,MAC004926,2014-02-28,0.033,0.033,0.033,1,,0.033,0.033
3508127,MAC004932,2014-02-28,0.177,0.177,0.177,1,,0.177,0.177
3508898,MAC004937,2014-02-28,0.084,0.084,0.084,1,,0.084,0.084
3509666,MAC004965,2014-02-28,0.618,0.618,0.618,1,,0.618,0.618


It looks as if the 'energy_std' is null on these occasions because there was only 1 measurement taken, therefore there isn't a standard deviation value.

# 2) acorn_details

In [11]:
# Load the ACORN category details.
# NOTE(review): the Python parser engine is requested explicitly; presumably
# the default engine had trouble with this file — confirm before changing it.
df_acorn = pd.read_csv(filepath_or_buffer="../Data/acorn_details.csv", engine="python")

In [12]:
# Preview the first five rows.
df_acorn.head(n=5)

Unnamed: 0,MAIN CATEGORIES,CATEGORIES,REFERENCE,ACORN-A,ACORN-B,ACORN-C,ACORN-D,ACORN-E,ACORN-F,ACORN-G,ACORN-H,ACORN-I,ACORN-J,ACORN-K,ACORN-L,ACORN-M,ACORN-N,ACORN-O,ACORN-P,ACORN-Q
0,POPULATION,Age,Age 0-4,77.0,83.0,72.0,100.0,120.0,77.0,97.0,97.0,63.0,119.0,67.0,114.0,113.0,89.0,123.0,138.0,133.0
1,POPULATION,Age,Age 5-17,117.0,109.0,87.0,69.0,94.0,95.0,102.0,106.0,67.0,95.0,64.0,108.0,116.0,86.0,89.0,136.0,106.0
2,POPULATION,Age,Age 18-24,64.0,73.0,67.0,107.0,100.0,71.0,83.0,89.0,62.0,104.0,459.0,97.0,96.0,86.0,117.0,109.0,110.0
3,POPULATION,Age,Age 25-34,52.0,63.0,62.0,197.0,151.0,66.0,90.0,88.0,63.0,132.0,145.0,109.0,96.0,90.0,140.0,120.0,120.0
4,POPULATION,Age,Age 35-49,102.0,105.0,91.0,124.0,118.0,93.0,102.0,103.0,76.0,111.0,67.0,99.0,98.0,90.0,102.0,103.0,100.0


In [13]:
# Count missing values per column.
df_acorn.isna().sum()

MAIN CATEGORIES    0
CATEGORIES         0
REFERENCE          0
ACORN-A            0
ACORN-B            0
ACORN-C            0
ACORN-D            0
ACORN-E            0
ACORN-F            0
ACORN-G            0
ACORN-H            0
ACORN-I            0
ACORN-J            0
ACORN-K            0
ACORN-L            0
ACORN-M            0
ACORN-N            0
ACORN-O            0
ACORN-P            0
ACORN-Q            0
dtype: int64

No null values - WINNER!

# 3) informations_households

In [14]:
# Load the household information table.
df_household = pd.read_csv(filepath_or_buffer="../Data/informations_households.csv")

In [15]:
# Preview the first five rows.
df_household.head(n=5)

Unnamed: 0,LCLid,stdorToU,Acorn,Acorn_grouped,file
0,MAC005492,ToU,ACORN-,ACORN-,block_0
1,MAC001074,ToU,ACORN-,ACORN-,block_0
2,MAC000002,Std,ACORN-A,Affluent,block_0
3,MAC003613,Std,ACORN-A,Affluent,block_0
4,MAC003597,Std,ACORN-A,Affluent,block_0


In [16]:
# Count missing values per column.
df_household.isna().sum()

LCLid            0
stdorToU         0
Acorn            0
Acorn_grouped    0
file             0
dtype: int64

No null values - beautiful clean data!

# 4) uk_bank_holidays

In [17]:
# Load the UK bank holiday list.
df_holiday = pd.read_csv(filepath_or_buffer="../Data/uk_bank_holidays.csv")

In [18]:
# Preview the first five rows.
df_holiday.head(n=5)

Unnamed: 0,Bank holidays,Type
0,2012-12-26,Boxing Day
1,2012-12-25,Christmas Day
2,2012-08-27,Summer bank holiday
3,2012-05-06,Queen?s Diamond Jubilee (extra bank holiday)
4,2012-04-06,Spring bank holiday (substitute day)


In [19]:
# Count missing values per column.
df_holiday.isna().sum()

Bank holidays    0
Type             0
dtype: int64

No null values again...yummy!

# 5) weather_daily_darksky

In [20]:
# Load the daily weather table.
df_weather = pd.read_csv(filepath_or_buffer="../Data/weather_daily_darksky.csv")

In [21]:
# Preview the first five rows.
df_weather.head(n=5)

Unnamed: 0,temperatureMax,temperatureMaxTime,windBearing,icon,dewPoint,temperatureMinTime,cloudCover,windSpeed,pressure,apparentTemperatureMinTime,...,temperatureHigh,sunriseTime,temperatureHighTime,uvIndexTime,summary,temperatureLowTime,apparentTemperatureMin,apparentTemperatureMaxTime,apparentTemperatureLowTime,moonPhase
0,11.96,2011-11-11 23:00:00,123,fog,9.4,2011-11-11 07:00:00,0.79,3.88,1016.08,2011-11-11 07:00:00,...,10.87,2011-11-11 07:12:14,2011-11-11 19:00:00,2011-11-11 11:00:00,Foggy until afternoon.,2011-11-11 19:00:00,6.48,2011-11-11 23:00:00,2011-11-11 19:00:00,0.52
1,8.59,2011-12-11 14:00:00,198,partly-cloudy-day,4.49,2011-12-11 01:00:00,0.56,3.94,1007.71,2011-12-11 02:00:00,...,8.59,2011-12-11 07:57:02,2011-12-11 14:00:00,2011-12-11 12:00:00,Partly cloudy throughout the day.,2011-12-12 07:00:00,0.11,2011-12-11 20:00:00,2011-12-12 08:00:00,0.53
2,10.33,2011-12-27 02:00:00,225,partly-cloudy-day,5.47,2011-12-27 23:00:00,0.85,3.54,1032.76,2011-12-27 22:00:00,...,10.33,2011-12-27 08:07:06,2011-12-27 14:00:00,2011-12-27 00:00:00,Mostly cloudy throughout the day.,2011-12-27 23:00:00,5.59,2011-12-27 02:00:00,2011-12-28 00:00:00,0.1
3,8.07,2011-12-02 23:00:00,232,wind,3.69,2011-12-02 07:00:00,0.32,3.0,1012.12,2011-12-02 07:00:00,...,7.36,2011-12-02 07:46:09,2011-12-02 12:00:00,2011-12-02 10:00:00,Partly cloudy throughout the day and breezy ov...,2011-12-02 19:00:00,0.46,2011-12-02 12:00:00,2011-12-02 19:00:00,0.25
4,8.22,2011-12-24 23:00:00,252,partly-cloudy-night,2.79,2011-12-24 07:00:00,0.37,4.46,1028.17,2011-12-24 07:00:00,...,7.93,2011-12-24 08:06:15,2011-12-24 15:00:00,2011-12-24 13:00:00,Mostly cloudy throughout the day.,2011-12-24 19:00:00,-0.51,2011-12-24 23:00:00,2011-12-24 20:00:00,0.99


In [22]:
# Count missing values per column.
df_weather.isna().sum()

temperatureMax                 0
temperatureMaxTime             0
windBearing                    0
icon                           0
dewPoint                       0
temperatureMinTime             0
cloudCover                     1
windSpeed                      0
pressure                       0
apparentTemperatureMinTime     0
apparentTemperatureHigh        0
precipType                     0
visibility                     0
humidity                       0
apparentTemperatureHighTime    0
apparentTemperatureLow         0
apparentTemperatureMax         0
uvIndex                        1
time                           0
sunsetTime                     0
temperatureLow                 0
temperatureMin                 0
temperatureHigh                0
sunriseTime                    0
temperatureHighTime            0
uvIndexTime                    1
summary                        0
temperatureLowTime             0
apparentTemperatureMin         0
apparentTemperatureMaxTime     0
apparentTe

In [23]:
# Show the rows where 'uvIndex' is missing.
df_weather.loc[df_weather["uvIndex"].isna()]

Unnamed: 0,temperatureMax,temperatureMaxTime,windBearing,icon,dewPoint,temperatureMinTime,cloudCover,windSpeed,pressure,apparentTemperatureMinTime,...,temperatureHigh,sunriseTime,temperatureHighTime,uvIndexTime,summary,temperatureLowTime,apparentTemperatureMin,apparentTemperatureMaxTime,apparentTemperatureLowTime,moonPhase
846,11.34,2014-01-01 20:00:00,187,wind,6.78,2014-01-01 02:00:00,,7.2,993.32,2014-01-01 02:00:00,...,10.83,2014-01-01 08:07:22,2014-01-01 19:00:00,,Breezy until evening.,2014-01-02 08:00:00,3.18,2014-01-01 20:00:00,2014-01-02 08:00:00,0.01


Only 3 null values, and they are all on the same date (2014-01-01).

The 'time' column in this file is a datetime stamp; it would be useful if it were only a date, so that it matches the other CSV files.

In [24]:
# Parse the 'time' strings into pandas datetime values (replaces the column in place).
# NOTE(review): to_datetime keeps any time-of-day component present in the data;
# if a pure calendar date is required to match the other files, confirm whether
# the values need truncating as well — not verifiable from this notebook alone.
df_weather["time"] = pd.to_datetime(df_weather["time"])

In [25]:
# Display the converted 'time' column to check its dtype.
df_weather.loc[:, "time"]

0     2011-11-11
1     2011-12-11
2     2011-12-27
3     2011-12-02
4     2011-12-24
         ...    
877   2014-01-26
878   2014-02-27
879   2014-03-09
880   2014-02-12
881   2014-02-15
Name: time, Length: 882, dtype: datetime64[ns]

# Export amended files

In [28]:
# Write the cleaned frames to disk, without the index column.
# DataFrame.to_csv returns None when it is given a file path, so its result must
# not be assigned back to the frame: doing so would replace df_daily and
# df_weather with None, breaking any later use and any re-run of this cell.
df_daily.to_csv("../Data/daily_dataset_clean.csv", index=False)
df_weather.to_csv("../Data/weather_daily_darksky_clean.csv", index=False)