In [1]:
import os
import math
import pandas as pd
import numpy as np
from pandas import datetime
from matplotlib import pyplot as plt


In [2]:
# single household contsumption data extract function
def extract_house(data_frame, lclid):
    df = data_frame.set_index("tstp")
    df.index = df.index.astype("datetime64")
    
    df["energy(kWh/hh)"] = df["energy(kWh/hh)"].astype("float64")
    df = df[df["LCLid"] == "{}".format(lclid) ]
    return df

# weather extraction function
def extract_weather(data_frame):
    df = data_frame.set_index("time")
    df.index = df.index.astype("datetime64")
    return df

### Electricity consumption processing

In [3]:
elec = pd.read_csv("smart-meters-in-london/halfhourly_dataset/block_0.csv")
for num in range(1,111):    
    elec = elec.append(pd.read_csv("smart-meters-in-london/halfhourly_dataset/block_"+str(num)+".csv"))
        

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
elec.replace('Null', np.NaN, inplace=True)
energy = elec.iloc[:,:]
energy["energy"] = energy["energy(kWh/hh)"].astype("float64")
energy = energy.drop(["energy(kWh/hh)"],axis = 1)

**Missing values**

In [10]:
def missing_count(df):
    data = pd.DataFrame(df)
    df_cols = list(pd.DataFrame(data))
    dict_x = {}
    for i in range(0, len(df_cols)):
        dict_x.update({df_cols[i]: data[df_cols[i]].isnull().sum()})
    
    return dict_x


missing = missing_count(energy)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Count of missing data')
df_miss[0:10]


Count of missing data


[('energy', 5544), ('LCLid', 0), ('tstp', 0)]

### Single household

In [18]:
energy_MAC1 = energy[energy["LCLid"] == "MAC000010"]

In [19]:
missing = missing_count(energy_MAC1)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Count of missing data')
df_miss[0:10]

Count of missing data


[('energy', 1), ('LCLid', 0), ('tstp', 0)]

In [20]:
energy_MAC1["energy"] = energy_MAC1["energy"].astype("float64")
energy_MAC1["tstp"] = energy_MAC1["tstp"].astype("datetime64")
energy_MAC1 = energy_MAC1.groupby('tstp').agg({'energy': lambda x: x.mean(skipna=False)})
energy_MAC1['tstp'] = energy_MAC1.index
energy_MAC1.drop(['tstp'], axis = 1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### To prevent data leakage we split trian and test first

In [27]:
train_s = energy_MAC1.iloc[:20000, :]
test_s = energy_MAC1.iloc[20000:,:]

**then deal with missing value and resampling to 1hr**

In [28]:
train_s.energy = train_s.energy.fillna(train_s.median())
test_s.energy = test_s.energy.fillna(test_s.median())
train_s = train_s.resample('1H').sum()
test_s = test_s.resample('1H').sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


### Aggregated   household

**Group by LCLid and aggregate over per hour**


In [8]:
## deal with missing value by interpolating mean of whole column
# energy["energy"].fillna(energy["energy"].median(), inplace = True)
## group by time stamps and aggregate through mean
energy["energy"] = energy["energy"].astype("float64")
energy["tstp"] = energy["tstp"].astype("datetime64")
energy = energy.groupby('tstp').agg({'energy': lambda x: x.mean(skipna=False)})
energy['tstp'] = energy.index
energy.drop(['tstp'], axis = 1, inplace=True)

### To prevent data leakage we split trian and test first

In [28]:
train_e = energy.iloc[:28750, :]
test_e = energy.iloc[28750:,:]

**then deal with missing value and resampling to 1hr**

In [31]:
train_e.energy = train_e.energy.fillna(train_e.median())
test_e.energy = test_e.energy.fillna(test_e.median())
train_e = train_e.resample('1H').sum()
test_e = test_e.resample('1H').sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [35]:
## check if there is still missing value
missing = missing_count(test_e)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Count of missing data')
df_miss[0:10]


Count of missing data


[('energy', 0)]

### Weather data processing


In [34]:
weather = pd.read_csv('smart-meters-in-london/weather_hourly_darksky.csv')

In [35]:
## set index as time stamp
weather = weather.set_index("time")
weather.index = weather.index.astype("datetime64[ns]")

In [36]:
weather = weather.sort_index()

**Missing values**

In [38]:
def count_missing(df):
    data = pd.DataFrame(df)
    df_cols = list(pd.DataFrame(data))
    dict_x = {}
    for i in range(0, len(df_cols)):
        dict_x.update({df_cols[i]: round(data[df_cols[i]].isnull().sum()*100,2)})
    
    return dict_x

missing = count_missing(weather)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Percent of missing data')
df_miss[0:]

Percent of missing data


[('pressure', 1300),
 ('visibility', 0),
 ('windBearing', 0),
 ('temperature', 0),
 ('dewPoint', 0),
 ('apparentTemperature', 0),
 ('windSpeed', 0),
 ('precipType', 0),
 ('icon', 0),
 ('humidity', 0),
 ('summary', 0)]

In [39]:
##merge trian and test seperately
train =  weather.merge(train_s, left_index = True, right_index = True)
test = weather.merge(test_s.iloc[1:,:], left_index = True, right_index = True)

In [40]:
## fillna with median 
train["pressure"] = train["pressure"].fillna(train["pressure"].median())
test["pressure"] = test["pressure"].fillna(test["pressure"].median())

In [43]:
## data set without catagorical feature
train_nocat = train.drop(["precipType", "icon", "summary"],axis = 1)
test_nocat = test.drop(["precipType", "icon", "summary"],axis = 1)
train_nocat.to_csv("train_nocat_s.csv", index = True)
test_nocat.to_csv("test_nocat_s.csv", index = True)

**One hot encoding**

In [73]:
## Examine unique categories in categorical features
p = list(weather["precipType"].unique()) 
i = list(weather["icon"].unique()) 
u = list(weather["summary"].unique())
# Sort them for one hot encoding labels
p.sort()
i.sort()
u.sort()

In [74]:
from sklearn.preprocessing import OneHotEncoder
## One hot encoding these features above
onehot = OneHotEncoder(dtype=np.int, sparse=True)
nominals = pd.DataFrame(onehot.fit_transform(weather[['precipType', 'icon','summary']]).toarray(),columns=p+i+u)
nominals.index = weather.index

In [12]:
#Concate nominals with origin weather dataframe
weather = weather.set_index("time")
weather.index = weather.index.astype("datetime64")
#weather.drop(["precipType","icon","summary"],axis = 1, inplace = True)
weather =  weather.merge(nominals, left_index = True, right_index = True)


KeyError: "None of ['time'] are in the columns"

### Merge weather and energy data

In [14]:
### Merge weather and energy data together
weather_energy = weather.merge(energy, left_index = True, right_index = True)
weather_energy.index.names = ["time"]

In [15]:
##Split data into training set and testing set and save to csv file format
train_we = weather_energy.iloc[:14000,:]
test_we = weather_energy.iloc[14000:,:]
train_we.to_csv("train_auto.csv", index = True)
test_we.to_csv("test_auto.csv", index = True)

In [177]:
##Split data into training set and testing set and save to csv file format
train_we = weather_energy.iloc[:14000,:]
test_we = weather_energy.iloc[14000:,:]
train_we.to_csv("train.csv", index = True)
test_we.to_csv("test.csv", index = True)

In [176]:
weather_energy

Unnamed: 0_level_0,visibility,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,humidity,rain,snow,...,Clear,Foggy,Mostly Cloudy,Overcast,Partly Cloudy,Windy,Windy and Mostly Cloudy,Windy and Overcast,Windy and Partly Cloudy,energy
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-12-11 00:00:00,12.50,210,2.83,1.17,1015.67,1.11,1.78,0.89,1,0,...,0,0,0,0,1,0,0,0,0,0.498395
2011-12-11 01:00:00,12.65,204,2.48,0.81,1014.96,0.31,2.11,0.89,1,0,...,1,0,0,0,0,0,0,0,0,0.445535
2011-12-11 02:00:00,13.02,214,2.70,1.29,1014.42,0.11,2.57,0.90,1,0,...,1,0,0,0,0,0,0,0,0,0.349090
2011-12-11 03:00:00,13.05,211,3.47,1.41,1013.78,0.66,3.00,0.86,1,0,...,0,0,0,0,1,0,0,0,0,0.317786
2011-12-11 04:00:00,12.97,204,3.74,1.53,1012.94,1.29,2.64,0.85,1,0,...,0,0,0,0,1,0,0,0,0,0.281382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-02-15 19:00:00,12.68,247,7.39,3.06,997.07,3.91,6.08,0.74,1,0,...,0,0,0,0,1,0,0,0,0,0.744099
2014-02-15 20:00:00,13.78,249,6.56,2.76,998.15,3.03,5.61,0.77,1,0,...,1,0,0,0,0,0,0,0,0,0.685877
2014-02-15 21:00:00,14.31,248,6.47,2.71,999.28,3.06,5.25,0.77,1,0,...,1,0,0,0,0,0,0,0,0,0.634185
2014-02-15 22:00:00,14.31,248,5.96,2.69,1000.33,2.68,4.69,0.80,1,0,...,1,0,0,0,0,0,0,0,0,0.574454
