# Data preparation notebook

<ol>
<li> uses the half hourly data </li> 
<li> sorts each block of the  dataset by time and fills forward the hourly weather data </li>
<li> converts timestamp columns to minute of day, day of week, month, etc. </li>
<li> converts categorical columns to ints (not one-hot encoded). The mapping is persisted as a nested dictionary in `cat_map.pkl` </li>
<li> each block file, after being processed with the above feature engineering, is appended to a bcolz array </li>
</ol>


In [24]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from datetime import datetime
import bcolz
import shutil
import pickle

In [2]:
# Name of the target column in the half-hourly block files.
label_col = 'energy(kWh/hh)'

# Columns that are replaced by integer codes (see cat_map below).
categorical_cols = [
    'LCLid', 'stdorToU', 'Acorn_grouped', 'Acorn',
    'summary', 'h_summary', 'icon', 'h_icon', 'h_precipType',
]

# Columns that are passed through as numbers. Names starting with "h_" are
# hourly-weather columns and names ending in "_mod" are produced by
# convert_time_cols.
numeric_cols = [
    'h_visibility', 'h_windBearing', 'h_temperature', 'h_dewPoint', 'h_pressure',
    'h_apparentTemperature', 'h_windSpeed', 'h_humidity',
    'month_x', 'dayofweek', 'isHoliday', 'halfhourofday',
    'temperatureMax', 'windBearing', 'dewPoint', 'cloudCover', 'windSpeed', 'pressure',
    'apparentTemperatureHigh', 'visibility', 'humidity', 'apparentTemperatureLow',
    'apparentTemperatureMax', 'uvIndex', 'temperatureLow', 'temperatureMin',
    'temperatureHigh', 'apparentTemperatureMin', 'moonPhase',
    'temperatureMinTime_mod', 'temperatureMaxTime_mod',
    'apparentTemperatureMinTime_mod', 'apparentTemperatureMaxTime_mod',
    'temperatureHighTime_mod', 'temperatureLowTime_mod',
    'apparentTemperatureHighTime_mod', 'apparentTemperatureLowTime_mod',
    'sunsetTime_mod', 'sunriseTime_mod', 'uvIndexTime_mod',
]

# Final column order of every persisted row: categoricals, numerics, then the label.
allcols = categorical_cols + numeric_cols + [label_col]

# One (initially empty) value -> integer-code dictionary per categorical column.
cat_map = {name: {} for name in categorical_cols}

In [3]:
def getvalue_for_column(colname,value):
    """Return the integer code of `value` within categorical column `colname`.

    Codes live in the module-level `cat_map[colname]` dictionary. A value that
    has not been seen before is assigned the next code, which is the current
    number of known values plus one (so codes start at 1), and the dictionary
    is updated in place.

    Raises KeyError when `colname` has no entry in `cat_map`.
    """
    codes = cat_map[colname]
    # setdefault stores the new code only when `value` is not a key yet;
    # otherwise it hands back the code that was assigned earlier.
    return codes.setdefault(value, len(codes) + 1)

In [4]:
def half_hour_of_day(datestring, formatstr='%Y-%m-%d %H:%M:%S.%f', stripChars=6):
    """Return the time of day of a timestamp string, measured in half hours.

    Parameters
    ----------
    datestring : str or float
        Timestamp text. A float (how pandas represents a missing value in a
        string column) yields NaN, matching minute_of_day and get_ts_int.
    formatstr : str
        `datetime.strptime` format applied after stripping.
    stripChars : int
        Number of trailing characters removed before parsing; 0 or less
        means the string is parsed as-is.
        # presumably the raw timestamps carry more fractional digits than
        # %f accepts - confirm against the data

    Returns
    -------
    float
        2 * (hour + minute / 60), e.g. 21.0 for 10:30, or NaN for float input.

    Raises
    ------
    ValueError
        If the (stripped) text does not match `formatstr`.
    """
    if isinstance(datestring, float):
        return np.nan
    text = datestring[:-stripChars] if stripChars > 0 else datestring
    parsed = datetime.strptime(text, formatstr)
    return 2*(parsed.hour + (parsed.minute /60.0))

In [5]:
def minute_of_day(datestring, formatstr='%Y-%m-%d %H:%M:%S.%f', stripChars=6):
    """Return the number of minutes since midnight for a timestamp string.

    Parameters
    ----------
    datestring : str or float
        Timestamp text. Any float (including numpy.float64, which subclasses
        float) is treated as a missing value and yields NaN.
    formatstr : str
        `datetime.strptime` format applied after stripping.
    stripChars : int
        Number of trailing characters removed before parsing; 0 or less
        means the string is parsed as-is.

    Returns
    -------
    int or float
        hour * 60 + minute, or NaN for float input.

    Raises
    ------
    ValueError
        If the (stripped) text does not match `formatstr`.
    """
    # isinstance (rather than an exact type comparison) also catches float
    # subclasses such as numpy.float64, which would otherwise fail below.
    if isinstance(datestring, float):
        return np.nan
    text = datestring[:-stripChars] if stripChars > 0 else datestring
    parsed = datetime.strptime(text, formatstr)
    return parsed.hour * 60 + parsed.minute

In [6]:
def get_ts_int(datestring, formatstr='%Y-%m-%d %H:%M:%S.%f', stripChars=6):
    """Return a timestamp string as whole seconds elapsed since 2000-01-01 00:00:00.

    Parameters
    ----------
    datestring : str or float
        Timestamp text. Any float (including numpy.float64, which subclasses
        float) is treated as a missing value and yields NaN.
    formatstr : str
        `datetime.strptime` format applied after stripping.
    stripChars : int
        Number of trailing characters removed before parsing; 0 or less
        means the string is parsed as-is.

    Returns
    -------
    int or float
        days * 86400 + seconds of the difference to the year-2000 epoch
        (sub-second parts are ignored), or NaN for float input.

    Raises
    ------
    ValueError
        If the (stripped) text does not match `formatstr`.
    """
    # isinstance (rather than an exact type comparison) also catches float
    # subclasses such as numpy.float64, which would otherwise fail below.
    if isinstance(datestring, float):
        return np.nan
    text = datestring[:-stripChars] if stripChars > 0 else datestring
    elapsed = datetime.strptime(text, formatstr) - datetime(2000, 1, 1)
    return elapsed.days*24*3600 + elapsed.seconds

In [7]:
def convert_time_cols(data_frame, column_name):
    """Add a "<column_name>_mod" column holding the minute of day of `column_name`.

    Every value of `data_frame[column_name]` is passed to `minute_of_day`
    using the '%Y-%m-%d %H:%M:%S' format with no trailing characters stripped.
    `data_frame` is modified in place; nothing is returned.
    """
    def to_minutes(raw_value):
        return minute_of_day(raw_value, formatstr='%Y-%m-%d %H:%M:%S', stripChars=0)

    target_column = column_name + "_mod"
    data_frame[target_column] = data_frame[column_name].apply(to_minutes)

In [8]:
# Root folder that holds every input file read by this notebook.
DATA_ROOT = "data"
# Folder containing the half-hourly block files, one file per block.
BLOCK_PATH = os.path.join(DATA_ROOT,'halfhourly_dataset')
# os.listdir returns entries in arbitrary order. Sorting makes the block
# processing order - and therefore the row order of the persisted array and
# the integer codes handed out for categorical values - reproducible.
BLOCKS = sorted(os.listdir(BLOCK_PATH))

In [47]:
def persist_bcolz(chunk_number, data, processed_data_dir):
    """Write one chunk of rows to the on-disk bcolz array held in the global `da`.

    chunk_number == 0 starts a fresh store: an existing `processed_data_dir`
    is removed and a new carray rooted there is created from `data`.
    Any other chunk number appends `data` to the array created earlier, so
    chunk 0 must have been persisted first in the same session.
    Flushing is left to the caller.
    """
    global da
    if chunk_number != 0:
        da.append(data)
        return

    # First chunk: discard whatever a previous run left behind.
    if os.path.isdir(processed_data_dir):
        shutil.rmtree(processed_data_dir)
    da = bcolz.carray(data, rootdir=processed_data_dir)

In [10]:
def _load_csv(file_name):
    """Read a CSV file that sits directly under DATA_ROOT into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_ROOT, file_name))

# Reference tables that are joined onto every half-hourly block.
hourly_weather = _load_csv("weather_hourly_darksky.csv")
daily_weather = _load_csv("weather_daily_darksky.csv")
house = _load_csv("informations_households.csv")
holidays = _load_csv("uk_bank_holidays.csv")

In [11]:
# Prefix every hourly-weather column with "h_" so the names cannot clash with
# the daily-weather columns, then restore the join key to its plain name "time".
hourly_weather = (
    hourly_weather
    .add_prefix("h_")
    .rename(columns={"h_time": "time"})
)
# Array of bank-holiday values used for the isHoliday membership test.
hols = holidays['Bank holidays'].values

In [12]:
# Month number taken from characters 5-6 of the daily 'time' string.
daily_weather['month']=[ int(v[5:7]) for v in daily_weather['time'].values]
# Give each timestamp column a numeric "<name>_mod" companion (minute of day).
for c in (
    'temperatureMinTime', 'temperatureMaxTime',
    'apparentTemperatureMinTime', 'apparentTemperatureMaxTime',
    'temperatureHighTime', 'temperatureLowTime',
    'apparentTemperatureHighTime', 'apparentTemperatureLowTime',
    'sunsetTime', 'sunriseTime', 'uvIndexTime',
):
    convert_time_cols(daily_weather, c)
# Join key for the per-block merge: the first 10 characters of temperatureMinTime.
daily_weather['date']=[ v[:10] for v in daily_weather['temperatureMinTime'].values]
# fillna(method='ffill') is deprecated in recent pandas releases; .ffill() is the
# equivalent call and matches the forward fill used in feature_eng.
daily_weather.ffill(inplace=True)

In [13]:
## per block operations
def feature_eng(df):
    """Turn one raw half-hourly block into the model-ready column set.

    Joins household information, hourly weather and daily weather onto the
    block, derives calendar/time features from the 'tstp' string column,
    forward-fills the hourly weather columns in time order, coerces the label
    to numeric and replaces every categorical column by integer codes.

    Relies on the module-level names house, hourly_weather, daily_weather,
    hols, label_col, categorical_cols, allcols and (through
    getvalue_for_column) cat_map, which is updated as new values are seen.

    Note: a 'time' column is added to the frame passed in before the first
    merge rebinds `df`, so the caller's frame is modified.

    Returns a DataFrame restricted to `allcols`, ordered by ascending 'ts_int'.
    """
    # Join key for the hourly weather: the first 19 characters of the timestamp.
    df['time']=  df['tstp'].astype(str).str[:19]
    df = pd.merge(df, house, on='LCLid', how='left')
    # Left join: rows with no hourly-weather match get NaN in the h_* columns
    # (forward-filled further down).
    df = pd.merge(df, hourly_weather, on='time',  how='left')
    # NOTE(review): daily_weather also has a 'month' column, so the merge on
    # 'date' below presumably renames this one to 'month_x' (pandas' default
    # suffix) - that is the name numeric_cols selects. Confirm if either side changes.
    df['month']=[ int(v[5:7]) for v in df['tstp'].values]
    # The last 6 characters are dropped before parsing with %f.
    df['dayofweek']=[datetime.strptime(v[:-6], '%Y-%m-%d %H:%M:%S.%f').weekday()  for v in df['tstp'].values]
    # True when the date part (first 10 characters) appears in hols.
    df['isHoliday']=[v[:10] in hols  for v in df['tstp'].values]
    df['halfhourofday'] = df.tstp.apply(lambda x: half_hour_of_day(x) )
    # Join key for the daily weather.
    df['date']=[ v[:10] for v in df['tstp'].values] 
    # Seconds since 2000-01-01; used only as the sort key below.
    df['ts_int'] = df.tstp.apply(lambda x: get_ts_int(x))
    df = pd.merge(df, daily_weather, on='date', how='left')
    # Sort by time first so the forward fill below copies each missing hourly
    # value from the closest earlier row.
    df.sort_values(by='ts_int', ascending=True, inplace=True)
    hcols =['h_visibility', 'h_windBearing', 'h_temperature','h_dewPoint', 'h_pressure', 'h_apparentTemperature', 'h_windSpeed',
       'h_precipType', 'h_icon', 'h_humidity', 'h_summary']
    df[hcols]= df[hcols].ffill()
    # Values that cannot be parsed as numbers become NaN.
    df[label_col] = pd.to_numeric(df[label_col], errors='coerce')
    for c in categorical_cols:  # convert cat columns to ints
        df[c] = df[c].apply(lambda x: getvalue_for_column(c,x))
    return df[allcols]

In [51]:
def _process_block(block_index, block_file):
    """Read one block file, engineer its features and persist them as float32 rows."""
    raw_block = pd.read_csv(os.path.join(BLOCK_PATH, block_file))
    features = feature_eng(raw_block)
    # feature_eng already returns exactly the `allcols` columns, in that order.
    persist_bcolz(block_index, features.values.astype(np.float32), 'data/processed_main')
    # The two frames are released when this function returns.


for i, block in enumerate(BLOCKS):
    print ("starting  block ", i)
    _process_block(i, block)
    print ("completed block ", i)

# Write any buffered rows of the bcolz array to disk.
da.flush()

# Save the value -> integer-code dictionaries built while encoding the blocks.
with open("data/cat_map.pkl", "wb") as output_file:
    pickle.dump(cat_map, output_file)

starting  block  0
completed block  0
starting  block  1
completed block  1
starting  block  2
completed block  2
starting  block  3
completed block  3
starting  block  4
completed block  4
starting  block  5
completed block  5
starting  block  6
completed block  6
starting  block  7
completed block  7
starting  block  8
completed block  8
starting  block  9
completed block  9
starting  block  10
completed block  10
starting  block  11
completed block  11
starting  block  12
completed block  12
starting  block  13
completed block  13
starting  block  14
completed block  14
starting  block  15
completed block  15
starting  block  16
completed block  16
starting  block  17
completed block  17
starting  block  18
completed block  18
starting  block  19
completed block  19
starting  block  20
completed block  20
starting  block  21
completed block  21
starting  block  22
completed block  22
starting  block  23
completed block  23
starting  block  24
completed block  24
starting  block  25


  interactivity=interactivity, compiler=compiler, result=result)


completed block  29
starting  block  30
completed block  30
starting  block  31
completed block  31
starting  block  32
completed block  32
starting  block  33
completed block  33
starting  block  34
completed block  34
starting  block  35
completed block  35
starting  block  36
completed block  36
starting  block  37
completed block  37
starting  block  38
completed block  38
starting  block  39
completed block  39
starting  block  40
completed block  40
starting  block  41
completed block  41
starting  block  42
completed block  42
starting  block  43
completed block  43
starting  block  44
completed block  44
starting  block  45
completed block  45
starting  block  46
completed block  46
starting  block  47
completed block  47
starting  block  48
completed block  48
starting  block  49
completed block  49
starting  block  50
completed block  50
starting  block  51
completed block  51
starting  block  52
completed block  52
starting  block  53
completed block  53
starting  block  54


In [40]:
# Re-open the persisted array from disk in read-only mode.
da = bcolz.open(rootdir='data/processed_main', mode='r')

In [53]:
# Length of the persisted array along its first axis (one entry per appended row).
len(da)

171240269