# Energy Data Parser
- Gets the energy data into a dataframe ready for analytics
- Does not perform any analysis or non-reversible mods

### Todo
- ~~Convert missing cells (?) to NaN~~ -- handled on import via `na_values='?'`
- ~~Investigate NaT indices -- fix if possible~~
    - Dates are reported in DD/MM/YYYY format, and readings are reported strictly every minute over the entire dataset

In [1]:
# Preliminary setup
import pandas as pd
import zipfile
import os
import numpy as np

# Base name shared by the zipped source and the extracted text file.
dataname = 'household_power_consumption'

# The full file holds c. 2.e6 rows. For a smaller test load, swap in the
# example values noted beside each setting.
nskiprows = 0      # e.g. int(1.1e5)
readrows = None    # e.g. int(1.e5); None places no limit on the rows read

In [2]:
# Extract the zip if the text file is not already present.
# The context manager closes the archive even if extraction raises.
if not os.path.isfile(dataname + '.txt'):
    with zipfile.ZipFile('src/' + dataname + '.zip', 'r') as zip_ref:
        zip_ref.extractall('.')

In [3]:
# Import into dataframe
# csv_labels: presumably the column headers as they appear in the file (not
# used below); namelist holds the shorter names that replace them.
csv_labels = ['Date','Time','Global_active_power','Global_reactive_power','Voltage','Global_intensity','Sub_metering_1','Sub_metering_2','Sub_metering_3']
namelist = ['date','time','active','reactive','volts','amps','sub1','sub2','sub3']
# low_memory is enabled only when rows are both skipped and limited.
lowmem = not (nskiprows == 0 or readrows is None)

df = pd.read_csv(
    dataname + '.txt',
    delimiter=';',
    nrows=readrows,
    skiprows=nskiprows,
    header=0,          # the first line read is a header row, replaced by namelist
    names=namelist,
    low_memory=lowmem,
    na_values='?',     # missing cells are marked with '?'
)

# Combine the separate date and time columns into a single datetime column.
# This is done after loading because combining columns through read_csv's
# parse_dates=[[...]] is deprecated in recent pandas. The result keeps the
# 'date_time' name and first-column position, and the source columns are
# dropped, matching what the nested parse_dates form produced.
df.insert(
    0,
    'date_time',
    pd.to_datetime(df.pop('date') + ' ' + df.pop('time'), dayfirst=True),
)

In [4]:
#df.replace('?',np.NaN,inplace=True)

In [5]:
# Non-destructive formatting and unit matching

# 1. Re-index on the combined datetime column. (To also record each row's
#    line number in the source file, enable the next line before re-indexing.)
# df['rowno'] = df.index + 2 + nskiprows
df.set_index('date_time', drop=True, inplace=True)
df.index.name = 'dtime'

In [8]:
# Subs 1,2,3 are in Wh/min -- match with active power (kW) by multiplying by 60/1000
# Each column is scaled from its own values (sub3 was previously computed
# from sub2, which overwrote the third sub-meter's readings).
sub_cols = ['sub1', 'sub2', 'sub3']
df[sub_cols] = df[sub_cols] * 60 / 1000

In [9]:
# Persist the parsed dataframe so later cleanup steps can load it.
pickle_path = "alldata.pickle"
df.to_pickle(pickle_path)