# Energy Data Parser
- Gets the energy data into a dataframe ready for analytics
- Does not perform any analysis or non-reversible mods

### Todo
- Convert missing cells (?) to NaN
- Investigate NaT indices -- fix if possible

### Scope for *next* notebook
- outliers
- trend analysis for bad points
- interpolate/resample for missing cells
- ???

In [72]:
# Preliminary setup
import pandas as pd
import zipfile
import os
dataname = 'household_power_consumption';
# There are c. 2.e6 rows in this file.
nskiprows = int(1.1e5)
#nskiprows = 0
readrows = int(1.e5)
#readrows = None # Comment out for smaller test load.

# Extract zip if necessary
if not os.path.isfile(dataname + '.txt'):
    zip_ref = zipfile.ZipFile('src/' + dataname +'.zip', 'r')
    zip_ref.extractall('.')
    zip_ref.close()

In [73]:
# Import into dataframe
csv_labels = ['Date','Time','Global_active_power','Global_reactive_power','Voltage','Global_intensity','Sub_metering_1','Sub_metering_2','Sub_metering_3'];
namelist = ['date','time','active','reactive','volts','amps','sub1','sub2','sub3'];

df = pd.read_csv(dataname + '.txt'
                ,delimiter=';'
                ,nrows=readrows
                ,skiprows=nskiprows
                ,header=0
                ,names=namelist
                ,parse_dates=[['date', 'time']]
                #,index_col='date_time'
                );

# Record the file line number of the row, and reindex on new datetime 
df['rowno'] = df.index + 2 + nskiprows
df.set_index('date_time',inplace=True,drop=True)
df.index.rename('dtime',inplace=True);

In [74]:
# Show rows with nulls
dfnulls = df[df.isnull().any(axis=1)];
dfnulls

Unnamed: 0_level_0,active,reactive,volts,amps,sub1,sub2,sub3,rowno
dtime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2007-03-25 17:52:00,?,?,?,?,?,?,,142590
2007-04-28 00:21:00,?,?,?,?,?,?,,190499
2007-04-28 00:22:00,?,?,?,?,?,?,,190500
2007-04-28 00:23:00,?,?,?,?,?,?,,190501
2007-04-28 00:24:00,?,?,?,?,?,?,,190502
2007-04-28 00:25:00,?,?,?,?,?,?,,190503
2007-04-28 00:26:00,?,?,?,?,?,?,,190504
2007-04-28 00:27:00,?,?,?,?,?,?,,190505
2007-04-28 00:28:00,?,?,?,?,?,?,,190506
2007-04-28 00:29:00,?,?,?,?,?,?,,190507


Unnamed: 0,date_time,active,reactive,volts,amps,sub1,sub2,sub3,rowno
0,NaT,0.218,0.000,242.660,0.800,0.000,0.000,,
1,NaT,0.218,0.000,243.140,0.800,0.000,0.000,,
2,NaT,0.218,0.000,243.300,0.800,0.000,0.000,,
3,NaT,0.218,0.000,243.410,0.800,0.000,0.000,,
4,NaT,0.218,0.000,243.240,0.800,0.000,0.000,,
5,NaT,0.220,0.000,244.170,0.800,0.000,0.000,,
6,NaT,0.220,0.000,244.010,0.800,0.000,0.000,,
7,NaT,0.218,0.000,242.990,0.800,0.000,0.000,,
8,NaT,0.220,0.000,243.340,0.800,0.000,0.000,,
9,NaT,0.220,0.000,244.020,0.800,0.000,0.000,,
