# Energy Data Cleanup

This holds any potentially destructive cleanup -- massaging data to make it fit our expectations.

Cleanups include:

1. Completely null rows are dropped.
2. [in progress] Gaps of 5 minutes or less are linearly interpolated.

In [120]:
# Preliminary setup
import pandas as pd
import numpy as np
import zipfile
import os
import matplotlib.pyplot as plt

In [121]:
# Load the pickled frame that the remaining cells clean up.
# NOTE(review): unpickling executes arbitrary code -- only load files from a trusted source.
df = pd.read_pickle("alldata.pickle")

In [122]:
# Localize index: express the timestamps in Paris local time.
# tz_convert returns a new object instead of modifying `df`, so the result
# must be bound back to `df` for the conversion to reach the following cells.
# (tz_convert raises TypeError if the index is not already tz-aware.)
df = df.tz_convert('Europe/Paris')

# Create artificial 'house' column: the part of `active` that is not covered
# by the sub1..sub3 columns.
# sum(axis=1) skips NaN by default, so a missing sub value contributes 0 here.
subs = df[['sub1', 'sub2', 'sub3']].sum(axis=1)
df['house'] = df['active'] - subs

In [123]:
# Flag rows that follow a short gap: the previous row is more than 1 minute
# but no more than 5 minutes earlier. The result is a boolean Series aligned
# with df's index.
gap_floor = pd.Timedelta('1 minute')
gap_ceiling = pd.Timedelta('5 minutes')

idx = df.index.to_series()
timedeltas = idx - idx.shift()  # time elapsed since the previous row (NaT on the first row)
smallgaps = (timedeltas > gap_floor) & (timedeltas <= gap_ceiling)

In [124]:
# Deal with gaps: fill missing values by interpolating along the time index.
# interpolate() returns a new frame; the rest of the cell has to continue from
# `inter`, otherwise the filled values are thrown away before saving.
# NOTE(review): limit=5 caps the number of consecutive NaN rows filled, not
# minutes, and a longer run of NaNs still gets its first 5 rows filled --
# confirm this matches the intended "gaps <= 5 minutes" rule.
inter = df.interpolate(method='time', limit=5)

# Drop rows in which every column is still NaN after interpolation.
# Rows with only some NaN values are kept (how='all').
df = inter.dropna(how='all')

In [125]:
# Save the cleaned frame under a separate file name; the input pickle is not overwritten.
df.to_pickle('alldata_clean.pickle')