In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
def clean_datetime(df):
    """
    Drop the rows of `df` whose 'datetime' value cannot be parsed.

    Every value of the 'datetime' column is passed to pd.to_datetime() on its
    own. A value that raises ValueError is reported on stdout and its row is
    removed. Rows whose 'datetime' is already missing (NaN) are removed too.
    No other column is inspected, so NaNs elsewhere never cause a drop.

    The surviving 'datetime' values are returned unchanged (they are NOT
    converted to datetime objects) and the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a column called 'datetime'. Any kind of index is accepted.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows with a parseable 'datetime' value.
    """
    column = df['datetime']
    parseable = np.ones(len(column), dtype=bool)

    # Walk the column by position rather than by label: the index may hold
    # strings or arbitrary integers, so `column[i]` is not a safe lookup.
    for position, value in enumerate(column.tolist()):
        try:
            pd.to_datetime(value)
        except ValueError:
            print('-----')
            print('ValueError at index = %s' % position)
            print(value)
            parseable[position] = False

    # Boolean selection builds a new frame, leaving the caller's data intact;
    # dropna then removes rows whose 'datetime' was missing to begin with.
    return df[parseable].dropna(subset=['datetime'])

In [3]:
# Load the tab-separated weather file (lines starting with '#' are skipped),
# parse its 'datetime' column and use it as the index.
weather = (
    pd.read_csv(
        '//datc//opschaler//weather_data//weather.csv',
        sep='\t',
        comment='#',
        parse_dates=['datetime'],
    )
    .set_index('datetime')
)
weather.head()

Unnamed: 0_level_0,DD,DR,FF,FX,N,P,Q,RG,SQ,T,T10,TD,U,VV,WW
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-01-01 00:20:00,200.7,0.0,4.97,6.9,0.0,1033.9,0.0,0.0,0.0,3.2,2.3,1.0,85.0,3650.0,10.0
2015-01-01 00:20:10,200.838333,0.0,4.967833,6.899,0.0,1033.906667,0.0,0.0,0.0,3.198333,2.296667,0.998333,85.0,3641.683333,10.0
2015-01-01 00:20:20,200.976667,0.0,4.965667,6.898,0.0,1033.913333,0.0,0.0,0.0,3.196667,2.293333,0.996667,85.0,3633.366667,10.0
2015-01-01 00:20:30,201.115,0.0,4.9635,6.897,0.0,1033.92,0.0,0.0,0.0,3.195,2.29,0.995,85.0,3625.05,10.0
2015-01-01 00:20:40,201.253333,0.0,4.961333,6.896,0.0,1033.926667,0.0,0.0,0.0,3.193333,2.286667,0.993333,85.0,3616.733333,10.0


In [4]:
# Collect every smart-meter CSV export in the data directory.
# glob returns its matches in arbitrary order, so the list is sorted to keep
# positions (e.g. dfs_smart[3] further down) stable from run to run.
path='/datc/opschaler/smartmeter_data'
filenames = sorted(glob.glob(path + "/*.csv"))

In [5]:
def _index_by_datetime(part, timestamp_column):
    """Rename `timestamp_column` to 'datetime', drop unparseable rows and index by it."""
    part = part.rename(index=str, columns={timestamp_column: "datetime"})
    part = clean_datetime(part)
    part['datetime'] = pd.to_datetime(part['datetime'])
    return part.set_index(['datetime'])


# One frame per input file, appended in the order of `filenames`.
dfs_smart = []
dfs_gas = []

for csv_file in filenames:
    raw = pd.read_csv(csv_file, delimiter=';', header=0)

    # The first seven columns (starting with "Timestamp") form the smart-meter
    # part; the remaining columns (starting with "gasTimestamp") form the gas part.
    smart = _index_by_datetime(raw.iloc[:, :7], "Timestamp")
    gas = _index_by_datetime(raw.iloc[:, 7:], "gasTimestamp")

    dfs_smart.append(smart)
    dfs_gas.append(gas)

print('-----FINISHED-----')

-----
ValueError at index = 179023
<br />
-----
ValueError at index = 179024
<b>Fatal error</b>:  Allowed memory size of 134217728 bytes exhausted (tried to allocate 32 bytes) in <b>D:\wamp\www\opschaler\downloaddata.php</b> on line <b>21</b><br />
-----FINISHED-----


In [6]:
# Show the column dtypes of the smart-meter frame built from the first file.
dfs_smart[0].dtypes

eMeter             float64
eMeterReturn       float64
eMeterLow          float64
eMeterLowReturn    float64
ePower             float64
ePowerReturn       float64
dtype: object

In [7]:
"""
NaN is a missing row()
"""

# Try the resampling on a single frame: average the readings per 10-second
# bin. Bins that contain no readings come out as NaN.
t = dfs_smart[3].resample('10s').mean()
t.head()

Unnamed: 0_level_0,eMeter,eMeterReturn,eMeterLow,eMeterLowReturn,ePower,ePowerReturn
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-03-07 16:45:30,4108.425,0.0,4623.579,0.0,190.0,0.0
2017-03-07 16:45:40,4108.425,0.0,4623.579,0.0,191.0,0.0
2017-03-07 16:45:50,4108.426,0.0,4623.579,0.0,99.0,0.0
2017-03-07 16:46:00,4108.426,0.0,4623.579,0.0,101.0,0.0
2017-03-07 16:46:10,4108.426,0.0,4623.579,0.0,97.0,0.0


In [14]:
# Put every frame on a regular 10-second grid.

# Smart-meter readings: mean per 10-second bin (bins without readings are NaN).
dfs_smart_resampled = [smart.resample('10s').mean() for smart in dfs_smart]

dfs_gas_resampled = []
for gas in dfs_gas:
    # Average per hour first, then upsample to 10 seconds and fill the new
    # rows by interpolating over time.
    # 'h' is the current spelling of the hourly alias; the upper-case 'H'
    # is deprecated in recent pandas releases.
    gas = gas.resample('h').mean()
    gas = gas.resample('10s').interpolate(method='time')

    # Difference between consecutive interpolated meter readings, i.e. the
    # change per 10-second step. The first row has no predecessor and is NaN.
    gas['gasPower'] = gas['gasMeter'].diff()

    dfs_gas_resampled.append(gas)

In [39]:
# Combine the resampled smart-meter and gas frames of each file with the
# weather data, matching rows on the datetime index. merge() defaults to an
# inner join, so only timestamps present in all three frames are kept.
dfs_merged = []

for smart, gas in zip(dfs_smart_resampled, dfs_gas_resampled):
    meters = smart.merge(gas, left_index=True, right_index=True)
    df = meters.merge(weather, left_index=True, right_index=True)
    dfs_merged.append(df)

In [None]:
"""
Finally save the merged dataframes to dir/dwelling_id.csv
"""
