In [None]:
import pandas as pd
import numpy as np
import glob

In [5]:
def clean_datetime(df):
    """
    Input should be a df with a column called 'datetime'.
    This function checks wether a row in the df.datetime column can be parsed to a pandas datetime object,
    by trying pd.to_datetime() on it.
    If it fails it will replace that row with np.nan().
    Finally this function will return the df with the NaN rows dropped.
    It only drops the row if the datetime column contains a NaN.
    """
    for i in range(len(df)):
        try:
            pd.to_datetime(df.datetime[i])
        except ValueError:
            print('-----')
            print('ValueError at index = %s' % i)
            print(df.datetime[i])
            df.datetime = df.datetime.replace(df.datetime[i], np.nan)
    df = df.dropna(subset = ['datetime'])
    return df

In [22]:
def clean_prepare_smart_gas(file_path):
    """
    Input is a dwelling_id.csv file.
    Output are cleaned & prepared dataframes (smart, gas).
    Return: smart, gas
    """
    df = pd.read_csv(file_path, delimiter=';', header=0)
    smart = df.iloc[:,:7]
    gas = df.iloc[:,7:]
    
    smart = smart.rename(index=str,columns={"Timestamp":"datetime"})
    gas = gas.rename(index=str,columns={"gasTimestamp":"datetime"})

    smart = clean_datetime(smart)
    gas = clean_datetime(gas)
    
    smart['datetime'] = pd.to_datetime(smart['datetime'])
    gas['datetime'] = pd.to_datetime(gas['datetime'])

    
    smart = smart.set_index(['datetime'])
    gas = gas.set_index(['datetime'])

    return smart, gas

In [26]:
def resample_smart_gas(smart, gas):
    """
    Resamples the (smart, gas) dfs to 10s.
    Also calculates gasPower. 
    Returns (smart_resampled, gas_resampled)
    """
    smart_resampled = smart.resample('10s').mean()
    
    gas_resampled = gas.resample('H').mean()
    # replace 0s with NaNs
    gas_resampled = gas_resampled.resample('10s').interpolate(method='time')
    gas_resampled['gasPower'] = gas_resampled['gasMeter'].diff()
    
    return smart_resampled, gas_resampled

In [27]:
def merge_smart_gas_weather(smart_resampled, gas_resampled, weather):
    """
    Merges the dataframes, outputs one df.
    """
    df = pd.merge(smart, gas, left_index=True, right_index=True)
    df = pd.merge(df, weather, left_index=True, right_index=True)
    
    return df

In [42]:
def save_df(df, dwelling_id):
    dir = '//datc//opschaler//combined_dfs_gas_smart_weather//'
    df.to_csv(dir+dwelling_id+'_merged_gas_smart_weather.csv', sep='\t', index=True)
    print('Saved %s' % dwelling_id)

In [None]:
def main():
    for i,file_path in enumerate(file_paths):
        dwelling_id = file_paths[i][-15:-4]
        smart, gas = clean_prepare_smart_gas(file_paths[i])
        smart_resampled, gas_resampled = resample_smart_gas(smart, gas)
        df = merge_smart_gas_weather(smart_resampled, gas_resampled, weather)
        print('Finished iteration: %s' % i)

In [None]:
weather=pd.read_csv('//datc//opschaler//weather_data//weather.csv',delimiter='\t',comment='#',parse_dates=['datetime'])
weather=weather.set_index(['datetime'])

In [52]:
path='/datc/opschaler/smartmeter_data'
file_paths = glob.glob(path + "/*.csv")

file_paths = file_paths[:2]
file_paths

['/datc/opschaler/smartmeter_data/P02S01W0998.csv',
 '/datc/opschaler/smartmeter_data/P01S01W0373.csv']

In [53]:
def main():
    for i,file_path in enumerate(file_paths):
        dwelling_id = file_paths[i][-15:-4]
        smart, gas = clean_prepare_smart_gas(file_paths[i])
        smart_resampled, gas_resampled = resample_smart_gas(smart, gas)
        df = merge_smart_gas_weather(smart_resampled, gas_resampled, weather)
        print('Finished iteration: %s' % i)

In [None]:
%timeit main()

Finished iteration: 0
