# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
mpl.style.use('default')
import glob
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from dask import delayed

%matplotlib inline

# Setting up LocalCluster & run it
Might not be needed, but experienced that `processes=False, n_workers=1` are the best options to have.

In [2]:
#cluster = LocalCluster(processes=False, n_workers=8)
cluster = LocalCluster(processes=False, n_workers=1)
#cpu_worker = cluster.workers[0]
#cpu_worker.name = 'cpu'
#cpu_worker.set_resources(CPU=90)

client=Client(cluster)

In [3]:
cluster

In [4]:
client

0,1
Client  Scheduler: inproc://192.168.0.100/15144/1  Dashboard: http://localhost:8787/status,Cluster  Workers: 1  Cores: 8  Memory: 17.02 GB


# Defining functions

In [5]:
def read_weather_data():
    """
    Reads in the weather Pandas DataFrame.
    :return: Pandas DataFrame
    """
    # Check if UTC to gmt+1 conversion is being handled correctly
    weather = pd.read_csv('F://datc//opschaler//weather_data//knmi_10_min_raw_data//output//df_combined_uncleaned.csv',
                          delimiter='\t', comment='#',
                          parse_dates=['datetime'])
    weather = weather.set_index(['datetime'])
    weather = weather.astype('float32')
    return weather


def smartmeter_data():
    """
    Reads in the file paths and dwelling id's of the smartmeter data.
    :return: file_paths, dwelling_ids, both as lists.
    """
    path = 'F:\\datc\\opschaler\\smartmeter_data\\'
    file_paths = np.array(glob.glob(path + "*.csv"))

    print('Detected %s smartmeter_data files.' % len(file_paths))
    dwelling_ids = np.array(list((map(lambda x: x[-15:-4], file_paths))))

    return file_paths, dwelling_ids


@delayed(nout=2)
def clean_prepare_smart_gas(file_path, dwelling_id):
    """
    Input is a dwelling_id.csv file.
    Output are cleaned & prepared dataframes (smart, gas).

    :param file_path: path to 'dwelling_id.csv' file
    :return: Smart and gas Pandas DataFrames
    """
    df = pd.read_csv(file_path, delimiter=';', header=0)
    df = df.rename(index=str, columns={'Timestamp': 'datetime', 'gasTimestamp': 'datetime'})

    smart = df.iloc[:, :7]
    gas = df.iloc[:, 7:]
    
    del df
    
    try:
        smart['datetime'] = pd.to_datetime(smart['datetime'])
        gas['datetime'] = pd.to_datetime(gas['datetime'])
    except:
        print('datetime column contains non-datetime values')
        smart = clean_datetime(smart)
        gas = clean_datetime(gas)
        smart['datetime'] = pd.to_datetime(smart['datetime'])
        gas['datetime'] = pd.to_datetime(gas['datetime'])

    smart = smart.set_index(['datetime'])
    gas = gas.set_index(['datetime'])

    smart = smart.astype(dtype='float32')
    gas = gas.astype(dtype='float32')

    return smart, gas


@delayed
def clean_datetime(df):
    """
    TODO: Speed up the function
    Input should be a df with a column called 'datetime'.
    This function checks wether a row in the df.datetime column can be parsed to a Pandas datetime object,
    by trying pd.to_datetime() on it.
    If it fails it will replace that row with np.nan().
    Finally this function will return the df with the NaN rows dropped.
    It only drops the row if the datetime column contains a NaN.

    :param df: Pandas DataFrame containing a datetime column called 'datetime'.
    :return: Pandas DataFrame
    """
    for i in range(len(df)):
        try:
            pd.to_datetime(df.datetime[i])
        except ValueError:
            print('-----')
            print('ValueError at index = %s' % i)
            print(df.datetime[i])
            df.datetime = df.datetime.replace(df.datetime[i], np.nan)
    df = df.dropna(subset=['datetime'])
    return df


@delayed(nout=3)
def resample_dfs(smart, gas, weather):
    smart = smart.resample('10s').mean()
    gas = gas.resample('H').mean()
    weather = weather.resample('10min').mean()
    return smart, gas, weather


@delayed
def create_hour_df(smart, gas, weather, dwelling_id):
    gas['gasPower'] = gas['gasMeter'].diff()  # Calculate gasPower column
    gas['gasPower'][0] = gas['gasPower'][1]  # Replace 1st entry (NaN) with 2nd entry
    smart = smart.resample('H').mean()  # Down sample smart
    weather = weather.resample('H').mean()  # Down sample weather
    # Combine gas, smart, weather
    df_hour = pd.merge(smart, gas, left_index=True, right_index=True)
    df_hour = pd.merge(df_hour, weather, left_index=True, right_index=True)
    df_hour['dwelling'] = dwelling_id
    
    return df_hour


@delayed
def create_10s_df(smart, gas, weather, dwelling_id):
    gas = gas.resample('10s').ffill()  # Up sample gas to 10s
    # Calculate gasPower column, is this rhe right way? Or should we ffill it?
    # Currently this code makes it so there is one gasPower value per hour, we could ffill this also?
    gas['gasPower'] = gas['gasMeter'].diff()
    gas['gasPower'][0] = gas['gasPower'][1]  # Replace 1st entry (NaN) with 2nd entry
    weather = weather.resample('10s').ffill()  # forward fill because the raw data is the 10 minute mean
    # Combine gas, smart, weather
    df_10s = pd.merge(smart, gas, left_index=True, right_index=True)
    df_10s = pd.merge(df_10s, weather, left_index=True, right_index=True)
    df_10s['dwelling'] = dwelling_id
    return df_10s


@delayed
def plot_nans(df, dwelling_id, resample_to):
    """
    Create a heatmap of the NaNs in the input DataFrame.
    :param df: Pandas DataFrame
    :param df: String to resample to, for example '1T' or 'H'
    :param dwelling_id: String
    :return: Seaborn heatmap as a Figure
    """
    plt.clf()
    df = df.isnull()
    # Downsample to make all data visible
    df = df.resample(resample_to).sum()  # Downsample to make small NaNs visible
    df = df.apply(lambda x: x > 0, 1)  # Replace values >0 with 1

    # Reindex datetimes
    # https://stackoverflow.com/questions/41046630/set-time-formatting-on-a-datetime-index-when-plotting-pandas-series
    try:
        df.index = df.index.to_period('D')
    except:
        print('plot_nans could not set df.index.to_period')

    # Plot heatmap
    n = int(len(df)*0.1)  # Choose amount of yticklabels to show

    try:
        fig = sns.heatmap(df, cmap='Reds', square=False, vmin=0, cbar=False, yticklabels=n*2, cbar_kws={})
    except TypeError:
        print('plot_nans ValueError')
        fig = sns.heatmap(df, cmap='Reds', square=False, vmin=0, cbar=False, cbar_kws={})

    # Set cbar ticks manually
    #cbar = fig.collections[0].colorbar
    #cbar.set_ticks([0, 1])
    #cbar.set_ticklabels(['Not NaN', 'NaN'])

    # Correct layout
    fig.invert_yaxis()
    fig.tick_params(axis='x', rotation=90)
    fig.tick_params(axis='y', rotation=0)
    fig.set(xlabel='Column [-]', ylabel='Index [-]')
    plt.title('Dwelling ID: '+dwelling_id)

    fig = fig.get_figure()
    #fig.tight_layout()
    #fig.show()
    #print('Saving heatmap')
    #fig.savefig('F://datc//opschaler//nan_information//figures//' + dwelling_id + '.png', dpi=1200)
    #savefig crashes dask
    
    return fig


@delayed
def df_nan_checker(df, threshold_percentage):
    """
    TODO: Parellalize, as in one column per core/worker?
    Checks each column in the input dataframe for NaNs.
    Outputs the amount of NaNs behind each other, including the start and stop index, per column as a sublist.
    For example when the dataframe has three columns.
    Output is in the form of:
    [[column_one_info], [column_two_info], [column_three_info]]
    With the column_..._info being in the form of:
    [start_index, stop_index, amount_of_NaNs]

    :param df: Pandas DataFrame
    :param threshold_percentage: Filter output based on NaN streaks being larger than x % of the total length of the dataframe.
    :return: Pandas DataFrame
    """
    columns = df.columns
    df = df.isnull()
    output = []
    length = len(columns)
    
    
    @delayed
    def check_rows(df, column_name):
        column_info = []
        temp = []
        x = False

        for j, value in enumerate(df[column_name]):
            if x == False and value == True:
                temp.append(df.index[j])
                x = True
            elif x == True and value == True:
                temp.append(df.index[j])
            elif x == True and value == False:
                column_info.append(temp)
                temp = []
                x = False

        lengths = []

        for array in column_info:
            lengths.append([array[0], array[-1], len(array)])

        return lengths

    
    for i in range(length):
        lengths = check_rows(df, columns[i])
        output.append(lengths)
    
    @delayed
    def list_to_df(output):
        # Convert df_info to a readable dataframe instead of list

        """
        Row per column from the 'output' list
        Columns: start-index, stop-index, NaN streak
        """

        df_info = pd.DataFrame(columns=['Column name', 'Start index', 'Stop index', 'Amount of NaNs'])
        length = len(output)
        column_names = []
        starts = []
        stops = []
        amounts = []

        for column in range(length):
            #print('At iteration %s of %s' % (column, length))
            for i in range(len(output[column])):
                column_names.append(df.columns[column])
                starts.append(output[column][i][0])
                stops.append(output[column][i][1])
                amounts.append(output[column][i][2])

        print('Appending NaN info to df')
        # Convert list to pd series
        column_names = pd.Series(column_names)
        starts = pd.Series(starts)
        stops = pd.Series(stops)
        amounts = pd.Series(amounts)
        # Append pd series to a column
        df_info['Column name'] = column_names.values
        df_info['Start index'] = starts.values
        df_info['Stop index'] = stops.values
        df_info['Amount of NaNs'] = amounts.values

        percentage = (df_info['Amount of NaNs'] / len(df)) * 100
        df_info.drop(df_info[percentage < threshold_percentage].index, inplace=True)
        return df_info

    df_info = list_to_df(output)
    
    return df_info


def save_df_unprocessed(df, dwelling_id):
    """
    Save unprocessed dataframe.
    :param df: Pandas DataFrame
    :param dwelling_id: String
    :return: None
    """
    dir = 'F://datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//'
    df.to_csv(dir + dwelling_id + '.csv', sep='\t', index=True)
    print('Saved unprocessed df: %s' % dwelling_id)
    return


# Main loop

In [18]:
%%time

client.restart()

weather = read_weather_data()

file_paths, dwelling_ids = smartmeter_data()

file_paths = file_paths[:10]

dfs_hour = []
dfs_10s = []
dfs_nan_table_10s = []
dfs_nan_table_hour = []

for i, path in enumerate(file_paths):
    dwelling_id = dwelling_ids[i]
    
    smart, gas = clean_prepare_smart_gas(path, dwelling_id)
    
    # client.persist: Start computing these variables and keep them in memory
    smart = client.persist(smart)
    gas = client.persist(gas)

    smart, gas, weather = resample_dfs(smart, gas, weather)
    
    df_hour = create_hour_df(smart, gas, weather, dwelling_id)
    df_10s = create_10s_df(smart, gas, weather, dwelling_id)
    
    df_hour = client.persist(df_hour)
    df_10s = client.persist(df_10s)
    
    #Slow, plus low cpu usage...
    #fig = plot_nans(df_10s, dwelling_id+' 10s sample rate', '1T')
    df_nan_table_10s = df_nan_checker(df_10s, 0)
    df_nan_table_hour = df_nan_checker(df_hour, 0)
    
    dfs_hour.append(df_hour)
    dfs_10s.append(df_10s)
    dfs_nan_table_10s.append(df_nan_table_10s)
    dfs_nan_table_hour.append(df_nan_table_hour)

Detected 56 smartmeter_data files.
Wall time: 1.62 s


# Save dataframes
Some unprocessed ones take 230 seconds to save.
It is in parallel, but it is still slow.

In [19]:
"""
Little trick to force run this save function in parallel.
Force compute the df, then submit the save_df_unprocessed function to the scheduler. 
Loop over this, client will process save_df_unprocessed in the back end.
"""

%%time
for i in range (len(dfs_10s)):
    df = dfs_10s[i].compute()
    z = client.submit(save_df_unprocessed, df, (dwelling_ids[i]+'_10s'))

Saved unprocessed df: P01S01W0001_10s
Saved unprocessed df: P01S01W0000_10s
Wall time: 11.2 s
Saved unprocessed df: P01S01W0373_10s
Saved unprocessed df: P01S01W0998_10s
Saved unprocessed df: P01S01W1554_10s
Saved unprocessed df: P01S01W0378_10s
Saved unprocessed df: P01S01W1347_10s
Saved unprocessed df: P01S01W1341_10s
Saved unprocessed df: P01S01W2581_10s
Saved unprocessed df: P01S01W2743_10s


In [40]:
to_save

[<Future: status: finished, type: NoneType, key: save_df_unprocessed-24f11f9b71195a9d1da286f2dd9c7a4c>,
 <Future: status: finished, type: NoneType, key: save_df_unprocessed-2c46dbebc7c302ba6d516884342d610d>,
 <Future: status: finished, type: NoneType, key: save_df_unprocessed-07cefcbe09c08ebcd95e21a2540d6a5f>,
 <Future: status: finished, type: NoneType, key: save_df_unprocessed-52be38e92e4f0e90ccf93b44f7d6548e>,
 <Future: status: finished, type: NoneType, key: save_df_unprocessed-c533542b3d8bc5224589fcb570f435a8>,
 <Future: status: finished, type: NoneType, key: save_df_unprocessed-a8a648a09cffe872cc32e9c358791f2d>,
 <Future: status: finished, type: NoneType, key: save_df_unprocessed-9c3afea8a2a84859c00a405d3838ca5b>,
 <Future: status: finished, type: NoneType, key: save_df_unprocessed-1a783fb96fcd8e429a634b46c4afcd16>,
 <Future: status: finished, type: NoneType, key: save_df_unprocessed-bbfd24b94f3dc2dab972cebdb3053c68>,
 <Future: status: finished, type: NoneType, key: save_df_unproce

In [49]:
to_save[1].compute()

AttributeError: 'Future' object has no attribute 'compute'

# Compute and save

In [8]:
%%time

dfs_hour = dask.compute(dfs_hour)
dfs_10s = dask.compute(dfs_10s)
dfs_nan_table_10s = dask.compute(dfs_nan_table_10s)
dfs_nan_table_hour = dask.compute(dfs_nan_table_hour)

Wall time: 18 s


In [None]:
%%time

for i in range(len(dfs_hour[0])):
    save_df_unprocessed(dfs_10s[0][i], dwelling_ids[i]+'_10s')
    save_df_unprocessed(dfs_hour[0][i], dwelling_ids[i]+'_hour')
    dfs_nan_table_10s[0][0][i].to_csv('F://datc//opschaler//nan_information//'+dwelling_ids[i]+'_10s.csv', sep='\t')
    dfs_nan_table_hour[0][0][i].to_csv('F://datc//opschaler//nan_information//' + dwelling_ids[i] + '_hour.csv', sep='\t')
    print('Finished iteration %s out of %s.' % (i, len(dfs_hour[0])))

Saved unprocessed df: P01S01W0000_10s
Saved unprocessed df: P01S01W0000_hour
Finished iteration 0 out of 56.
Saved unprocessed df: P01S01W0001_10s
Saved unprocessed df: P01S01W0001_hour
Finished iteration 1 out of 56.
Saved unprocessed df: P01S01W0373_10s
Saved unprocessed df: P01S01W0373_hour
Finished iteration 2 out of 56.
Saved unprocessed df: P01S01W0378_10s
Saved unprocessed df: P01S01W0378_hour
Finished iteration 3 out of 56.
Saved unprocessed df: P01S01W0998_10s
Saved unprocessed df: P01S01W0998_hour
Finished iteration 4 out of 56.
Saved unprocessed df: P01S01W1341_10s
Saved unprocessed df: P01S01W1341_hour
Finished iteration 5 out of 56.
Saved unprocessed df: P01S01W1347_10s
Saved unprocessed df: P01S01W1347_hour
Finished iteration 6 out of 56.
Saved unprocessed df: P01S01W1554_10s
Saved unprocessed df: P01S01W1554_hour
Finished iteration 7 out of 56.
Saved unprocessed df: P01S01W2581_10s
Saved unprocessed df: P01S01W2581_hour
Finished iteration 8 out of 56.
Saved unprocessed d