To connect to the Dask dashboard, run the following in the Windows command prompt (cmd) on your own laptop:  
`ssh -NL 3000:localhost:8787 16011015@datascience.hhs.nl`  

This forwards port 8787 on the datascience.hhs.nl server (localhost:8787 there) to localhost:3000 on your Windows machine.  
Then open `http://localhost:3000/status` in your browser to view the dashboard.  

# Imports

In [1]:
import numpy as np
import glob
import pandas as pd

from dask.distributed import Client, LocalCluster
from dask import delayed
from tqdm import tqdm

# Initialize Dask

In [2]:
# Start an in-process cluster: with processes=False the scheduler and the single
# worker run as threads inside this kernel (the client repr shows an inproc:// scheduler).
cluster = LocalCluster(processes=False, n_workers=1)
# Connect to the cluster created above. Cluster-creation keywords such as
# `processes` are only used when Client has to start its own LocalCluster, so the
# previous `processes=True` here had no effect and contradicted the line above.
client = Client(cluster)

In [3]:
client

0,1
Client  Scheduler: inproc://145.52.252.19/20464/1  Dashboard: http://localhost:8787/status,Cluster  Workers: 1  Cores: 48  Memory: 67.18 GB


# Defining functions

In [11]:
def dwelling_data_paths(sample_rate, unprocessed=True):
    """
    Find the combined smartmeter CSV files for one sample rate.

    :param sample_rate: '10s' selects the ``*_10s.csv`` files; any other value
        selects the ``*_hour.csv`` files.
    :param unprocessed: if True (default) search the ``unprocessed`` folder,
        otherwise the ``processed`` folder.
    :return: file_paths, dwelling_ids -- two numpy arrays of equal length.
        ``file_paths`` is sorted so the order is reproducible between runs, and
        ``dwelling_ids[i]`` is the file name of ``file_paths[i]`` without the
        ``_<sample rate>.csv`` suffix.

    NOTE: ``save_result`` writes ``all_dwellings_combined_*.csv`` into the
    ``processed`` folder; those files also match the pattern used here when
    ``unprocessed=False``.
    """
    import os

    path = '//datc//opschaler//combined_gas_smart_weather_dfs//'

    # The folder depends only on `unprocessed`; the file suffix only on `sample_rate`.
    folder = 'unprocessed//' if unprocessed else 'processed//'
    subscript = '_10s' if sample_rate == '10s' else '_hour'
    suffix = subscript + ".csv"

    complete_path = path+folder+'*'+suffix
    # glob returns matches in arbitrary order, so sort them for a stable result.
    file_paths = np.array(sorted(glob.glob(complete_path)))

    print('complete_path: '+complete_path)
    print('Detected %s smartmeter_data files.' % len(file_paths))

    # Derive each id from the file name itself rather than from fixed character
    # offsets, so hour and 10s files of the same dwelling yield the same id and
    # ids of any length are supported.
    dwelling_ids = np.array(
        [os.path.basename(file_path)[:-len(suffix)] for file_path in file_paths]
    )

    return file_paths, dwelling_ids


def read_combined_df(path, dwelling_id):
    """
    Load one combined dwelling file into a dataframe.

    The file is read as tab-separated text, the 'datetime' column is parsed as
    dates and used as the index, float64 columns are downcast through
    ``reduce_memory`` and a 'dwelling' column holding ``str(dwelling_id)`` is added.

    :param path: location of the tab-separated file.
    :param dwelling_id: identifier stored (as a string) in the 'dwelling' column.
    :return: the loaded dataframe.
    """
    frame = pd.read_csv(path, delimiter='\t', parse_dates=['datetime']).set_index(['datetime'])
    frame = reduce_memory(frame)
    # Tag every row with the dwelling it came from so frames can be concatenated later.
    frame['dwelling'] = str(dwelling_id)
    return frame

def resample_df(df, sample_rate):
    """
    Resample a (un)processed dataframe to the specified sample rate.

    The 'gasMeter' and 'eMeter' columns (when present) take the last known value
    inside each bin; every other column is averaged over the bin.

    :param df: dataframe with a datetime index. It is not modified.
    :param sample_rate: pandas offset string, for example '1H', '1D', '60s'.
    :return: a new dataframe indexed by the resampled bins.
    """
    # Columns for which the last reading of the bin is wanted instead of the mean.
    to_last = [column for column in ('gasMeter', 'eMeter') if column in df.columns]

    # Average everything first; this creates a new frame, so the caller's `df`
    # stays untouched.
    resampled = df.resample(sample_rate).mean()

    # Overwrite the meter columns with the last value of each bin. Both frames are
    # indexed by the same bins, so the assignment lines up row by row. (Writing the
    # resampled values back onto the original index, as was done before, aligned on
    # the raw timestamps and produced NaN for data not lying exactly on a bin edge.)
    if to_last:
        resampled[to_last] = df[to_last].resample(sample_rate).last()

    return resampled

def reduce_memory(df):
    """
    Reduce the memory footprint of the input dataframe.

    Every float64 column is converted to float32; columns of any other dtype are
    left as they are.

    :param df: dataframe to shrink. It is modified in place.
    :return: the same dataframe object, for convenient chaining.
    """
    # Collect the columns first so the frame is not changed while inspecting it.
    float64_columns = [column for column in df.columns if df[column].dtype == 'float64']

    for column in float64_columns:
        df[column] = df[column].astype('float32')

    return df

In [5]:
# Locate the hourly files in the unprocessed folder (the default); the call prints
# the glob pattern it used and the number of files it found.
paths, dwelling_ids = dwelling_data_paths('hour')

complete_path: //datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//*_hour.csv
Detected 52 smartmeter_data files.


# Create the main functions
One that uses Dask and one that does not.  
This way the speed difference can be seen.

In [6]:
def use_dask(sample_rate):
    """
    Read all dwelling files for `sample_rate` via Dask and combine them.

    One delayed ``read_combined_df`` task is created per file, the tasks are
    submitted to the notebook-level ``client`` and their results are concatenated.

    :param sample_rate: passed on to ``dwelling_data_paths`` to select the files.
    :return: a single dataframe containing the rows of every file.
    """
    paths, dwelling_ids = dwelling_data_paths(sample_rate)

    # Nothing is read yet: each entry is only a description of the work to do.
    lazy_frames = [
        delayed(read_combined_df)(file_path, dwelling_id)
        for file_path, dwelling_id in zip(paths, dwelling_ids)
    ]

    # Hand the tasks to the cluster, then wait for every result.
    futures = client.compute(lazy_frames)
    frames = [future.result() for future in futures]

    return pd.concat(frames)

In [7]:
def not_dask(sample_rate):
    """
    Read all dwelling files for `sample_rate` one after another and combine them.

    This is the plain pandas baseline used to compare against ``use_dask``.

    :param sample_rate: passed on to ``dwelling_data_paths`` to select the files.
    :return: a single dataframe containing the rows of every file.
    """
    paths, dwelling_ids = dwelling_data_paths(sample_rate)

    # Each file is read immediately, in order, inside this process.
    frames = [
        read_combined_df(file_path, dwelling_id)
        for file_path, dwelling_id in zip(paths, dwelling_ids)
    ]

    return pd.concat(frames)

# Result for the hour sample rate dataframes

In [8]:
#timeit result: 608 ms
%time a = not_dask('hour')

100%|██████████| 24/24 [00:00<00:00, 4014.81it/s]
100%|██████████| 24/24 [00:00<00:00, 3049.94it/s]
100%|██████████| 24/24 [00:00<00:00, 4060.48it/s]
100%|██████████| 24/24 [00:00<00:00, 4094.83it/s]
100%|██████████| 24/24 [00:00<00:00, 4506.98it/s]
100%|██████████| 24/24 [00:00<00:00, 3968.90it/s]
100%|██████████| 24/24 [00:00<00:00, 4279.54it/s]
100%|██████████| 24/24 [00:00<00:00, 3933.54it/s]
100%|██████████| 24/24 [00:00<00:00, 3621.37it/s]

complete_path: //datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//*_hour.csv
Detected 52 smartmeter_data files.



100%|██████████| 24/24 [00:00<00:00, 4524.40it/s]
100%|██████████| 24/24 [00:00<00:00, 4521.35it/s]
100%|██████████| 24/24 [00:00<00:00, 3108.23it/s]
100%|██████████| 24/24 [00:00<00:00, 3939.70it/s]
100%|██████████| 24/24 [00:00<00:00, 3786.90it/s]
100%|██████████| 24/24 [00:00<00:00, 3377.17it/s]
100%|██████████| 24/24 [00:00<00:00, 2971.26it/s]
100%|██████████| 24/24 [00:00<00:00, 3141.41it/s]
100%|██████████| 24/24 [00:00<00:00, 3675.86it/s]
100%|██████████| 24/24 [00:00<00:00, 3647.35it/s]
100%|██████████| 24/24 [00:00<00:00, 3695.29it/s]
100%|██████████| 24/24 [00:00<00:00, 2889.39it/s]
100%|██████████| 24/24 [00:00<00:00, 3798.18it/s]
100%|██████████| 24/24 [00:00<00:00, 3048.19it/s]
100%|██████████| 24/24 [00:00<00:00, 4637.15it/s]
100%|██████████| 24/24 [00:00<00:00, 3567.98it/s]
100%|██████████| 24/24 [00:00<00:00, 2775.08it/s]
100%|██████████| 24/24 [00:00<00:00, 3978.63it/s]
100%|██████████| 24/24 [00:00<00:00, 3470.07it/s]
100%|██████████| 24/24 [00:00<00:00, 4318.27it/s]

CPU times: user 1.08 s, sys: 120 ms, total: 1.2 s
Wall time: 1.08 s


#timeit result: 628 ms
%time b = use_dask('hour')

# Result for the 10 second sample rate dataframes

#timeit result: 2 min 35 s
%timeit c = not_dask('10s')

#timeit result: 1 min 17s
%timeit d = use_dask('10s')

# Compute and save the result

In [13]:
def save_result(ten_seconds, hour,
                output_dir='//datc//opschaler//combined_gas_smart_weather_dfs//processed//'):
    """
    Write the two combined dataframes to tab-separated CSV files.

    :param ten_seconds: dataframe saved as ``all_dwellings_combined_10s.csv``.
    :param hour: dataframe saved as ``all_dwellings_combined_hour.csv``.
    :param output_dir: folder the files are written to. Defaults to the shared
        ``processed`` folder, which gives the same file locations as before this
        parameter existed.
    """
    import os

    ten_seconds.to_csv(os.path.join(output_dir, 'all_dwellings_combined_10s.csv'), sep='\t')
    hour.to_csv(os.path.join(output_dir, 'all_dwellings_combined_hour.csv'), sep='\t')

# Load both sample rates through Dask before saving; each call prints the glob
# pattern it used and the number of files it found.
ten_seconds = use_dask('10s')
hour = use_dask('hour')

# Saving is done in a separate cell so it can be timed on its own.
print('Ready to save data')

complete_path: //datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//*_10s.csv
Detected 51 smartmeter_data files.




complete_path: //datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//*_hour.csv
Detected 52 smartmeter_data files.
Ready to save data


In [None]:
%time save_result(ten_seconds, hour)