# Imports

In [1]:
import numpy as np
import glob
import pandas as pd

from dask.distributed import Client, LocalCluster
from dask import delayed

# Initialize Dask

In [2]:
# Single local worker with processes=False; the client summary printed further
# down reports an inproc:// scheduler for this configuration.
cluster = LocalCluster(processes=False, n_workers=1)
# The worker model is already fixed by the cluster above, so the contradictory
# `processes=True` flag is no longer passed to the Client.
client = Client(cluster)

In [3]:
# Display the client summary (scheduler address, dashboard link, worker resources).
client

0,1
Client  Scheduler: inproc://145.52.252.19/143796/1  Dashboard: http://localhost:8787/status,Cluster  Workers: 1  Cores: 48  Memory: 67.18 GB


# Defining functions

In [4]:
def dwelling_data_paths(sample_rate, unprocessed=True,
                        base_path='//datc//opschaler//combined_gas_smart_weather_dfs//'):
    """
    Locate the combined smartmeter CSV files and derive a dwelling id from each file name.

    :param sample_rate: '10s' selects the ``*_10s.csv`` files; any other value
        selects the ``*_hour.csv`` files.
    :param unprocessed: when True (default) the ``unprocessed`` sub-folder is
        searched, otherwise the ``processed`` sub-folder.
    :param base_path: directory that contains the ``processed`` and
        ``unprocessed`` sub-folders. Defaults to the shared project location.
    :return: file_paths, dwelling_ids, two numpy arrays of equal length.
        ``file_paths`` is sorted and ``dwelling_ids[i]`` belongs to ``file_paths[i]``.
    """
    import os  # local import so the block does not depend on the notebook's import cell

    folder = 'unprocessed//' if unprocessed else 'processed//'
    subscript = '_10s' if sample_rate == '10s' else '_hour'

    complete_path = os.path.join(base_path, folder, '*' + subscript + '.csv')
    # glob returns matches in arbitrary order; sorting keeps the order of the
    # files (and of anything built from them) reproducible between runs.
    file_paths = np.array(sorted(glob.glob(complete_path)))

    print('complete_path: '+complete_path)
    print('Detected %s smartmeter_data files.' % len(file_paths))

    # The id is cut out by fixed offsets counted from the end of the path:
    # the characters directly in front of '_10s.csv' or '_hour.csv'.
    # NOTE(review): the 10s slice keeps 8 characters while the hour slice keeps
    # 11 - confirm this matches the actual file naming.
    if sample_rate == '10s':
        id_slice = slice(-16, -8)
    else:
        id_slice = slice(-20, -9)

    dwelling_ids = np.array([file_path[id_slice] for file_path in file_paths])

    return file_paths, dwelling_ids


def read_combined_df(path, dwelling_id):
    """
    Load one tab-separated combined file into a DataFrame.

    The 'datetime' column is parsed as dates and becomes the index, every
    remaining column is cast to float32, and a 'dwelling' column holding
    ``str(dwelling_id)`` is added.

    :param path: location of the tab-separated file, passed to ``pd.read_csv``.
    :param dwelling_id: identifier stored (as a string) in the 'dwelling' column.
    :return: the resulting DataFrame.
    """
    frame = (
        pd.read_csv(path, delimiter='\t', parse_dates=['datetime'])
        .set_index(['datetime'])
        .astype('float32')
    )
    # Added after the float cast so the id stays a string.
    frame['dwelling'] = str(dwelling_id)
    return frame

In [5]:
# Check which 10s files are detected; use_dask and not_dask below call
# dwelling_data_paths themselves rather than reading these two variables.
paths, dwelling_ids = dwelling_data_paths('10s')

complete_path: //datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//*_10s.csv
Detected 56 smartmeter_data files.


# Create the main functions
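Two variants are defined: one that loads the files through Dask and one that loads them sequentially with plain pandas.  
This makes the speed difference between the two approaches visible.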

In [6]:
def use_dask(sample_rate):
    """
    Read every dwelling file for ``sample_rate`` through Dask and combine them.

    Each file read is wrapped with ``delayed``, the whole batch is handed to the
    notebook-level ``client``, and the results are collected in file order
    before being concatenated.

    :param sample_rate: forwarded to ``dwelling_data_paths``.
    :return: one DataFrame containing the rows of all files.
    """
    paths, dwelling_ids = dwelling_data_paths(sample_rate)

    # Build one lazy read task per (file, dwelling id) pair.
    lazy_frames = [
        delayed(read_combined_df)(file_path, dwelling_id)
        for file_path, dwelling_id in zip(paths, dwelling_ids)
    ]

    pending = client.compute(lazy_frames)

    # .result() blocks until the corresponding read has finished.
    frames = [item.result() for item in pending]

    return pd.concat(frames)

In [7]:
def not_dask(sample_rate):
    """
    Read every dwelling file for ``sample_rate`` one after another and combine them.

    This is the plain-pandas counterpart of ``use_dask``, used as the baseline
    for the timing comparison.

    :param sample_rate: forwarded to ``dwelling_data_paths``.
    :return: one DataFrame containing the rows of all files.
    """
    paths, dwelling_ids = dwelling_data_paths(sample_rate)

    frames = [
        read_combined_df(file_path, dwelling_id)
        for file_path, dwelling_id in zip(paths, dwelling_ids)
    ]

    return pd.concat(frames)

# Result for the hour sample rate dataframes

# Recorded %timeit result without Dask (hour files): 608 ms
# NOTE(review): the name assigned inside %timeit is presumably not kept in the
# notebook namespace - confirm before relying on `a` or `b` later.
%timeit a = not_dask('hour')

# Recorded %timeit result with Dask (hour files): 628 ms
%timeit b = use_dask('hour')

# Result for the 10 second sample rate dataframes

# Recorded %timeit result without Dask (10s files): 2 min 35 s
# NOTE(review): the name assigned inside %timeit is presumably not kept in the
# notebook namespace - confirm before relying on `c` or `d` later.
%timeit c = not_dask('10s')

# Recorded %timeit result with Dask (10s files): 1 min 17 s
%timeit d = use_dask('10s')

# Save the actual result

In [None]:
def save_result(ten_seconds, hour,
                output_dir='//datc//opschaler//combined_gas_smart_weather_dfs//processed//'):
    """
    Write the two combined DataFrames to tab-separated CSV files.

    :param ten_seconds: DataFrame written to ``all_dwellings_combined_10s.csv``.
    :param hour: DataFrame written to ``all_dwellings_combined_hour.csv``.
    :param output_dir: directory that receives both files. Defaults to the
        shared ``processed`` folder, so existing two-argument calls are unchanged.
    :return: None.
    """
    import os  # local import so the block does not depend on the notebook's import cell

    ten_seconds.to_csv(os.path.join(output_dir, 'all_dwellings_combined_10s.csv'), sep='\t')
    hour.to_csv(os.path.join(output_dir, 'all_dwellings_combined_hour.csv'), sep='\t')

# Load both sample rates through Dask; these two frames are the arguments
# handed to save_result in the next cell.
ten_seconds = use_dask('10s')
hour = use_dask('hour')

print('Ready to save data')

complete_path: //datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//*_10s.csv
Detected 56 smartmeter_data files.
complete_path: //datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//*_hour.csv
Detected 56 smartmeter_data files.
Ready to save data


In [None]:
%timeit save_result(ten_seconds, hour)