In [9]:
# Imports
import numpy as np
import glob
import pandas as pd

from dask.distributed import Client, LocalCluster
from dask import delayed

In [10]:
# Initialize Dask

In [11]:
# Start a local cluster with a single worker that runs inside this process
# (processes=False) instead of spawning separate worker processes.
cluster = LocalCluster(processes=False, n_workers=1)
# Connect to the cluster created above. `processes` is a cluster-creation
# option, so it is not passed here: it contradicted the setting above and
# extra keywords are not valid when connecting to an existing cluster.
client = Client(cluster)

In [12]:
# Show the client summary (scheduler address, dashboard link, workers) as the cell output.
client

0,1
Client  Scheduler: inproc://145.52.252.19/117193/1  Dashboard: http://localhost:8787/status,Cluster  Workers: 1  Cores: 48  Memory: 67.18 GB


# Defining functions

In [13]:
def smartmeter_data(sample_rate, unprocessed=True):
    """
    Locate the combined smartmeter CSV files and derive a dwelling id per file.

    :param sample_rate: '10s' selects the ``*_10s.csv`` files; any other value
        selects the ``*_hour.csv`` files.
    :param unprocessed: when truthy the ``unprocessed`` folder is searched,
        otherwise the ``processed`` folder.
    :return: file_paths, dwelling_ids, both as numpy arrays (same order as
        returned by glob).
    """
    is_10s = sample_rate == '10s'

    root = '//datc//opschaler//combined_gas_smart_weather_dfs//'
    subdir = 'unprocessed//' if unprocessed else 'processed//'
    suffix = '_10s' if is_10s else '_hour'

    complete_path = root + subdir + '*' + suffix + ".csv"
    file_paths = np.array(glob.glob(complete_path))

    print('complete_path: '+complete_path)
    print('Detected %s smartmeter_data files.' % len(file_paths))

    # The id is cut from fixed positions counted from the end of the path:
    # the 8 characters before '_10s.csv', or the 11 characters before '_hour.csv'.
    # NOTE(review): the two id lengths differ (8 vs 11) - confirm this is intended.
    id_slice = slice(-16, -8) if is_10s else slice(-20, -9)
    dwelling_ids = np.array([file_path[id_slice] for file_path in file_paths])

    return file_paths, dwelling_ids


def read_combined_df(path, dwelling_id):
    """
    Read one tab-separated combined smartmeter file into a DataFrame.

    :param path: path of the CSV file; it must contain a 'datetime' column.
    :param dwelling_id: identifier stored (as a string) in the 'dwelling' column.
    :return: DataFrame indexed by the parsed 'datetime' column, with every data
        column cast to float32 and a trailing 'dwelling' column.
    """
    measurements = pd.read_csv(path, sep='\t', parse_dates=['datetime'])
    # Cast before adding the id column so the string id is not part of the float cast.
    measurements = measurements.set_index(['datetime']).astype('float32')
    return measurements.assign(dwelling=str(dwelling_id))

In [14]:
# Collect the 10-second files from the unprocessed folder together with their dwelling ids.
paths, dwelling_ids = smartmeter_data(sample_rate='10s', unprocessed=True)

complete_path: //datc//opschaler//combined_gas_smart_weather_dfs//unprocessed//*_10s.csv
Detected 56 smartmeter_data files.


In [15]:
def use_dask(paths, dwelling_ids):
    """
    Load every combined smartmeter file through Dask and concatenate the results.

    Relies on the notebook-level ``client`` (a dask.distributed Client) and on
    ``read_combined_df`` being defined.

    :param paths: sequence of CSV file paths.
    :param dwelling_ids: sequence of dwelling ids, aligned by position with ``paths``.
    :return: a single pandas DataFrame holding the rows of all files.
    :raises ValueError: from ``pd.concat`` when ``paths`` is empty.
    """
    # Build one lazy read task per file; nothing is read yet.
    delayed_frames = [
        delayed(read_combined_df)(path, dwelling_ids[i])
        for i, path in enumerate(paths)
    ]

    # Submit all tasks to the cluster; this returns one future per task.
    futures = client.compute(delayed_frames)

    # Block until every file has been read and collect the DataFrames in order.
    # (The original appended to the future itself and then concatenated an
    # undefined name, so it could never run to completion.)
    frames = client.gather(futures)

    final_df = pd.concat(frames)

    return final_df

In [18]:
def not_dask(paths, dwelling_ids):
    """
    Load every combined smartmeter file sequentially and concatenate the results.

    :param paths: sequence of CSV file paths.
    :param dwelling_ids: sequence of dwelling ids, aligned by position with ``paths``.
    :return: a single pandas DataFrame holding the rows of all files.
    """
    # Read the files one after another, pairing each path with the id at the same position.
    frames = [
        read_combined_df(file_path, dwelling_ids[position])
        for position, file_path in enumerate(paths)
    ]
    return pd.concat(frames)

In [19]:
# Time the sequential (non-Dask) load of all files; the combined frame is kept in `a`.
%time a = not_dask(paths, dwelling_ids)

CPU times: user 2min 31s, sys: 23.3 s, total: 2min 54s
Wall time: 2min 38s


In [None]:
# Summarise the combined frame loaded by not_dask above. It is bound to `a`;
# the name `final` is not defined anywhere in this notebook.
a.info()