In [None]:
import re
import glob
import csv
from helpers import config
from helpers.loading import *
from helpers.algorithm import find_best_delay 
from helpers.preprocessing import *
from helpers.dask import *
import pandas as pd
import dask.dataframe as dd
import time
import dask 
%load_ext autoreload
%autoreload 2


In [None]:
# Notebook-wide inputs: the list of dates to process and the configured signal name.
all_dates = get_all_dates()
signal = config["signal"]

# Quick sanity output so the reader can see the scope of the run.
print(f"{len(all_dates)} dates to process")
print(f"working on signal : {signal}")

In [None]:
# Column order of the per-partition result CSV files.
fieldnames = ['date','market',"period"]

@dask.delayed
def compute_liquidity(start_date_idx,end_date_idx,verbose=0,max_iterations=5000):
    """Compute the median sampling period per (date, market) for one slice of dates.

    For every date in ``all_dates[start_date_idx:end_date_idx]`` the daily data
    is loaded and, for each market, the median difference between consecutive
    values of the ``date`` column (in seconds) is appended to a CSV file.
    The computation is resumable: dates already present in the result file
    are skipped without loading their data.

    Parameters
    ----------
    start_date_idx, end_date_idx : int
        Slice bounds into ``all_dates``; also used to build the result file name.
    verbose : int, default 0
        When > 0, print a one-line progress indicator for each date.
    max_iterations : int, default 5000
        Maximum number of dates newly processed in one call.

    Returns
    -------
    The path of the CSV file the results were appended to.
    """
    # file where to write the computed periods
    results_path = config["files"]["results"][signal]["dask_calculation"]["liquidity"].format(f"{start_date_idx}_{end_date_idx}")

    processed_dates = set()
    needs_header = True
    if file_exist(results_path):
        try:
            # Read dates back as strings: the csv module writes str(date), so
            # comparing on the string form is independent of dtype inference.
            processed_dates = set(pd.read_csv(results_path, dtype={"date": str}).date.unique())
            needs_header = False
        except pd.errors.EmptyDataError:
            # The file exists but is empty (e.g. a previous run stopped before
            # anything was flushed): treat it as new and write the headers.
            pass

    dates_to_process = all_dates[start_date_idx:end_date_idx]
    start_time = time.time()
    date_count = 0 # number of dates processed

    # The context manager guarantees the file is closed even if loading or
    # computing fails part-way through.
    with open(results_path, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()

        for date_id,date in enumerate(dates_to_process):
            if verbose >0:
                print(f"date:{date}, {date_id}:{len(dates_to_process)}, {100*date_id/len(dates_to_process):0.3f}%", end="\r")

            # skip already-processed dates *before* paying for the data load
            if str(date) in processed_dates:
                continue

            daily_data = load_daily_data(date,preprocessing_steps=["numeric"])
            if not daily_data:
                # in case all markets do not provide data for the given date, we skip the date
                continue

            for market in daily_data:
                period = daily_data[market].reset_index().date.diff(1).median().total_seconds()
                writer.writerow({'date': date, 'market': market,"period" : period})

            csvfile.flush() # flush every time we processed a date
            date_count+=1
            if date_count>=max_iterations:
                break

    print()
    print(f"{date_count} dates processed in {time.time()-start_time:0.2f}s")
    return results_path

In [None]:
# Partitioning of the date list across dask tasks.
N = len(all_dates)  # total number of dates available
k = 5 # target number of partitions (one extra, shorter partition appears when N is not a multiple of k)
t = N//k # number of dates to process per worker (floor division; 0 when N < k)
# Run the delayed tasks with dask's "processes" scheduler.
dask.config.set(scheduler="processes")

In [None]:
def compute_liquidity_dask():
    """Split the dates into consecutive chunks and schedule one liquidity task per chunk.

    Chunks are ``[start, start + step)`` for ``start`` in ``range(0, N, step)``,
    where ``step`` is the module-level ``t`` (at least 1). The resulting
    delayed tasks are handed to ``dask_compututation`` together with the
    configured liquidity results path.
    """
    if N == 0:
        # Nothing to schedule; also avoids a zero step in range() below.
        print("no dates to process")
        return

    # t is N // k and therefore 0 when there are fewer dates than partitions;
    # range() rejects a zero step, so fall back to one date per task.
    step = max(t, 1)

    promises = [
        compute_liquidity(start_date_idx, start_date_idx + step)
        for start_date_idx in range(0, N, step)
    ]

    dask_compututation(promises,config["files"]["results"][signal]["liquidity"])

In [None]:
# Launch the partitioned liquidity computation defined above.
compute_liquidity_dask()