In [23]:
import re
import glob
import csv
from helpers import config
from helpers.loading import load_daily_data ,file_exist
from helpers.algorithm import find_best_delay
import pandas as pd
import time

In [24]:

def get_all_dates(stock,signal):
    """Return a sorted list of all dates where trades/bbo (signal) files are available in the data.

    Parameters
    ----------
    stock :
        Not used by the lookup; kept so existing callers keep working.
    signal :
        Name of the sub-directory of ``./Data`` to scan (interpolated into the
        glob pattern ``./Data/{signal}/*/*``).

    Returns
    -------
    list of str
        Unique ``YYYY-MM-DD`` strings found in the matched paths, in ascending
        order. Empty when nothing matches.
    """
    date_pattern = re.compile(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")

    found_dates = set()
    for path in glob.glob(f"./Data/{signal}/*/*"):
        match = date_pattern.search(path)
        if match is None:
            # A path without a date used to crash the whole scan with an
            # unbound local; report it and move on instead.
            print(path)
            continue
        found_dates.add(match.group(0))

    # ISO-formatted dates sort chronologically when sorted as plain strings.
    return sorted(found_dates)

In [26]:
# Gather every available date for the configured signal, then report the count.
all_dates = get_all_dates(stock=config["stock"], signal=config["signal"])
print(f"{len(all_dates)} dates to process")

261 dates to process


In [35]:
# Compute, for each date not yet present in the results file, the best lag
# between every pair of markets and append the results to a CSV file.
# The cell is resumable: dates already found in the CSV are skipped, and at most
# `max_iterations` new dates are processed per run.
fieldnames = ['date', 'market1','market2',"lag"]

results_path = config["files"]["results"]["all_best_lags"] # file where to write the computed lags

# Must be evaluated before the file is opened in append mode (which creates it).
result_file_exists = file_exist(results_path)

if result_file_exists:
    processed_dates = set(pd.read_csv(results_path)["date"].unique())
else:
    processed_dates = set()

max_iterations = 10 # maximum number of new dates handled by one run of this cell
start_time = time.time()
date_count = 0 # number of dates processed

# The context manager guarantees the file is closed even if a computation
# fails or the kernel is interrupted halfway through the loop.
with open(results_path, 'a', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    if not result_file_exists:
        # if the file is new, we need to write headers
        writer.writeheader()

    for date_id,date in enumerate(all_dates):
        print(f"data:{date}, {date_id}:{len(all_dates)}, {100*date_id/len(all_dates):0.3f}%", end="\r")

        # we skip the current date if it has already been processed
        # (checked first so that we do not load data we would throw away)
        if date in processed_dates:
            continue

        try:
            daily_data = load_daily_data(date)
        except Exception:
            # in case all markets do not provide data for the given date, we skip the date.
            # `Exception` (not a bare except) so that interrupting the kernel still stops the loop.
            continue

        # Snapshot of the names yielded by iterating the daily data.
        market_names = list(daily_data)
        for i,n1 in enumerate(market_names):
            # Only pair n1 with the names that come before it: this avoids symmetric
            # (corr(a,b)=corr(b,a)) and meaningless (corr(a,a)=1) calculations.
            for n2 in market_names[:i]:
                best_delay, delays, correlations, los, his = find_best_delay(daily_data,n1,n2,step_size=1000)
                # write the computed result, plus the mirrored pair with the opposite lag
                writer.writerow({'date': date, 'market1': n1,'market2': n2,'lag': best_delay})
                writer.writerow({'date': date, 'market1': n2,'market2': n1,'lag': -best_delay})

        csvfile.flush() # flush every time we processed a date
        date_count+=1
        if date_count>=max_iterations:
            break

print(f"{date_count} dates processed in {time.time()-start_time:0.2f}s")

missing data : 2010-01-01 B00%
missing data : 2010-01-01 C
missing data : 2010-01-01 DF
missing data : 2010-01-01 II
missing data : 2010-01-01 MW
missing data : 2010-01-01 O
missing data : 2010-01-01 OQ
missing data : 2010-01-01 P
missing data : 2010-01-18 B215%
missing data : 2010-01-18 C
missing data : 2010-01-18 DF
missing data : 2010-01-18 II
missing data : 2010-01-18 MW
missing data : 2010-01-18 O
missing data : 2010-01-18 OQ
missing data : 2010-01-18 P
missing data : 2010-02-15 B.877%
missing data : 2010-02-15 C
missing data : 2010-02-15 DF
missing data : 2010-02-15 II
missing data : 2010-02-15 MW
missing data : 2010-02-15 O
missing data : 2010-02-15 OQ
missing data : 2010-02-15 P
10 dates processed in 264.95s59%
