In [1]:
import re
import glob
import csv
from helpers import config
from helpers.loading import load_daily_data ,file_exist,get_all_dates
from helpers.algorithm import find_best_delay
import pandas as pd
import time
%load_ext autoreload
%autoreload 2

In [2]:
# Enumerate every date available for the stock selected in the configuration.
target_stock = config["stock"]
all_dates = get_all_dates(target_stock)

# Report how much work the processing loop has ahead of it.
print(f"{len(all_dates)} dates to process")

3034 dates to process


In [4]:
# Compute, for every date and every pair of markets, the lag returned by
# find_best_delay and append it to a CSV file.
#
# The cell is resumable: dates already present in the results file are skipped,
# and at most `max_iterations` new dates are processed per execution, so the
# cell can simply be re-run until all dates are covered.
fieldnames = ['date', 'market1','market2',"lag"]

results_path = config["files"]["results"]["all_best_lags"] # file where to write the computed lags

result_file_exists = file_exist(results_path)

max_iterations = 50 # maximum number of new dates processed by one execution of this cell

# The context manager guarantees that buffered rows are flushed and the file is
# closed even when the loop raises or the kernel is interrupted.
with open(results_path, 'a', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # In append mode the stream is positioned at the end of the file, so a
    # position of 0 means the file is new *or* exists but is empty; in both
    # cases the header still has to be written and there is nothing to resume.
    if result_file_exists and csvfile.tell() > 0:
        # Dates are read back as strings because that is how csv.DictWriter
        # stored them (non-string values are written with str()); this keeps
        # the membership test below consistent whatever the type of `date` is.
        processed_dates = set(
            pd.read_csv(results_path, usecols=['date'], dtype=str)['date'].unique()
        )
    else:
        # if the file is new (or empty), we need to write headers
        writer.writeheader()
        processed_dates = set()

    start_time = time.time()
    date_count = 0 # number of dates processed
    for date_id,date in enumerate(all_dates):
        print(f"date:{date}, {date_id}:{len(all_dates)}, {100*date_id/len(all_dates):0.3f}%", end="\r")

        # we skip the current date if it has already been processed;
        # checked before loading so skipped dates cost no I/O
        if str(date) in processed_dates:
            continue

        daily_data = load_daily_data(date,)
        if not daily_data:
            # in case all markets do not provide data for the given date, we skip the date
            continue

        # Collect every row of the date first and write them in one go, so an
        # interruption in the middle of a date is less likely to leave a
        # partially written date that a later run would consider complete.
        markets = list(daily_data)
        rows = []
        for i,n1 in enumerate(markets):
            # only pair n1 with the markets that precede it: avoids symmetric
            # (corr(a,b)=corr(b,a)) and meaningless (corr(a,a)=1) calculations
            for n2 in markets[:i]:
                # find_best_delay returns several values; only the first one
                # (the best delay) is stored
                best_delay = find_best_delay(daily_data,n1,n2,step_size=1000)[0]
                rows.append({'date': date, 'market1': n1,'market2': n2,'lag': best_delay})
                # the lag of the mirrored pair is the opposite of the computed one
                rows.append({'date': date, 'market1': n2,'market2': n1,'lag': -best_delay})
        writer.writerows(rows)
        csvfile.flush() # flush every time we processed a date
        date_count+=1
        if date_count>=max_iterations:
            break
print()
print(f"{date_count} dates processed in {time.time()-start_time:0.2f}s")

date:2005-10-10, 61:3034, 2.011%
50 dates processed in 109.84s


In [5]:
# Inspect the data of the last date loaded by the processing loop above
# (`daily_data` is the loop's leftover variable, so this cell only works after
# that loop has run). The recorded output shows a dict keyed by market code
# ('GB', 'US', ...) whose values are DataFrames with a `price` column indexed
# by timestamp.
daily_data

{'GB':                                       price
 date                                       
 2005-10-10 07:00:39.219000320+00:00  1800.0
 2005-10-10 07:00:41.419999872+00:00  1800.0
 2005-10-10 07:01:04.438000256+00:00  1800.0
 2005-10-10 07:02:05.926000256+00:00  1799.0
 2005-10-10 07:02:06.269999872+00:00  1799.0
 ...                                     ...
 2005-10-10 15:28:27.761999744+00:00  1782.0
 2005-10-10 15:28:27.952000+00:00     1782.0
 2005-10-10 15:29:06.245000448+00:00  1783.0
 2005-10-10 15:30:40.233000320+00:00  1782.2
 2005-10-10 15:33:31.381000064+00:00  1784.0
 
 [763 rows x 1 columns],
 'US':                                      price
 date                                      
 2005-10-10 13:30:08.840000+00:00     63.04
 2005-10-10 13:30:18.760000+00:00     63.03
 2005-10-10 13:31:04.359999744+00:00  63.03
 2005-10-10 13:31:07.230999808+00:00  63.04
 2005-10-10 13:32:08.968999680+00:00  63.02
 ...                                    ...
 2005-10-10 19:59:43.359