In [23]:
import re
import glob
import csv
from helpers import config
from helpers.loading import load_daily_data ,file_exist
from helpers.algorithm import find_best_delay
import pandas as pd
import time

In [24]:

def get_all_dates(stock,signal):
    """return a sorted list of all dates were trades/bbo (signal) are available in the data"""
    
    def extract_date(s):
        try:
            date = re.search(r"[0-9]{4}-[0-9]{2}-[0-9]{2}",s).group(0)
        except :
            print(s)
        return date
    all_files = glob.glob(f"./Data/{signal}/*/*")
    all_dates = [extract_date(s) for s in all_files] 
    all_dates = list(set(all_dates))
    all_dates.sort()
    return all_dates

In [26]:
all_dates = get_all_dates(config["stock"],config["signal"])
print(f"{len(all_dates)} dates to process")

261 dates to process


In [35]:
fieldnames = ['date', 'market1','market2',"lag"]

results_path = config["files"]["results"]["all_best_lags"] # file where to write the computed lags

result_file_exists = file_exist(results_path) 
csvfile = open(results_path, 'a', newline='') 
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

if result_file_exists:
    processed_dates = set(pd.read_csv(results_path).date.unique())
else: 
    # if the file is new, we need to write headers
    writer.writeheader()
    processed_dates = set()


max_iterations = 10
start_time = time.time()
date_count = 0 # number of dates processed
for date_id,date in enumerate(all_dates):
    print(f"data:{date}, {date_id}:{len(all_dates)}, {100*date_id/len(all_dates):0.3f}%", end="\r")
    try:
        daily_data = load_daily_data(date)
    except:
        # in case all markets do not provide data for the given date, we skip the date
        continue
    
    # we skip the current date if it has already been processed
    if date in processed_dates:
        continue
        
    for i,n1 in enumerate(daily_data):
        for j,n2 in enumerate(daily_data):
            if i>j: # avoid symetric (corr(a,b)=corr(b,a)) and meaningless (corr(a,a)=1) calculations
                best_delay, delays, correlations, los, his = find_best_delay(daily_data,n1,n2,step_size=1000)
                # write the computed result
                writer.writerow({'date': date, 'market1': n1,'market2': n2,'lag': best_delay})
                writer.writerow({'date': date, 'market1': n2,'market2': n1,'lag': -best_delay})
    csvfile.flush() # flush every time we processed a date
    date_count+=1
    if date_count>=max_iterations:
        break
print()
print(f"{date_count} dates processed in {time.time()-start_time:0.2f}s")
csvfile.close()

missing data : 2010-01-01 B00%
missing data : 2010-01-01 C
missing data : 2010-01-01 DF
missing data : 2010-01-01 II
missing data : 2010-01-01 MW
missing data : 2010-01-01 O
missing data : 2010-01-01 OQ
missing data : 2010-01-01 P
missing data : 2010-01-18 B215%
missing data : 2010-01-18 C
missing data : 2010-01-18 DF
missing data : 2010-01-18 II
missing data : 2010-01-18 MW
missing data : 2010-01-18 O
missing data : 2010-01-18 OQ
missing data : 2010-01-18 P
missing data : 2010-02-15 B.877%
missing data : 2010-02-15 C
missing data : 2010-02-15 DF
missing data : 2010-02-15 II
missing data : 2010-02-15 MW
missing data : 2010-02-15 O
missing data : 2010-02-15 OQ
missing data : 2010-02-15 P
10 dates processed in 264.95s59%


In [38]:
s1,s2 = daily_data["B"],daily_data["C"]

In [60]:
def generate_delayed_data(s1, s2, delay, join_type="outer"):
    s1 = s1.copy()
    s2 = s2.copy()
    
    h = min(s1.index.max(),s2.index.max())
    l = max(s1.index.min(),s2.index.min())
    
    s1.index = s1.index + pd.Timedelta(milliseconds=delay)
    pair_data = s1.join(s2, how=join_type, lsuffix="_1",
                        rsuffix="_2").ffill().dropna()
    return pair_data[(pair_data.index>=l)*(pair_data.index<=l)]

In [40]:
generate_delayed_data(s1, s2,0)

Unnamed: 0_level_0,trade-price_1,trade-price_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-02-24 14:30:15.081000192+00:00,28.56,28.54
2010-02-24 14:30:15.985999872+00:00,28.56,28.55
2010-02-24 14:30:15.987000064+00:00,28.56,28.55
2010-02-24 14:30:16.272000256+00:00,28.56,28.55
2010-02-24 14:30:17.670000384+00:00,28.56,28.55
...,...,...
2010-02-24 20:59:58.011999744+00:00,28.64,28.65
2010-02-24 20:59:58.439000320+00:00,28.64,28.65
2010-02-24 20:59:58.450000128+00:00,28.64,28.65
2010-02-24 20:59:59.119000064+00:00,28.65,28.65


In [61]:
h = min(s1.index.max(),s2.index.max())
l = max(s1.index.min(),s2.index.min())
print(l,h)

2010-02-24 14:30:15.081000192+00:00 2010-02-24 20:58:32.362000384+00:00


In [63]:
(s1.index.min(),s2.index.min())

(Timestamp('2010-02-24 14:30:03.801999616+0000', tz='UTC'),
 Timestamp('2010-02-24 14:30:15.081000192+0000', tz='UTC'))

In [58]:
s2

Unnamed: 0_level_0,trade-price
date,Unnamed: 1_level_1
2010-02-24 14:30:15.081000192+00:00,28.54
2010-02-24 14:30:15.985999872+00:00,28.55
2010-02-24 14:30:15.987000064+00:00,28.55
2010-02-24 14:30:18.273000192+00:00,28.56
2010-02-24 14:30:34.225000960+00:00,28.57
...,...
2010-02-24 20:56:19.274999552+00:00,28.61
2010-02-24 20:57:16.823000320+00:00,28.62
2010-02-24 20:57:47.835999744+00:00,28.63
2010-02-24 20:58:01.381999616+00:00,28.63


In [62]:
s1

Unnamed: 0_level_0,trade-price
date,Unnamed: 1_level_1
2010-02-24 14:30:03.801999616+00:00,28.52
2010-02-24 14:30:03.812000+00:00,28.50
2010-02-24 14:30:06.735999744+00:00,28.55
2010-02-24 14:30:06.775999744+00:00,28.55
2010-02-24 14:30:06.797000192+00:00,28.55
...,...
2010-02-24 20:59:58.011999744+00:00,28.64
2010-02-24 20:59:58.439000320+00:00,28.64
2010-02-24 20:59:58.450000128+00:00,28.64
2010-02-24 20:59:59.119000064+00:00,28.65
