In [None]:
import re
import glob
import csv
from helpers import config
from helpers.loading import load_daily_data, file_exist, get_all_dates
from helpers.delay import generate_delayed_data
import pandas as pd
import time
%load_ext autoreload
%autoreload 2


In [None]:
# Collect the dates returned by the project helper for the configured stock
# and report how many there are before the long-running cell starts.
all_dates = get_all_dates(config["stock"])
print("{} dates to process".format(len(all_dates)))


In [None]:
# Compute, for every date and every pair of markets, the best lag between the
# two markets and append the results to a CSV file. The cell is resumable:
# dates already present in the results file are skipped on later runs.
fieldnames = ['date', 'market1', 'market2', "lag"]
preprocessing_steps = ['numeric']

# file where to write the computed lags
results_path = config["files"]["results"]["all_best_lags"].format(
    "_".join(preprocessing_steps))

result_file_exists = file_exist(results_path)

max_iterations = 5000  # upper bound on the number of dates handled per run
start_time = time.time()
date_count = 0  # number of dates processed

# NOTE(review): find_best_delay is not imported in the visible part of this
# file; confirm it is brought into scope elsewhere before running this cell.

# The context manager guarantees the results file is flushed and closed even
# when a date fails half-way, so rows already computed are never lost and the
# handle is not leaked in the notebook kernel.
with open(results_path, 'a', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    if result_file_exists:
        # Read the dates back as text: the csv writer stores str(date), so
        # comparing on the string form avoids a silent mismatch when pandas
        # would otherwise infer a numeric dtype for the column.
        processed_dates = set(
            pd.read_csv(results_path, usecols=['date'],
                        dtype={'date': str})['date'].unique())
    else:
        # if the file is new, we need to write headers
        writer.writeheader()
        # flush immediately so an interrupted first run does not leave an
        # existing-but-empty file that cannot be parsed on the next run
        csvfile.flush()
        processed_dates = set()

    total_dates = len(all_dates)
    for date_id, date in enumerate(all_dates):
        print(
            f"date:{date}, {date_id}:{total_dates}, {100*date_id/total_dates:0.3f}%", end="\r")

        # we skip the current date if it has already been processed; this is
        # checked before loading so that resumed runs do not reload the data
        # of every finished date
        if str(date) in processed_dates:
            continue

        daily_data = load_daily_data(
            date, preprocessing_steps=preprocessing_steps)
        if not daily_data:
            # in case all markets do not provide data for the given date, we skip the date
            continue

        markets = list(daily_data)
        for i, n1 in enumerate(markets):
            # only pair n1 with the markets that come before it: this avoids
            # symmetric (corr(a,b)=corr(b,a)) and meaningless (corr(a,a)=1)
            # calculations
            for n2 in markets[:i]:
                best_delay, delays, correlations, los, his = find_best_delay(
                    daily_data, n1, n2, step_size=1000)
                # write the computed result, plus the mirrored pair whose lag
                # is the opposite of the computed one
                writer.writerow({'date': date, 'market1': n1,
                                 'market2': n2, 'lag': best_delay})
                writer.writerow({'date': date, 'market1': n2,
                                 'market2': n1, 'lag': -best_delay})
        csvfile.flush()  # flush every time we processed a date
        date_count += 1
        if date_count >= max_iterations:
            break
print()
print(f"{date_count} dates processed in {time.time()-start_time:0.2f}s")
