In [1]:
from helpers import config
import plotly.io as pio
import pandas as pd
import dask
dask.config.set(scheduler="processes")
import matplotlib.pyplot as plt
import glob
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pio.renderers.default = 'notebook_connected'
%load_ext autoreload
%autoreload 2

# Loading the data

We work on one day of data at a time.

In [12]:
from helpers import config
from helpers.loading import Loader
# Day under study, taken from the project configuration.
date = config['test_date']
# Build the loader for the US sample first, then pull that single day from it.
us_loader = Loader(dataset='US_sample', preprocessing_steps=['numeric'])
us_daily_data = us_loader.load_daily_data(date)


missing data : market A for 2010-01-07 
missing data : market B for 2010-01-07 
missing data : market C for 2010-01-07 
missing data : market DF for 2010-01-07 
missing data : market II for 2010-01-07 
missing data : market MW for 2010-01-07 
missing data : market O for 2010-01-07 
missing data : market OQ for 2010-01-07 
missing data : market P for 2010-01-07 
missing data : market PH for 2010-01-07 
missing data : market Z for 2010-01-07 


In [13]:
# Per-market data for `date` from the transatlantic dataset; the functions
# defined in later cells read this `daily_data` global.
transatlantic_loader = Loader(dataset='transatlantic')
daily_data = transatlantic_loader.load_daily_data(date)


# Data Exploration

## Number of data points (transactions) per exchange

In [14]:
from helpers.plots import save_plot
def nb_transaction_per_exchange():
    """Draw a bar chart of how many rows each market has in ``daily_data``.

    Reads the notebook-level ``daily_data`` mapping (market -> data), sorts the
    markets by ascending row count, saves the figure through ``save_plot``
    under the name ``nb_transaction_per_exchange`` and then displays it.
    """
    counts = pd.DataFrame({
        'markets': list(daily_data),
        'size': [len(frame) for frame in daily_data.values()],
    })
    ordered_counts = counts.sort_values('size')
    chart_title = f"Number of transaction per exchange ({config['test_date']})"

    fig = px.bar(ordered_counts, x='markets', y='size', title=chart_title)
    save_plot(fig, "nb_transaction_per_exchange")
    fig.show()

In [15]:
from helpers.delay import generate_delayed_data

def nb_transaction_join_market_pars():
    """Plot the length of the joined time series for every ordered market pair.

    For each pair ``(k1, k2)`` of markets present in the notebook-level
    ``daily_data`` mapping, the two series are joined with
    ``generate_delayed_data`` at delay 0, once with the helper's default join
    (presumably an outer join, going by how the result is used here -- confirm
    in ``helpers.delay``) and once with ``join_type="inner"``. The two length
    matrices are shown as side-by-side heatmaps sharing one colour axis, saved
    via ``save_plot`` as ``nb_transaction_join_market_pairs`` and displayed.
    """
    # Use the markets that were actually loaded for both the matrix size and
    # the tick labels. The matrices are filled in `daily_data` iteration order,
    # so taking size/labels from the config list would mislabel the axes (or
    # overflow the matrix) whenever the two disagree.
    markets = list(daily_data)
    n_markets = len(markets)
    joined_len_outer = np.zeros((n_markets, n_markets))
    joined_len_inner = np.zeros((n_markets, n_markets))
    for i, k1 in enumerate(markets):
        for j, k2 in enumerate(markets):
            # Work on copies so the join helper cannot alter the loaded data.
            s1, s2 = daily_data[k1].copy(), daily_data[k2].copy()
            joined_len_outer[i, j] = len(generate_delayed_data(s1, s2, 0))
            joined_len_inner[i, j] = len(
                generate_delayed_data(s1, s2, 0, join_type="inner"))

    fig = make_subplots(rows=1, cols=2, shared_yaxes=True,
                        subplot_titles=["inner", "outer"])
    # Column order must match `subplot_titles`: inner on the left, outer on the right.
    for col, lengths in enumerate((joined_len_inner, joined_len_outer), start=1):
        fig.add_trace(
            go.Heatmap(z=lengths, x=markets, y=markets, coloraxis="coloraxis"),
            row=1, col=col
        )

    fig.update_layout(height=600, width=800,
                      title_text=f"Number of transaction in the joined time-series (for different market pairs at {config['test_date']})",
                      coloraxis={'colorscale': 'viridis'})
    save_plot(fig, 'nb_transaction_join_market_pairs')
    fig.show()


In [16]:
# Render both exploration figures; each call saves its figure via save_plot
# and then displays it.
nb_transaction_per_exchange()
nb_transaction_join_market_pars()

# Signal visualization

In [17]:
from helpers.stats import compute_correlation
from helpers.delay import compute_delays
from helpers.plots import save_plot

def format_delay_df(n1, n2, daily_data=us_daily_data):
    """Build a long-format frame of normalised prices for a range of delays.

    ``compute_delays`` is asked for 20 steps of size 10000 for the market pair;
    for every returned delay the two series are joined with
    ``generate_delayed_data``, each price column is centred on its median and
    scaled by its standard deviation, and the rows are tagged with the delay
    and its correlation.

    Parameters
    ----------
    n1, n2 : keys of ``daily_data`` naming the two markets to compare.
    daily_data : mapping from market name to that market's data. The default
        is bound to ``us_daily_data`` when this cell is executed.

    Returns
    -------
    pandas.DataFrame with columns ``date``, ``delay`` (integer-divided by
    1000; the plot title in this notebook calls these seconds), ``correlation``,
    ``market`` (``n1`` or ``n2``) and ``price``.

    Raises
    ------
    KeyError
        If ``n1`` or ``n2`` is not a key of ``daily_data``.
    """
    # Fail early with a readable message instead of a bare KeyError raised
    # from somewhere inside the helpers.
    missing = [name for name in (n1, n2) if name not in daily_data]
    if missing:
        raise KeyError(
            f"market(s) {missing} not found in daily_data; "
            f"available markets: {list(daily_data)}"
        )

    delays, correlations, _, _ = compute_delays(
        daily_data, n1, n2, step_size=10000, n_step=20
    )

    frames = []
    for delay, correlation in zip(delays, correlations):
        # Copies keep the join helper from touching the loaded data.
        s1, s2 = daily_data[n1].copy(), daily_data[n2].copy()
        pair_data = generate_delayed_data(s1, s2, delay).reset_index()
        for column in ('price_1', 'price_2'):
            prices = pair_data[column]
            pair_data[column] = (prices - prices.median()) / prices.std()
        pair_data = pair_data.rename(columns={'price_1': n1, 'price_2': n2})
        pair_data['delay'] = delay
        pair_data['correlation'] = correlation
        frames.append(pair_data)

    if not frames:
        # No delays were produced: return an empty frame with the melted layout.
        return pd.DataFrame(
            columns=['date', 'delay', 'correlation', 'market', 'price'])

    # Concatenate once; growing a frame inside the loop is quadratic and the
    # empty seed frame degrades the column dtypes.
    df = pd.concat(frames, axis=0, ignore_index=True)
    df['delay'] = df['delay'] // 1000
    return pd.melt(df, id_vars=['date', 'delay', 'correlation'],
                   value_vars=[n1, n2], var_name='market', value_name='price')


def visualize_delay(n1="II", n2="B"):
    """Animate the normalised prices of two markets across delays.

    The long-format frame from ``format_delay_df(n1, n2)`` is drawn as one line
    per market, with one animation frame per delay and the correlation used as
    the hover label. The figure is saved through ``save_plot`` under the name
    ``motivation`` and then displayed.
    """
    import plotly.io as pio
    pio.renderers.default = 'notebook_connected'

    # Hide every raw field in the hover box; only the correlation is shown.
    hidden_hover_fields = {
        field: False for field in ('date', 'delay', 'market', 'price')
    }
    chart_title = "Microsoft trade prices on different markets & correlation for different delay (in seconds)"

    fig = px.line(
        format_delay_df(n1, n2),
        x="date",
        y='price',
        animation_frame="delay",
        animation_group='market',
        color="market",
        hover_name='correlation',
        hover_data=hidden_hover_fields,
        title=chart_title,
    )

    fig.update_layout(hovermode="x unified")
    # Remove the update menus that the animation adds to the layout.
    fig["layout"].pop("updatemenus")
    save_plot(fig, 'motivation')
    fig.show()


In [18]:
# NOTE(review): in the recorded run this call raised KeyError: 'II'. The loader
# output above reports market II (and B) as missing for 2010-01-07, so the
# default market pair is presumably absent from us_daily_data -- confirm and
# pass markets that are actually loaded.
visualize_delay()


KeyError: 'II'

# Demo

In [19]:
from helpers.delay import compute_delays
from helpers.algorithm import  increasing_function_check
def find_best_delay_demo(n1,n2):
    """Iteratively search for the delay maximising the cross-correlation, plotting each step.

    Starting from ``center = 0`` and ``step_size = 120_000`` ms, each iteration
    calls ``compute_delays`` on the notebook-level ``daily_data`` for markets
    ``n1`` and ``n2``, takes the delay with the highest correlation and
    re-centres on it. The step size grows by 1.5x when
    ``increasing_function_check(correlations)`` is true and is halved otherwise.
    The search stops after 15 iterations, when the best delay repeats, or when
    the step size is no longer greater than 1. Every iteration that does not
    stop the search saves and shows a figure and prints the chosen index, the
    step size and the new centre.

    Parameters
    ----------
    n1, n2 : keys of ``daily_data`` naming the two markets to compare.
    """
    n_iteration = 15
    center = 0
    step_size = 120_000  # ms
    last_best_delay = None
    for it in range(n_iteration):
        delays, correlations, los, his = compute_delays(
            daily_data, n1, n2, center=center, step_size=step_size)
        best_idx = np.argmax(correlations)
        best_delay = delays[best_idx]

        converged = last_best_delay is not None and last_best_delay == best_delay
        if converged or step_size <= 1:
            break

        last_best_delay = best_delay
        center = best_delay

        # plotly draws `array` upwards and `arrayminus` downwards. The
        # matplotlib cells in this notebook pass yerr=[los, his] (lower first),
        # so `his` is the upward bar here.
        # NOTE(review): assumes los/his are error magnitudes -- confirm in helpers.delay.
        fig = go.Figure(
            data=go.Scatter(
                x=list(delays),
                y=correlations,
                error_y=dict(
                    type='data',
                    symmetric=False,
                    array=his,
                    arrayminus=los,
                ),
            ),
            layout=go.Layout(autosize=False, width=800, height=500),
        )
        # `it` is an int, so it must be formatted rather than concatenated to a str.
        iteration_suffix = f"(iteration {it})" if it > 0 else ''
        title = f"Cross−correlation function for RSDA ({config['transatlantic']['signal']}) for {n1} vs {n2} markets {iteration_suffix}"
        fig.update_layout(
            title=title,
            xaxis_title="lag (in ms)",
            yaxis_title="cross-correlation"
        )
        save_plot(fig, f"Correlation_vs_lag_iteration({it})_market({n1}_{n2})")
        fig.show()

        print(f"idx:{best_idx}, step_size:{step_size}, center:{center}")
        if increasing_function_check(correlations):
            step_size = int(step_size*1.5)
        else:
            step_size = step_size//2
        


In [20]:
# Compare a market with itself. The recorded run emitted
# "divide by zero encountered in arctanh" for this call.
find_best_delay_demo("US","US")


divide by zero encountered in arctanh



idx:20, step_size:120000, center:0


In [23]:
# Same demo for two different markets of the transatlantic dataset.
find_best_delay_demo("GB","NL")

idx:20, step_size:120000, center:0


# Visualization

In [None]:
# Coarse scan: for every ordered market pair, plot the correlation returned by
# compute_delays around a zero centre with a fixed step size. The title shows
# the delay with the highest correlation.
for k1 in daily_data:
    for k2 in daily_data:
        delays, correlations, los, his = compute_delays(
            daily_data, n1=k1, n2=k2, center=0, step_size=1_000_000)
        fig, ax = plt.subplots()
        # yerr takes the lower errors first, then the upper errors.
        ax.errorbar(list(delays), correlations, yerr=[los, his], fmt='o')
        ax.set_title(f"{k1},{k2} ({delays[np.argmax(correlations)]})")
        ax.set_xlabel("delay")
        ax.set_ylabel("corr")
        plt.show()
        # One figure per pair: release it so memory does not grow with the loop.
        plt.close(fig)

In [None]:
from helpers.algorithm import find_best_delay
# Refined scan: for every ordered market pair, run find_best_delay and plot the
# correlations it returns. The title shows the best delay it reports.
for k1 in daily_data:
    for k2 in daily_data:
        best_delay, delays, correlations, los, his = find_best_delay(
            daily_data, n1=k1, n2=k2)
        fig, ax = plt.subplots()
        # yerr takes the lower errors first, then the upper errors.
        ax.errorbar(list(delays), correlations, yerr=[los, his], fmt='o')
        ax.set_title(f"{k1},{k2},{best_delay}")
        ax.set_xlabel("delay")
        ax.set_ylabel("corr")
        plt.show()
        # One figure per pair: release it so memory does not grow with the loop.
        plt.close(fig)