In [41]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pylab as plt
import datetime
from scipy import integrate
from tick.hawkes import *
from scipy.optimize import *
from tick.plot import plot_hawkes_kernels
from scipy import stats
import statsmodels.api as sm

%matplotlib inline

In [2]:
def csv_reader(name):
    """Load a tick CSV whose first column is a row label and whose first row holds the column names.

    The file is read without a header, the first column is discarded, the
    first remaining row is promoted to the column names and removed from the
    data, and the ``Bid_time``, ``Mid_IV`` and ``Mid_price`` columns are cast
    to float.

    Parameters
    ----------
    name : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The data rows (original row labels, starting at 1) with the three
        columns above converted to float; any other columns are left as read.

    Raises
    ------
    KeyError
        If one of the three expected columns is missing.
    ValueError
        If a value in one of those columns cannot be converted to float.
    """
    raw = pd.read_csv(name, header=None)

    # Discard the leading label column, then promote the first row to header.
    body = raw.drop(columns=raw.columns[0])
    body.columns = body.iloc[0]

    # Explicit copy: the casts below must write to an independent frame, not
    # to a slice of ``body`` (which would emit SettingWithCopyWarning).
    df = body.iloc[1:].copy()

    for column in ("Bid_time", "Mid_IV", "Mid_price"):
        df[column] = df[column].astype(float)
    return df
    

In [3]:
def single_exp(decays, events):
    """Objective for the decay search with a single exponential kernel.

    Builds a ``HawkesExpKern`` learner using ``decays[0]`` as its decay, fits
    it on ``events`` and returns the negated ``score()`` so that a minimiser
    effectively maximises the score.

    Parameters
    ----------
    decays : sequence
        Only the first element is used.
    events : object
        Passed unchanged to ``HawkesExpKern.fit``.

    Returns
    -------
    The negated value of ``score()`` of the fitted learner.
    """
    learner = HawkesExpKern(
        decays=decays[0],
        penalty='elasticnet',
        tol=1e-8,
        elastic_net_ratio=0.9,
        max_iter=1000,
    )
    fitted = learner.fit(events)
    return -fitted.score()

In [4]:
def sum_exp(decays, events):
    """Objective for the decay search with a sum-of-exponentials kernel.

    Builds a ``HawkesSumExpKern`` learner with the full ``decays`` vector,
    fits it on ``events`` and returns the negated ``score()`` so that a
    minimiser effectively maximises the score.

    Parameters
    ----------
    decays : sequence
        Decay values, forwarded as-is to ``HawkesSumExpKern``.
    events : object
        Passed unchanged to ``HawkesSumExpKern.fit``.

    Returns
    -------
    The negated value of ``score()`` of the fitted learner.
    """
    learner = HawkesSumExpKern(
        decays=decays,
        penalty='elasticnet',
        tol=1e-8,
        elastic_net_ratio=0.9,
        max_iter=1000,
    )
    fitted = learner.fit(events)
    return -fitted.score()

In [5]:
def sum_3exp_minimiser(x, timestamps):
    """Minimise ``sum_exp`` over three decays with Nelder-Mead.

    Parameters
    ----------
    x : float
        Starting value; all three decays are initialised to it.
    timestamps : object
        Event data forwarded to ``sum_exp`` as its ``events`` argument.

    Returns
    -------
    The result object returned by ``scipy.optimize.minimize``.
    """
    return minimize(
        sum_exp,
        x0=[x] * 3,
        # Must be a real one-element tuple: ``(timestamps)`` is just
        # ``timestamps``, which would be unpacked into several positional
        # arguments if the caller ever passed a tuple of arrays.
        args=(timestamps,),
        method='Nelder-Mead',
        tol=1e-5,
    )

In [6]:
def exp_minimiser(x, timestamps):
    """Minimise ``single_exp`` over a single decay with Nelder-Mead.

    Parameters
    ----------
    x : float
        Starting value for the decay.
    timestamps : object
        Event data forwarded to ``single_exp`` as its ``events`` argument.

    Returns
    -------
    The result object returned by ``scipy.optimize.minimize``.
    """
    return minimize(
        single_exp,
        x0=[x],
        # Must be a real one-element tuple: ``(timestamps)`` is just
        # ``timestamps``, which would be unpacked into several positional
        # arguments if the caller ever passed a tuple of arrays.
        args=(timestamps,),
        method='Nelder-Mead',
        tol=1e-5,
    )

In [7]:
def grid_search(fun, start, stop, tol, timestamps, init=-np.inf, init_betas=0, n_points=20):
    """Multi-start search that repeatedly narrows a grid of starting values.

    For every starting value on an evenly spaced grid over ``[start, stop]``,
    ``fun(value, timestamps)`` is called and ``-result.fun`` is compared with
    the best value seen so far. After a full pass, the grid is re-centred on
    the best starting value (from 0.8x to 1.2x of it) and the pass is repeated.

    The search stops when
    * an improvement smaller than ``tol`` is found (the improving parameters
      are returned), or
    * a full pass brings no improvement at all (the best parameters found so
      far are returned).

    Parameters
    ----------
    fun : callable
        ``fun(x0, timestamps)`` returning an object with ``.fun`` (value that
        was minimised) and ``.x`` (parameters), e.g. a SciPy optimisation result.
    start, stop : float
        Bounds of the first grid.
    tol : float
        Improvement threshold used as the convergence criterion.
    timestamps : object
        Forwarded unchanged to ``fun``.
    init : float, optional
        Best value known before the search starts.
    init_betas : object, optional
        Parameters returned if no starting value ever improves on ``init``.
    n_points : int, optional
        Number of grid points per pass (20, the previously hard-coded value).

    Returns
    -------
    The ``.x`` of the best result found, or ``init_betas`` if none improved.
    """
    best_result = init
    best_betas = init_betas

    # Iterating instead of recursing avoids hitting the recursion limit on
    # slowly converging searches.
    while True:
        grid = np.linspace(start, stop, n_points)
        print(f"Grid: {start} : {stop}")

        # Stays None when no grid point improves during this pass.
        best_initial_value = None

        for i in range(n_points):
            try:
                optimised = fun(grid[i], timestamps)
                result = -optimised.fun
                betas = optimised.x
                if result > best_result:
                    best_betas = betas
                    best_initial_value = grid[i]
                    if result - best_result < tol:
                        return best_betas
                    best_result = result
            except Exception:
                # A failed fit for one starting value must not abort the
                # search, but KeyboardInterrupt / SystemExit still propagate.
                print("Iteration {} erred".format(i))

        if best_initial_value is None:
            return best_betas

        print(f"Best value so far: {best_result} found at {best_initial_value}")
        print(f"Optimal betas so far: {best_betas}")
        start = best_initial_value * 0.8
        stop = best_initial_value * 1.2
    

In [13]:
def resid(x, intensities, timestamps, dim, method):
    """Apply ``method`` to the intensity samples between consecutive arrivals of one dimension.

    For each pair of consecutive arrival times in ``timestamps[dim]``, the
    samples of ``intensities[dim]`` whose abscissa in ``x`` lies inside the
    closed interval between the two arrivals are selected and reduced with
    ``method(ys, xs)`` (the notebook passes a trapezoidal integrator).

    Parameters
    ----------
    x : numpy.ndarray
        Abscissae at which the intensities were sampled.
    intensities : sequence of numpy.ndarray
        Per-dimension intensity samples aligned with ``x``.
    timestamps : sequence of numpy.ndarray
        Per-dimension arrival times.
    dim : int
        Index of the dimension to process; also printed as a progress marker.
    method : callable
        ``method(ys, xs)`` returning a scalar.

    Returns
    -------
    numpy.ndarray
        One value per inter-arrival interval (``len(arrivals) - 1`` entries,
        empty when there are fewer than two arrivals). Entries for which
        ``method`` raised are ``nan``.
    """
    print(dim)
    arrivals = timestamps[dim]
    ints = intensities[dim]

    # Guard against an empty series, for which ``len(arrivals) - 1`` is -1.
    n_intervals = max(len(arrivals) - 1, 0)
    thetas = np.zeros(n_intervals)

    for i in range(n_intervals):
        mask = (x >= arrivals[i]) & (x <= arrivals[i + 1])
        xs = x[mask]
        ys = ints[mask]
        try:
            thetas[i] = method(ys, xs)
        except Exception:
            # Mark the interval as unusable; downstream tests drop NaNs.
            thetas[i] = np.nan

    return thetas

def goodness_of_fit_par(learner, arrivals, step, method):
    """Compute per-dimension residuals of a fitted learner.

    Parameters
    ----------
    learner : object
        Fitted learner exposing ``n_nodes`` and
        ``estimated_intensity(arrivals, step)``; the first two items of the
        latter's result are used as the per-dimension intensities and the
        abscissae at which they were sampled.
    arrivals : sequence of numpy.ndarray
        Per-dimension arrival times.
    step : float
        Forwarded to ``estimated_intensity``.
    method : callable
        ``method(ys, xs)`` reducer forwarded to ``resid``.

    Returns
    -------
    list of numpy.ndarray
        One residual array per dimension, as produced by ``resid``.
    """
    # Evaluate the intensity once; the previous code recomputed it for each
    # of the two items it needed.
    estimated = learner.estimated_intensity(arrivals, step)
    intensities, x = estimated[0], estimated[1]

    return [
        resid(x, intensities, arrivals, dim, method)
        for dim in range(learner.n_nodes)
    ]

def plot_resid(resid, rows, cols):
    """Draw one exponential probability plot per residual series.

    A ``rows`` x ``cols`` grid of axes is created; each residual series is
    plotted on the next axis in row-major order together with a dashed
    identity line over the theoretical quantiles. NaN residuals are dropped
    first, as in ``ks_test`` / ``lb_test`` / ``ed_test``.

    Parameters
    ----------
    resid : sequence of array-like
        Residual series; extra series beyond ``rows * cols`` are ignored.
    rows, cols : int
        Shape of the subplot grid.
    """
    # squeeze=False always yields a 2-D array of axes, so a 1x1 grid and a
    # genuinely 2-D grid can be handled like the 1-D case.
    fig, axes = plt.subplots(nrows=rows, ncols=cols, squeeze=False)
    fig.subplots_adjust(hspace=0.5)
    fig.suptitle('Goodness-of-fit for nonparametric HP')

    for ax, res in zip(axes.ravel(), resid):
        res = np.asarray(res)
        finite = res[np.logical_not(np.isnan(res))]
        k = stats.probplot(finite, dist=stats.expon, fit=True, plot=ax, rvalue=False)
        ax.plot(k[0][0], k[0][0], 'k--')


def ks_test(resid):
    """Run a Kolmogorov-Smirnov test against ``'expon'`` on every residual series.

    NaN entries are removed from each series before testing.

    Parameters
    ----------
    resid : iterable of numpy.ndarray
        Residual series.

    Returns
    -------
    list
        One ``scipy.stats.kstest`` result per series, in input order.
    """
    return [
        stats.kstest(res[np.logical_not(np.isnan(res))], 'expon')
        for res in resid
    ]

def lb_test(resid):
    """Run a Ljung-Box test (lag 3) on every residual series.

    NaN entries are removed from each series before testing.

    Parameters
    ----------
    resid : iterable of numpy.ndarray
        Residual series.

    Returns
    -------
    list
        One ``acorr_ljungbox(..., return_df=True)`` result per series, in
        input order.
    """
    outcomes = []
    for res in resid:
        is_nan = np.isnan(res)
        finite = res[np.logical_not(is_nan)]
        outcomes.append(
            sm.stats.acorr_ljungbox(finite, lags=[3], return_df=True)
        )
    return outcomes

def ed_test(resid):
    """Compute ``sqrt(n) * (sample variance - 1) / sqrt(8)`` for every residual series.

    NaN entries are removed first; ``n`` is the number of remaining values and
    the variance uses ``ddof=1``. (Presumably an excess-dispersion statistic,
    going by the function name - confirm against the intended reference.)

    Parameters
    ----------
    resid : iterable of numpy.ndarray
        Residual series.

    Returns
    -------
    list
        One statistic per series, in input order.
    """
    def _statistic(res):
        kept = res[np.logical_not(np.isnan(res))]
        root_n = np.sqrt(len(kept))
        excess_variance = np.var(kept, ddof=1) - 1
        return root_n * excess_variance / np.sqrt(8)

    return [_statistic(res) for res in resid]

# AAPL Data

In [8]:
# Trading session whose files are loaded; switch by (un)commenting.
session = "AM"
# session = "MID"
# session = "PM"

# NOTE(review): the *_25dc / *_25dp variables are filled from the 60dc / 60dp
# files - confirm whether the variable names or the file names are the
# intended ones.

# "minus" tick series
n_minus_ts_ticks_25dc, n_minus_ts_ticks_25dp, n_minus_ts_ticks_50dc = (
    csv_reader(f'final_dataset/{session}/plus_minus/n_minus_ts_ticks_60dc.csv'),
    csv_reader(f'final_dataset/{session}/plus_minus/n_minus_ts_ticks_60dp.csv'),
    csv_reader(f'final_dataset/{session}/plus_minus/n_minus_ts_ticks_50dc.csv'),
)

# "plus" tick series
n_plus_ts_ticks_25dc, n_plus_ts_ticks_25dp, n_plus_ts_ticks_50dc = (
    csv_reader(f'final_dataset/{session}/plus_minus/n_plus_ts_ticks_60dc.csv'),
    csv_reader(f'final_dataset/{session}/plus_minus/n_plus_ts_ticks_60dp.csv'),
    csv_reader(f'final_dataset/{session}/plus_minus/n_plus_ts_ticks_50dc.csv'),
)

In [11]:
# Order fixes the dimension index of each series in the Hawkes model:
# (plus, minus) pairs for each of the three loaded instruments.
all_data = [
    n_plus_ts_ticks_25dc, n_minus_ts_ticks_25dc,
    n_plus_ts_ticks_50dc, n_minus_ts_ticks_50dc,
    n_plus_ts_ticks_25dp, n_minus_ts_ticks_25dp,
]

all_timestamps = [np.array(frame['Bid_time']) for frame in all_data]

# Window shared by every series: from the latest first event to the earliest
# last event.
max_min_time = max(min(series) for series in all_timestamps)
min_max_time = min(max(series) for series in all_timestamps)

# Keep events strictly inside the window and shift them so it starts at 0.
trimmed_ts = [
    series[(series > max_min_time) & (series < min_max_time)] - max_min_time
    for series in all_timestamps
]

# Sanity check: first shifted event and strict monotonicity of each series.
for ts_ in trimmed_ts:
    print(ts_[0])
    print("Increasing: {}".format(np.all(np.diff(ts_) > 0)))

print()
print("Before : {}".format(max(len(series) for series in all_timestamps)))
print("After : {}".format(max(len(series) for series in trimmed_ts)))
all_timestamps = trimmed_ts

203.0
Increasing: True
215.0
Increasing: True
262.0
Increasing: True
126.0
Increasing: True
650.0
Increasing: True
401.0
Increasing: True

Before : 10487
After : 10467


# Single Exponential Kernel

In [12]:
# Range of starting decay values explored by the first grid pass.
start, stop = 0.05, 1.5

best_beta = grid_search(
    exp_minimiser,
    start=start,
    stop=stop,
    tol=1e-7,
    timestamps=all_timestamps,
)

Grid: 0.05 : 1.5


  y[:] = x + (prev_t - 1) / t * (x - prev_x)


Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0


  + 1. / (2 * step) * norm(x - y) ** 2


Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Best value so far: 0.02195200985875521 found at 1.0421052631578946
Optimal betas so far: [0.08024414]
Grid: 0.8336842105263158 : 1.2505263157894735
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0...

In [71]:
# The single-kernel search returns a one-element parameter vector.
BEST_BETA = best_beta[0]

# Refit at the selected decay with a tighter tolerance and more iterations
# than were used inside the search objective.
learner = HawkesExpKern(decays=BEST_BETA, tol=1e-10, penalty='elasticnet',
                        elastic_net_ratio=0.9, max_iter=2000)

learner.fit(all_timestamps)

# ``scipy.integrate.trapz`` was removed in SciPy 1.14; prefer ``trapezoid``
# and fall back to the old name only on SciPy versions that predate it.
integrator = getattr(integrate, "trapezoid", None) or integrate.trapz
residuals = goodness_of_fit_par(learner, all_timestamps, 1, integrator)

0
1
2
3
4
5


In [70]:
alphas = learner.adjacency
betas = learner.decays
baseline = learner.baseline
loglik = learner.score()

print("Alphas: ")
print(np.round(alphas, 2))

print("\n Baseline: ")
print(baseline)

table = {
    # Spectral radius of the adjacency matrix: eigenvalues can be complex, so
    # take the largest modulus rather than ordering the raw eigenvalues.
    "Endogeneity": [np.max(np.abs(np.linalg.eigvals(alphas)))],
    "Likelihood": [loglik],
    # Aggregates over dimensions: largest p-values and smallest ED statistic.
    "Max KS Test": max([ks.pvalue for ks in ks_test(residuals)]),
    "Max LB Test": max([list(lb.lb_pvalue)[0] for lb in lb_test(residuals)]),
    "Min ED Statistic": min(ed_test(residuals))
}

df = pd.DataFrame(table)
df

Alphas: 
[[0.1  0.41 0.01 0.   0.   0.  ]
 [0.48 0.05 0.01 0.01 0.   0.  ]
 [0.05 0.04 0.   0.42 0.   0.  ]
 [0.04 0.05 0.46 0.   0.   0.  ]
 [0.04 0.05 0.   0.01 0.   0.34]
 [0.03 0.04 0.01 0.   0.34 0.  ]]

 Baseline: 
[0.0029315  0.00272525 0.00461872 0.0041675  0.00294402 0.00301819]


Unnamed: 0,Endogeneity,Likelihood,Max KS Test,Max LB Test,Min ED Statistic
0,0.535436,0.021955,2.165217e-10,1.3423389999999999e-78,19.943224


# Sum of Exponential Kernels (Three Decays)

In [None]:
# Range of starting decay values explored by the first grid pass; each
# starting value initialises all three decays.
start, stop = 0.05, 1.5

best_set_of_betas = grid_search(
    sum_3exp_minimiser,
    start=start,
    stop=stop,
    tol=1e-7,
    timestamps=all_timestamps,
)

Grid: 0.05 : 1.5


  y[:] = x + (prev_t - 1) / t * (x - prev_x)


Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equal

  + 1. / (2 * step) * norm(x - y) ** 2
  + 1. / (2 * step) * norm(x - y) ** 2


Step equals 0... at 0
Step equals 0... at 0
Iteration 15 erred
Iteration 16 erred
Iteration 17 erred
Iteration 18 erred
Iteration 19 erred
Best value so far: 0.022835768943127107 found at 0.43157894736842106
Optimal betas so far: [0.58624394 0.01973935 0.11677535]
Grid: 0.3452631578947369 : 0.5178947368421053
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step eq

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Best value so far: 0.0228398417767674 found at 0.4179501385041552
Optimal betas so far: [0.01878472 0.11902319 0.60941468]
Grid: 0.3343601108033242 : 0.5015401662049862
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... at 0
Step equals 0... 

In [None]:
# The sum-of-exponentials search returns the full vector of decays.
BEST_BETA = best_set_of_betas

# Refit at the selected decays with a tighter tolerance and more iterations
# than were used inside the search objective.
learner = HawkesSumExpKern(decays=BEST_BETA, tol=1e-10, penalty='elasticnet',
                           elastic_net_ratio=0.9, max_iter=2000)

learner.fit(all_timestamps)

# ``scipy.integrate.trapz`` was removed in SciPy 1.14; prefer ``trapezoid``
# and fall back to the old name only on SciPy versions that predate it.
integrator = getattr(integrate, "trapezoid", None) or integrate.trapz

# Residuals must be computed on the data the learner was fitted on; the
# previous ``offset_timestamps`` is not defined anywhere in this notebook.
residuals = goodness_of_fit_par(learner, all_timestamps, 1, integrator)

In [None]:
alphas = learner.adjacency
betas = learner.decays
baseline = learner.baseline
loglik = learner.score()

print("Alphas: ")
print(np.round(alphas, 2))

print("\n Baseline: ")
print(baseline)

table = {
    # The adjacency is summed over its last axis before the eigen-analysis.
    # Eigenvalues can be complex, so take the largest modulus (spectral
    # radius) rather than ordering the raw eigenvalues.
    "Endogeneity": [np.max(np.abs(np.linalg.eigvals(alphas.sum(-1))))],
    "Likelihood": [loglik],
    # Aggregates over dimensions: largest p-values and smallest ED statistic.
    "Max KS Test": max([ks.pvalue for ks in ks_test(residuals)]),
    "Max LB Test": max([list(lb.lb_pvalue)[0] for lb in lb_test(residuals)]),
    "Min ED Statistic": min(ed_test(residuals))
}

df = pd.DataFrame(table)
df