In [1]:
%matplotlib inline

from scipy.stats import chi2, norm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.optimize import minimize, least_squares
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Task 2

## 1-month Forward

In [2]:
# Labels for the six retained columns: the first three become `st` below,
# the last three become `ft`.
names = ['P', 'F', 'Y', 'Pf', 'Ff', 'Yy']

# Read the space-separated file, keep the columns labelled 2 and onwards,
# rename them to `names` and take the natural log of every value.
data1 = (
    pd.read_csv('FwdSpot1.dat', header=None, sep=' ', skipinitialspace=True)
    .loc[:, 2:]
    .rename(columns={pos + 2: label for pos, label in enumerate(names)})
    .apply(np.log)
)

# Number of rows ahead over which the change in `st` is measured.
k = 1

# First three columns (presumably the log spot series -- confirm against the file).
st = data1.iloc[:, :3]
# st[t + k] - st[t]; rows with missing values (including the last k rows,
# which have no lead) are dropped.
dst = st.shift(-k).sub(st.values).dropna().to_numpy()

# Last three columns (presumably the log forward series -- confirm).
ft = data1.iloc[:, 3:]
# ft[t] - st[t], truncated to the number of rows kept in `dst`.
dft = ft.sub(st.values).iloc[:dst.shape[0]].to_numpy()

In [3]:
# Printed labels for the two estimates of each regression (constant first,
# then the coefficient on the regressor), one pair per series.
coef_names = [[f'a{idx}', f'b{idx}'] for idx in range(1, 4)]

# Restrictions R @ coefs = q for the Wald test: with coefs = (a, b) they
# read b = 1 and a = 0.
R = np.array([[0, 1],
              [1, 0]])
q = np.array([1, 0])

n = dft.shape[0]  # number of rows in dft
l = 15            # block length handed to get_bootstrap_Wald
b = 10000         # number of bootstrap replications handed to get_bootstrap_Wald

In [4]:
# Show the coefficient labels (the nested list assigned to `coef_names`).
coef_names

[['a1', 'b1'], ['a2', 'b2'], ['a3', 'b3']]

In [5]:
def get_beta(X, y):
  """Return the OLS coefficients of a regression of ``y`` on ``X``.

  The normal equations ``(X'X) beta = X'y`` are solved directly instead of
  forming the explicit inverse of ``X'X``, which is cheaper and numerically
  more accurate.

  Parameters
  ----------
  X : ndarray
      Regressor matrix, one row per observation.
  y : ndarray
      Dependent variable, aligned with the rows of ``X``.

  Returns
  -------
  ndarray
      Coefficient estimates, one per column of ``X``.

  Raises
  ------
  numpy.linalg.LinAlgError
      If ``X'X`` is singular.
  """
  return np.linalg.solve(X.T @ X, X.T @ y)

def get_V(coefs, X, y):
  """Heteroskedasticity-robust sandwich covariance ``Q^-1 V Q^-1``.

  ``Q = X'X / n`` and ``V = sum_t e_t^2 x_t x_t' / n``, where ``e`` are the
  residuals ``y - X @ coefs`` and ``n`` is the number of rows of ``X``.

  Parameters
  ----------
  coefs : ndarray
      Coefficient vector, one entry per column of ``X``.
  X : ndarray
      Regressor matrix, one row per observation.
  y : ndarray
      One-dimensional dependent variable aligned with the rows of ``X``.

  Returns
  -------
  ndarray
      Square matrix with one row/column per coefficient.
  """
  resid = y - X @ coefs
  n = X.shape[0]

  Qi = np.linalg.inv(X.T @ X / n)

  # Scale each row of X by its squared residual; equivalent to
  # X.T @ diag(resid**2) @ X without materialising the n-by-n diagonal matrix.
  V = (X * (resid ** 2)[:, None]).T @ X / n
  return Qi @ V @ Qi

def get_Wald(V, coefs, R, q, n, level=0.95):
  """Wald statistic for ``R @ coefs = q`` and its chi-square critical value.

  Parameters
  ----------
  V : ndarray
      Covariance matrix used in the quadratic form ``R V R'``.
  coefs : ndarray
      Estimated coefficient vector.
  R : ndarray
      Restriction matrix, one row per restriction (a single 1-D restriction
      is also accepted).
  q : ndarray
      Right-hand side of the restrictions.
  n : int
      Factor multiplying the quadratic form (the sample size in the callers).
  level : float, optional
      Probability at which the chi-square quantile is evaluated.

  Returns
  -------
  tuple
      ``(W, critical_value)``; the critical value is the ``level`` quantile
      of a chi-square distribution whose degrees of freedom equal the number
      of rows of ``R`` (previously hard-coded to 2).
  """
  R = np.atleast_2d(R)
  gap = R @ coefs - np.atleast_1d(q)
  # Solve (R V R') z = gap rather than inverting R V R' explicitly.
  W = n * (gap @ np.linalg.solve(R @ V @ R.T, gap))
  return W, chi2.ppf(level, R.shape[0])

def get_Wald_boot(V, coefs, coefs0, R, q, n):
  """Wald-type quadratic form centred at ``coefs0`` instead of at ``q``.

  Computes ``n * d' (R V R')^-1 d`` with ``d = R @ coefs - R @ coefs0``.
  The argument ``q`` is accepted but not used.
  """
  centred = R @ coefs - R @ coefs0
  middle = np.linalg.inv(R @ V @ R.T)
  return n * centred.T @ middle @ centred

def step(X, y, R, q, beta0, n):
  """Wald statistic of one (re)sample, centred at ``beta0``.

  Fits the coefficients with ``get_beta``, estimates their covariance with
  ``get_V`` and passes both to ``get_Wald_boot``.
  """
  beta_hat = get_beta(X, y)
  return get_Wald_boot(get_V(beta_hat, X, y), beta_hat, beta0, R, q, n)

def get_bootstrap_Wald(X, y, R, q, n, b, l, seed=None):
  """Bootstrap estimate of the 95% critical value of the Wald statistic.

  A moving-block bootstrap is used: every replication concatenates
  ``n // l`` blocks of ``l`` consecutive rows of ``(X, y)``, each block
  start being drawn uniformly (with replacement) from all ``n - l + 1``
  admissible positions. The statistic of each resample is computed by the
  module-level ``step`` function as bound when this function is called, and
  is centred at the full-sample estimate ``get_beta(X, y)``.

  Parameters
  ----------
  X : ndarray
      Regressor matrix, one row per observation.
  y : ndarray
      Dependent variable aligned with the rows of ``X``.
  R, q : ndarray
      Restriction matrix and right-hand side, forwarded to ``step``.
  n : int
      Number of observations (rows of ``X``).
  b : int
      Number of bootstrap replications.
  l : int
      Block length.
  seed : int, optional
      If given, block starts come from ``np.random.default_rng(seed)`` so the
      result is reproducible. If omitted, NumPy's global random state is
      used, as before.

  Returns
  -------
  float
      The 0.95 quantile of the ``b`` bootstrap statistics.

  Raises
  ------
  ValueError
      If ``l`` is not in ``1..n``.
  """
  if not 0 < l <= n:
    raise ValueError("block length l must satisfy 0 < l <= n")

  n_blocks = n // l
  # Admissible block starts are 0, ..., n - l; drawing from fewer positions
  # would make the last observations impossible to resample.
  n_starts = n - l + 1
  offsets = np.arange(l)

  if seed is None:
    draw = np.random.randint
  else:
    draw = np.random.default_rng(seed).integers

  beta0 = get_beta(X, y)

  walds = np.empty(b)
  for rep in range(b):
    starts = draw(0, n_starts, size=n_blocks)
    # Row indices of all chosen blocks, in block order.
    rows = (starts[:, None] + offsets).ravel()
    # The resample has n_blocks * l rows, so scale the statistic by that
    # size rather than by the full-sample n.
    walds[rep] = step(X[rows], y[rows], R, q, beta0, rows.size)

  return np.quantile(walds, 0.95)

In [6]:
# For each series: fit the regression, print the estimates, then report the
# Wald statistic together with its asymptotic and bootstrap critical values.
for i in range(len(coef_names)):

  label = names[i]
  print(f"Experiment for {label}")

  # Regressor is column i of dft with a constant prepended; response is
  # column i of dst.
  X = sm.add_constant(dft[:, i])
  y = dst[:, i]

  coefs = get_beta(X, y)

  for coef, name in zip(coefs, coef_names[i]):
    print(f"{name}: ", round(coef, 4), end='\t')
  print()

  V = get_V(coefs, X, y)
  W, border = get_Wald(V, coefs, R, q, n)
  borderW = get_bootstrap_Wald(X, y, R, q, n, b, l)

  print(f"Wald Statistic for {label} is:", round(W, 3))
  print(f"Asymptotic critical value for {label} is:", round(border, 3))
  print(f"Bootstrap critical value for {label} is:",
        round(borderW, 3), end="\n\n")

Experiment for P
a1:  -0.0023	b1:  -0.7261	
Wald Statistic for P is: 7.446
Asymptotic critical value for P is: 5.991
Bootstrap critical value for P is: 14.324

Experiment for F
a2:  -0.0023	b2:  -0.9606	
Wald Statistic for F is: 5.727
Asymptotic critical value for F is: 5.991
Bootstrap critical value for F is: 10.312

Experiment for Y
a3:  0.0036	b3:  -0.1528	
Wald Statistic for Y is: 4.899
Asymptotic critical value for Y is: 5.991
Bootstrap critical value for Y is: 16.564



## 3-month Forward

In [7]:
# Labels for the six retained columns: the first three become `st` below,
# the last three become `ft`.
names = ['P', 'F', 'Y', 'Pf', 'Ff', 'Yy']

# Read the space-separated file, keep the columns labelled 2 and onwards,
# rename them to `names` and take the natural log of every value.
dota3 = (
    pd.read_csv('FwdSpot3.dat', header=None, sep=' ', skipinitialspace=True)
    .loc[:, 2:]
    .rename(columns={pos + 2: label for pos, label in enumerate(names)})
    .apply(np.log)
)

# Number of rows ahead over which the change in `st` is measured.
k = 3

# First three columns (presumably the log spot series -- confirm against the file).
st = dota3.iloc[:, :3]
# st[t + k] - st[t]; rows with missing values (including the last k rows,
# which have no lead) are dropped.
dst = st.shift(-k).sub(st.values).dropna().to_numpy()

# Last three columns (presumably the log forward series -- confirm).
ft = dota3.iloc[:, 3:]
# ft[t] - st[t], truncated to the number of rows kept in `dst`.
dft = ft.sub(st.values).iloc[:dst.shape[0]].to_numpy()

In [8]:
# Printed labels for the two estimates of each regression (constant first,
# then the coefficient on the regressor), one pair per series.
coef_names = [[f'a{idx}', f'b{idx}'] for idx in range(1, 4)]

# Restrictions R @ coefs = q for the Wald test: with coefs = (a, b) they
# read b = 1 and a = 0.
R = np.array([[0, 1],
              [1, 0]])
q = np.array([1, 0])

n = dft.shape[0]  # number of rows in dft
l = 15            # block length handed to get_bootstrap_Wald
b = 10000         # number of bootstrap replications handed to get_bootstrap_Wald

In [9]:
def get_Gj(Z, j):
  """Sample autocovariance matrix of the rows of ``Z`` at lag ``j``.

  Rows are centred at the column means; the products of row ``t`` with row
  ``t - j`` are summed over every ``t`` for which both rows exist, and the
  sum is divided by the total number of rows. ``j`` may be negative.
  """
  n_rows = Z.shape[0]
  centred = Z - Z.mean(axis=0)

  # Range of t such that both t and t - j are valid row indices.
  lo = max(0, j)
  hi = min(n_rows, n_rows + j)

  current = centred[lo:hi]
  lagged = centred[lo - j:hi - j]
  return current.T @ lagged / n_rows

def get_NWV(coefs, X, y):
  """Autocorrelation-robust sandwich covariance ``Q^-1 V Q^-1``.

  ``V`` is a Bartlett-weighted sum of the lag ``-m..m`` autocovariances
  (from ``get_Gj``) of the moment series ``x_t * e_t``, with residuals
  ``e = y - X @ coefs`` and truncation lag ``m = int(4 * (T / 100) ** (1/3))``.
  ``Q = X'X / T``. Here ``T`` is the number of rows of ``X``.

  Parameters
  ----------
  coefs : ndarray
      Coefficient vector, one entry per column of ``X``.
  X : ndarray
      Regressor matrix, one row per observation.
  y : ndarray
      One-dimensional dependent variable aligned with the rows of ``X``.

  Returns
  -------
  ndarray
      Square matrix with one row/column per coefficient.
  """
  resid = y - X @ coefs

  Z = X * resid[:, None]
  T = X.shape[0]
  m = int(4 * (T / 100) ** (1/3))

  V = 0
  for j in range(-m, m + 1):
    # Weight declines linearly from 1 at lag 0 towards 0 beyond lag m.
    V += (1 - abs(j) / (m + 1)) * get_Gj(Z, j)

  # Normalise by this sample's own size. The global `n` used before is the
  # full-sample length, which is wrong when X is a shorter bootstrap resample.
  Q = X.T @ X / T
  Qi = np.linalg.inv(Q)

  return Qi @ V @ Qi

def step(X, y, R, q, beta0, n):
  """Wald statistic of one (re)sample, centred at ``beta0``.

  Fits the coefficients with ``get_beta``, estimates their covariance with
  ``get_NWV`` and passes both to ``get_Wald_boot``. This rebinds the name
  ``step``, so ``get_bootstrap_Wald`` uses this version from here on.
  """
  beta_hat = get_beta(X, y)
  return get_Wald_boot(get_NWV(beta_hat, X, y), beta_hat, beta0, R, q, n)

In [10]:
# For each series: fit the regression, print the estimates, then report the
# Wald statistic (with the get_NWV covariance) together with its asymptotic
# and bootstrap critical values.
for i in range(len(coef_names)):

  label = names[i]
  print(f"Experiment for {label}")

  # Regressor is column i of dft with a constant prepended; response is
  # column i of dst.
  X = sm.add_constant(dft[:, i])
  y = dst[:, i]

  coefs = get_beta(X, y)

  for coef, name in zip(coefs, coef_names[i]):
    print(f"{name}: ", round(coef, 4), end='\t')
  print()

  V = get_NWV(coefs, X, y)
  W, border = get_Wald(V, coefs, R, q, n)
  borderW = get_bootstrap_Wald(X, y, R, q, n, b, l)

  print(f"Wald Statistic for {label} is:", round(W, 3))
  print(f"Asymptotic critical value for {label} is:", round(border, 3))
  print(f"Bootstrap critical value for {label} is:",
        round(borderW, 3), end="\n\n")


Experiment for P
a1:  -0.0187	b1:  -2.0586	
Wald Statistic for P is: 19.094
Asymptotic critical value for P is: 5.991
Bootstrap critical value for P is: 10.02

Experiment for F
a2:  -0.0044	b2:  -0.4804	
Wald Statistic for F is: 3.757
Asymptotic critical value for F is: 5.991
Bootstrap critical value for F is: 12.954

Experiment for Y
a3:  0.0131	b3:  -0.602	
Wald Statistic for Y is: 12.579
Asymptotic critical value for Y is: 5.991
Bootstrap critical value for Y is: 15.397



The asymptotic and bootstrap critical values differ substantially. One likely reason is the small sample size: we have only 233 observations, so the empirical cumulative distribution function used by the bootstrap may be a poor approximation of the true cumulative distribution function. We also used only 10,000 bootstrap replications, which may be too few for a sample of 233 observations and could also affect the quality of the estimated quantile; however, the more replications we use, the longer the computation takes. It is also possible that the series are non-stationary, or otherwise lack the properties required for the asymptotic approximation based on the normal distribution to be valid.