# Change detection of real-world financial time-series

In this notebook, we apply our methods to a real-world financial time series.

In [None]:
import gudhi as gd
from gudhi.representations import Landscape
import numpy as np
import matplotlib.pyplot as plt
import pandas_datareader.data as web
from functools import partial

In [None]:
from mdl.model import Norm1D
from mdl.smdl import SMDL
from bocpd.mybocpd import BOCD, StudentT, constant_hazard
from mdl.ppm import get_K_mu_sigma
from mdl.wkc import get_WKC
from utils.evaluation import calc_auc_average, calc_falarms_benefit, InvRunLen, get_evaluation, get_threshold
from utils.embedding import TimeDelayEmbedding

In [None]:
import warnings
# Install a blanket "ignore" filter: no warning of any category is shown for
# the remainder of the session, which keeps the cell outputs uncluttered.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries used in the following cells.
warnings.filterwarnings('ignore')

## Download Dataset
We download a financial time-series dataset consisting of the S&P 500 index prices.

In [None]:
# Fetch S&P 500 index quotes (ticker ^GSPC) from 2018-01-01 to 2021-12-31 via
# pandas-datareader's 'yahoo' source. Requires network access.
# NOTE(review): the 'yahoo' reader is known to be fragile when Yahoo changes
# its endpoints — confirm it still works in the target environment.
df = web.DataReader('^GSPC', 'yahoo', start='2018-01-01', end='2021-12-31')
# Turn the index into an ordinary column (later cells read it as df["Date"])
# so that rows are addressed by integer position from here on.
df.reset_index(inplace=True)
# First difference of the log closing price, scaled by 10. Each element is
# built from two consecutive closes, so `data` is one element shorter than df.
data = np.diff(np.log(df["Close"]))*10

In [None]:
# Quick visual check of the raw closing-price series (x-axis is the row index).
plt.plot(df["Close"])

## Time-delay embedding
Apply time-delay embedding to the time-series data and convert it to a series of three-dimensional point clouds. 

In [None]:
# Stage 1: embed the return series along axis 0 with arguments (150, 1, 1).
# Later cells shift indices by 150 to align with df, so 150 is presumably the
# sliding-window length — TODO confirm against utils.embedding.
TimeDelay = TimeDelayEmbedding(150,1,1)
ex_data = TimeDelay(data,0)
# Stage 2: embed every stage-1 window along axis 1 with arguments (3, 5, 1);
# presumably dimension 3 and delay 5, giving the three-dimensional point
# clouds — TODO confirm. The name TimeDelay is rebound here, so the stage-1
# embedder is no longer reachable after this line.
TimeDelay = TimeDelayEmbedding(3,5,1)
use_data = TimeDelay(ex_data,1)

## Number of optimal components in Persistence Parametric Model
We apply the persistence parametric model (PPM) method to the persistence diagrams (PDs) of the point clouds.

In [None]:
# For every point cloud, build a Rips complex, extract its 1-dimensional
# persistence intervals and keep the first value returned by the PPM fit
# (the selected number of mixture components).
max_K = 7
b = 100
Ks = []
for ob_data in use_data:
    # Rips filtration of the point cloud with simplices up to dimension 2.
    simplex_tree = gd.RipsComplex(points=ob_data).create_simplex_tree(max_dimension=2)
    # Persistence must be computed before the intervals can be queried.
    diag = simplex_tree.persistence()
    A = simplex_tree.persistence_intervals_in_dimension(1)
    # get_K_mu_sigma returns (K, mu, sigma); only K is recorded.
    K, mu, sigma = get_K_mu_sigma(A, max_K, b)
    Ks.append(K)

We smooth the series of the number of mixture components and apply Bayesian online change point detection (BOCPD).

In [None]:
# Trailing moving average of Ks over `smooth` consecutive values. The first
# smooth-1 positions have no complete window and are filled with 0, which
# keeps smooth_Ks aligned index-for-index with Ks.
smooth = 3
leading_pad = [0] * (smooth - 1)
window_means = [np.mean(Ks[end - smooth:end]) for end in range(smooth, len(Ks) + 1)]
smooth_Ks = leading_pad + window_means

In [None]:
# Run BOCPD over the smoothed component counts and derive one change score
# per time step. The first DELAY scores are NaN placeholders.
ALPHA, BETA, KAPPA, MU = 0.1, 1.0, 1.0, 0.0
DELAY = 15
N_trial = 1

for LAMBDA in [5]:
    for THRESHOLD in [0.3]:
        scores_bocpd = []
        for i in range(N_trial):
            X = smooth_Ks

            # BOCPD
            hazard = partial(constant_hazard, LAMBDA)
            observation_model = StudentT(ALPHA, BETA, KAPPA, MU)
            bocd = BOCD(hazard, observation_model, X)
            change_points = []
            scores_PPM = [np.nan] * DELAY
            for step, x in enumerate(X):
                bocd.update(x)
                if step < DELAY:
                    # Warm-up: feed the detector without scoring.
                    continue
                if bocd.growth_probs[DELAY] >= THRESHOLD:
                    change_points.append(bocd.t - DELAY + 1)
                # Weighted sum of the first (t - DELAY) growth probabilities,
                # the k-th one divided by (k + 2) for k = 0, 1, ...
                horizon = bocd.t - DELAY
                score = np.sum(bocd.growth_probs[:horizon] * 1.0 / (1.0 + np.arange(1, horizon + 1)))
                scores_PPM.append(score)

We evaluate the detection result.

In [None]:
# Reference change dates and their positions in the embedded series: the row
# index of each date in df, shifted back by 150.
Dates = [
    "2018-09-24", "2018-12-03", "2019-05-06",
    "2019-08-01", "2020-02-28", "2020-04-24",
    "2020-09-29", "2021-03-31", "2021-09-22",
]
# T is forwarded to get_threshold / get_evaluation in the following cells.
T = 40
true_change_points = [df.index[df["Date"] == Date][0] - 150 for Date in Dates]

In [None]:
# Pick the alarm threshold, then collect every time step whose score is
# strictly above it. NaN scores compare as False and therefore never fire.
beta = get_threshold(scores_PPM, true_change_points, T)
detections_PPM = [i for i, score_i in enumerate(scores_PPM) if score_i > beta]
# Shift score indices by the length difference between the return series and
# the smoothed series to obtain row labels into df.
shift = len(data) - len(smooth_Ks)
detect_points_PPM = [shift + i for i in detections_PPM]
data_points_PPM = [df["Close"][row] for row in detect_points_PPM]

In [None]:
# Two stacked panels sharing the date axis: closing price with PPM alarms
# (red) on top, smoothed component count below. Green dashed lines mark the
# reference change dates. The figure is also written to real_data_1_2.png.
fig = plt.figure(figsize=(20,8))
grid_style = dict(which="major", axis="x", color="black", alpha=0.2, linestyle="--", linewidth=1)

ax1 = fig.add_subplot(2,1,1)
ymin, ymax = 2000, 4900
ax1.plot(df["Date"], df["Close"])
ax1.scatter(df["Date"].iloc[detect_points_PPM], data_points_PPM, color="red")
ax1.set_ylim(ymin, ymax)
ax1.vlines(Dates, ymin, ymax, colors="green", linestyles='dashed')
plt.title("S&P500")  # applies to ax1, the current axes at this point

ax2 = fig.add_subplot(2,1,2,sharex=ax1)
ymin, ymax = 1.0, 7.0
ax2.set_ylim(ymin, ymax)
ax2.plot(df["Date"].iloc[150:], smooth_Ks)
ax2.vlines(Dates, ymin, ymax, colors="green", linestyles='dashed')
plt.title("The number of mixture (smoothed)")

for axis in (ax1, ax2):
    axis.grid(**grid_style)
plt.savefig("real_data_1_2.png",facecolor="white")
plt.show()

In [None]:
# Score the PPM alarms against the reference change points and report both
# values returned by get_evaluation.
report_template = "benefit: {}, rate of accurate alerts: {}"
benefit, acc_rate = get_evaluation(detections_PPM, true_change_points, T)
print(report_template.format(benefit, acc_rate))

## Kernel Complexity of Persistence Non-Parametric Model
We apply the persistence non-parametric model (PNPM) method to the persistence diagrams (PDs) of the point clouds.

In [None]:
# Kernel complexity of the PNPM for every point cloud. Each 1-dimensional
# persistence interval (birth, death) is mapped to (birth, death - birth)
# before being handed to get_WKC. A point cloud with no 1-dimensional
# interval contributes a complexity of 0.
KCs_PNPM = []
epsilon = 0.1
gamma = 0.7
param = 1.0
for i in range(len(use_data)):
    ob_data = use_data[i]
    rips_complex = gd.RipsComplex(points=ob_data)
    simplex_tree = rips_complex.create_simplex_tree(max_dimension = 2)
    # Persistence must be computed before the intervals can be queried.
    diag = simplex_tree.persistence()
    A = simplex_tree.persistence_intervals_in_dimension(1)
    # Handle the empty diagram first: indexing into an empty array (previously
    # done before the emptiness check) raises IndexError, which made the
    # fallback value unreachable.
    if len(A) == 0:
        KCs_PNPM.append(0)
        continue
    # Rows are intervals, columns are (birth, persistence).
    x = np.array([A[:, 0], A[:, 1] - A[:, 0]]).T
    n = len(x)
    m = len(x[0])
    KC = get_WKC(x, n, m, gamma, epsilon, param)
    KCs_PNPM.append(KC)

We apply sequential MDL-change statistics (SMDL) to the series of the kernel complexity of PNPM.

In [None]:
# Sequential MDL change scores over the kernel-complexity series. The score
# at position t uses the 2*h samples X[t-h:t+h]; the first and last h
# positions cannot be scored and are set to NaN.
h, mu_max, sigma_min = 8, 50.0, 0.005

scores_list_1st = []
scores_list_2nd = []

X = np.array(KCs_PNPM)
len_X = len(X)

norm1d = Norm1D()
smdl = SMDL(norm1d)

nan_pad = [np.nan] * h
inner_scores = []
for t in range(h, len_X - h):
    window = X[t - h:t + h]
    inner_scores.append(smdl.calc_change_score(window, h, mu_max=mu_max, sigma_min=sigma_min))
scores_0th = np.array(nan_pad + inner_scores + nan_pad)

# Kept as a one-row 2-D array, matching the layout the other names expect.
scores_list_0th = np.array([scores_0th])
scores_PNPM = scores_list_0th[0]

We evaluate the detection result.

In [None]:
# Reference change dates and their positions in the embedded series: the row
# index of each date in df, shifted back by 150.
Dates = [
    "2018-09-24", "2018-12-03", "2019-05-06",
    "2019-08-01", "2020-02-28", "2020-04-24",
    "2020-09-29", "2021-03-31", "2021-09-22",
]
# T is forwarded to get_threshold / get_evaluation in the following cells.
T = 40
true_change_points = [df.index[df["Date"] == Date][0] - 150 for Date in Dates]

In [None]:
# Pick the alarm threshold, then collect every time step whose score is
# strictly above it. NaN scores compare as False and therefore never fire.
beta = get_threshold(scores_PNPM, true_change_points, T)
detections_PNPM = [i for i, score_i in enumerate(scores_PNPM) if score_i > beta]
# Shift score indices by the length difference between the return series and
# the kernel-complexity series to obtain row labels into df.
shift = len(data) - len(KCs_PNPM)
detect_points_PNPM = [shift + i for i in detections_PNPM]
data_points_PNPM = [df["Close"][row] for row in detect_points_PNPM]

In [None]:
# Two stacked panels sharing the date axis: closing price with PNPM alarms
# (red) on top, kernel complexity below. Green dashed lines mark the
# reference change dates.
fig = plt.figure(figsize=(20,8))
grid_style = dict(which="major", axis="x", color="black", alpha=0.2, linestyle="--", linewidth=1)

ax1 = fig.add_subplot(2,1,1)
ymin, ymax = 2000, 4900
ax1.plot(df["Date"], df["Close"])
ax1.scatter(df["Date"].iloc[detect_points_PNPM], data_points_PNPM, color="red")
ax1.set_ylim(ymin, ymax)
ax1.vlines(Dates, ymin, ymax, colors="green", linestyles='dashed')
plt.title("S&P500")  # applies to ax1, the current axes at this point

ax2 = fig.add_subplot(2,1,2,sharex=ax1)
ymin, ymax = 0.05, 0.8
ax2.set_ylim(ymin, ymax)
ax2.plot(df["Date"].iloc[150:], KCs_PNPM)
ax2.vlines(Dates, ymin, ymax, colors="green", linestyles='dashed')
plt.title("Kernel Complexity")

for axis in (ax1, ax2):
    axis.grid(**grid_style)
plt.show()

In [None]:
# Score the PNPM alarms against the reference change points and report both
# values returned by get_evaluation.
report_template = "benefit: {}, rate of accurate alerts: {}"
benefit, acc_rate = get_evaluation(detections_PNPM, true_change_points, T)
print(report_template.format(benefit, acc_rate))

## Comparison to existing methods
Below we apply several existing methods to the time-series for comparison.

### L2 norm of persistence landscape

In [None]:
# L2 norm of the persistence landscape (first 3 landscape functions, 1000
# samples each) of the 1-dimensional persistence intervals of every point
# cloud.
L2_norms = []
for i in range(len(use_data)):
    ob_data = use_data[i]
    rips_complex = gd.RipsComplex(points=ob_data)
    simplex_tree = rips_complex.create_simplex_tree(max_dimension=2)
    # Persistence must be computed before the intervals can be queried.
    simplex_tree.persistence()
    A = simplex_tree.persistence_intervals_in_dimension(1)
    # A fresh transformer per diagram, as before, so that no fitted state can
    # carry over from one point cloud to the next.
    LS = Landscape(num_landscapes=3,resolution=1000)
    # Reuse the intervals fetched above instead of querying them a second time.
    L = LS.fit_transform([A])
    # L[0] is the concatenation of the sampled landscape functions, so the
    # root of the summed squared per-landscape norms is simply the norm of the
    # whole vector (equal up to floating-point rounding). This no longer
    # depends on slice bounds hard-coded to the resolution.
    L2_norms.append(np.linalg.norm(L[0], ord=2))

In [None]:
# Sequential MDL change scores over the landscape-norm series. The score at
# position t uses the 2*h samples X[t-h:t+h]; the first and last h positions
# cannot be scored and are set to NaN.
h, mu_max, sigma_min = 8, 1, 0.005

scores_list_1st = []
scores_list_2nd = []

X = np.array(L2_norms)
len_X = len(X)

norm1d = Norm1D()
smdl = SMDL(norm1d)

nan_pad = [np.nan] * h
inner_scores = []
for t in range(h, len_X - h):
    window = X[t - h:t + h]
    inner_scores.append(smdl.calc_change_score(window, h, mu_max=mu_max, sigma_min=sigma_min))
scores_0th = np.array(nan_pad + inner_scores + nan_pad)

# Kept as a one-row 2-D array, matching the layout the other names expect.
scores_list_0th = np.array([scores_0th])
scores_PL = scores_list_0th[0]

In [None]:
# Reference change dates and their positions in the embedded series: the row
# index of each date in df, shifted back by 150.
Dates = [
    "2018-09-24", "2018-12-03", "2019-05-06",
    "2019-08-01", "2020-02-28", "2020-04-24",
    "2020-09-29", "2021-03-31", "2021-09-22",
]
# T is forwarded to get_threshold / get_evaluation in the following cells.
T = 40
true_change_points = [df.index[df["Date"] == Date][0] - 150 for Date in Dates]

In [None]:
# Pick the alarm threshold, then collect every time step whose score is
# strictly above it. NaN scores compare as False and therefore never fire.
beta = get_threshold(scores_PL, true_change_points, T)
detections_PL = [i for i, score_i in enumerate(scores_PL) if score_i > beta]
# Shift score indices by the length difference between the return series and
# the landscape-norm series to obtain row labels into df.
shift = len(data) - len(L2_norms)
detect_points_PL = [shift + i for i in detections_PL]
data_points_PL = [df["Close"][row] for row in detect_points_PL]

In [None]:
# Two stacked panels sharing the date axis: closing price with the
# persistence-landscape alarms (red) on top, landscape L2 norm below. Green
# dashed lines mark the reference change dates.
fig = plt.figure(figsize=(20,8))
ax1 = fig.add_subplot(2,1,1)
ax1.plot(df["Date"],df["Close"])
ax1.scatter(df["Date"].iloc[detect_points_PL],data_points_PL,color="red")
ymin = 2000
ymax = 4900
ax1.set_ylim(ymin,ymax)
ax1.vlines(Dates, ymin, ymax, "green", linestyles='dashed')
# Plain ampersand: "\&" is not a valid Python escape sequence (it triggers an
# invalid-escape warning) and would be drawn with a literal backslash. The
# other figures in this notebook already use "S&P500".
plt.title("S&P500")
ax2 = fig.add_subplot(2,1,2,sharex=ax1)
ymin = 0.0
ymax = 3.0
ax2.set_ylim(ymin,ymax)
ax2.plot(df["Date"].iloc[150:],L2_norms)
ax2.vlines(Dates, ymin, ymax, "green", linestyles='dashed')
plt.title("L2 norm")
# Light vertical grid lines on both panels to ease reading dates across them.
ax1.grid(which = "major", axis = "x", color = "black", alpha = 0.2,linestyle = "--", linewidth = 1)
ax2.grid(which = "major", axis = "x", color = "black", alpha = 0.2,linestyle = "--", linewidth = 1)
plt.show()

In [None]:
# Score the persistence-landscape alarms against the reference change points
# and report both values returned by get_evaluation.
report_template = "benefit: {}, rate of accurate alerts: {}"
benefit, acc_rate = get_evaluation(detections_PL, true_change_points, T)
print(report_template.format(benefit, acc_rate))

### Sequential MDL-change statistics (SMDL)

In [None]:
# Sequential MDL change scores computed directly on the return series. The
# score at position t uses the 2*h samples X[t-h:t+h]; the first and last h
# positions cannot be scored and are set to NaN.
h, mu_max, sigma_min = 18, 50.0, 0.005

scores_list_1st = []
scores_list_2nd = []

X = np.array(data)
len_X = len(X)

norm1d = Norm1D()
smdl = SMDL(norm1d)

nan_pad = [np.nan] * h
inner_scores = []
for t in range(h, len_X - h):
    window = X[t - h:t + h]
    inner_scores.append(smdl.calc_change_score(window, h, mu_max=mu_max, sigma_min=sigma_min))
scores_0th = np.array(nan_pad + inner_scores + nan_pad)

# Kept as a one-row 2-D array, matching the layout the other names expect.
scores_list_0th = np.array([scores_0th])
scores_SMDL = scores_list_0th[0]

In [None]:
# Reference change points as row indices of df, without the 150 shift used
# for the embedded series, because this method scores the raw return series.
# T is forwarded to get_threshold / get_evaluation in the following cells.
T = 40
true_change_points_original = [df.index[df["Date"] == Date][0] for Date in Dates]

In [None]:
# Pick the alarm threshold, then collect every time step whose score is
# strictly above it. NaN scores compare as False and therefore never fire.
beta = get_threshold(scores_SMDL, true_change_points_original, T)
detections_SMDL = [i for i, score_i in enumerate(scores_SMDL) if score_i > beta]
# Shift score indices by the length difference between the return series and
# the score series to obtain row labels into df.
shift = len(data) - len(scores_SMDL)
detect_points_SMDL = [shift + i for i in detections_SMDL]
data_points_SMDL = [df["Close"][row] for row in detect_points_SMDL]

In [None]:
# Evaluate only the alarms raised after index 150.
# NOTE(review): presumably this restricts SMDL to the span that the
# embedding-based methods can cover — confirm.
report_template = "benefit: {}, rate of accurate alerts: {}"
true_detections_SMDL = np.array(detections_SMDL)
after_warmup = true_detections_SMDL > 150
benefit, acc_rate = get_evaluation(true_detections_SMDL[after_warmup], true_change_points_original, T)
print(report_template.format(benefit, acc_rate))

### Bayesian online change point detection (BOCPD)

In [None]:
# Run BOCPD directly on the return series and derive one change score per
# time step. The first DELAY scores are NaN placeholders.
ALPHA, BETA, KAPPA, MU = 0.1, 1.0, 1.0, 0.0
DELAY = 15
N_trial = 1

for LAMBDA in [3]:
    for THRESHOLD in [0.3]:
        scores_bocpd = []
        for i in range(N_trial):
            X = data

            # BOCPD
            hazard = partial(constant_hazard, LAMBDA)
            observation_model = StudentT(ALPHA, BETA, KAPPA, MU)
            bocd = BOCD(hazard, observation_model, X)
            change_point = []
            scores_BOCPD = [np.nan] * DELAY
            for step, x in enumerate(X):
                bocd.update(x)
                if step < DELAY:
                    # Warm-up: feed the detector without scoring.
                    continue
                if bocd.growth_probs[DELAY] >= THRESHOLD:
                    change_point.append(bocd.t - DELAY + 1)
                # Weighted sum of the first (t - DELAY) growth probabilities,
                # the k-th one divided by (k + 2) for k = 0, 1, ...
                horizon = bocd.t - DELAY
                score = np.sum(bocd.growth_probs[:horizon] * 1.0 / (1.0 + np.arange(1, horizon + 1)))
                scores_BOCPD.append(score)

In [None]:
# Reference change points as row indices of df, without the 150 shift used
# for the embedded series, because this method scores the raw return series.
# T is forwarded to get_threshold / get_evaluation in the following cells.
T = 40
true_change_points_original = [df.index[df["Date"] == Date][0] for Date in Dates]

In [None]:
# Pick the alarm threshold, then collect every time step whose score is
# strictly above it. NaN scores compare as False and therefore never fire.
beta = get_threshold(scores_BOCPD, true_change_points_original, T)
detections_BOCPD = [i for i, score_i in enumerate(scores_BOCPD) if score_i > beta]
# Shift score indices by the length difference between the return series and
# the score series to obtain row labels into df.
shift = len(data) - len(scores_BOCPD)
detect_points_BOCPD = [shift + i for i in detections_BOCPD]
data_points_BOCPD = [df["Close"][row] for row in detect_points_BOCPD]

In [None]:
# Score the BOCPD alarms against the reference change points and report both
# values returned by get_evaluation.
report_template = "benefit: {}, rate of accurate alerts: {}"
benefit, acc_rate = get_evaluation(detections_BOCPD, true_change_points_original, T)
print(report_template.format(benefit, acc_rate))