# Task 0: predicting structural breaks in sequential Google Maestro data

Uses sequential data + regression. 

Potentially useful articles:
- Zhao, Kaiguang, et al. "Detecting change-point, trend, and seasonality in satellite time series data to track abrupt changes and nonlinear dynamics: A Bayesian ensemble algorithm." Remote sensing of Environment 232 (2019): 111181.
- Jiang, Yu, Zhe Song, and Andrew Kusiak. "Very short-term wind speed forecasting with Bayesian structural break model." Renewable energy 50 (2013): 637-647.
- Pesaran, M. Hashem, Davide Pettenuzzo, and Allan Timmermann. "Forecasting time series subject to multiple structural breaks." The Review of Economic Studies 73.4 (2006): 1057-1084.
- De Brouwer, Edward, et al. "Gru-ode-bayes: Continuous modeling of sporadically-observed time series." arXiv preprint arXiv:1905.12374 (2019).
- Thies, Sven, and Peter Molnár. "Bayesian change point analysis of Bitcoin returns." Finance Research Letters 27 (2018): 223-227.

In [1]:
from __future__ import division
from more_itertools import peekable
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as scp
import pickle
import magenta
import os, time, re, json, glob
%matplotlib inline
from IPython.core.display import display, HTML
### change width of notebook display
# display(HTML("<style>.container { width:70% !important; }</style>"))
from pathlib import Path
import ipdb;
def debug(): ipdb.set_trace() # debugging starts here

import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

# for exposing API
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
from flask import Flask
from flask_restful import Resource, Api

JUPYTER_PICKLE_FILE = "config/shared_jupyter_data.pkl"
def write_shared_jupyter(key, value, path=JUPYTER_PICKLE_FILE, overwrite=False):
    if (os.path.exists(path)):
        with open(path, "rb") as fp:
            shared_jupyter_data = pickle.load(fp)
        if overwrite:
            shared_jupyter_data = {key: value}
        else:
            shared_jupyter_data[key] = value
    else:
        shared_jupyter_data = {key: value}
    with open(path, 'wb') as fp: 
        pickle.dump(shared_jupyter_data, fp)

def read_shared_jupyter(key=None, path=JUPYTER_PICKLE_FILE):
    if (os.path.exists(path)):
        with open(path, "rb") as fp:
            shared_jupyter_data = pickle.load(fp)
            if key is not None:
                if key in shared_jupyter_data:
                    return(shared_jupyter_data[key])
                else:
                    print("Not found!")
                    return(None)
            else:
                return(shared_jupyter_data)
    else:
        print("No data")

def pandasToJson(df):
    return(df.to_json(orient="split"))
def jsonToPandas(json):
    return(pd.read_json(json, orient="split"))

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

FIG_WIDTH = 1200
FIG_HEIGHT = 800

PITCH_MIN = 20
PITCH_MAX = 120
VELOCITY_MIN = 0
VELOCITY_MAX = 120

def hheader(x):
    print("#########################################")
    print("### {}".format(x))
    print("#########################################")

# Magenta dependencies:
# https://github.com/magenta/magenta

# Magenta uses pretty_midi to deal with midi files
import pretty_midi

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit


In [2]:
### Set up overall folder for task 1
INPUT_FOLDER = "data/maestro/maestro-v3.0.0/"
add_input_folder = lambda x: "{}/{}".format(INPUT_FOLDER, x).replace("//", "/")

OUTPUT_FOLDER = "task1_sequential_learning"
if not os.path.exists(OUTPUT_FOLDER):
    os.mkdir(OUTPUT_FOLDER)
add_output_path = lambda x: "{}/{}".format(OUTPUT_FOLDER, x)

print(OUTPUT_FOLDER)

task1_sequential_learning


# Step 1 - Read in the Google Maestro data (MIDI)

In [3]:
RUN_STEP_1 = False

### Set up folder for step 1
STEP1_FOLDER = add_output_path("step1")
if not os.path.exists(STEP1_FOLDER):
    os.mkdir(STEP1_FOLDER)
add_step1_path = lambda x: "{}/{}".format(STEP1_FOLDER, x)

print(STEP1_FOLDER)

task1_sequential_learning/step1


In [4]:
if RUN_STEP_1:
    ### read in MIDI metadata
    midi_metadata = pd.read_csv(add_input_folder("maestro-v3.0.0.csv"))
    midi_metadata.head()

    ### read in MIDI files
    filenames = [str(path) for path in Path('data/').rglob('*.midi')]

    all_raw_dfs = []
    all_agg_dfs = []
    for cf, curr_file in enumerate(filenames):
        curr_file_fmt = curr_file.replace("\\", "/").replace(INPUT_FOLDER, "")
        curr_file_out_seed = curr_file_fmt.replace("/","__").replace(".midi", "")
        print("Processing file {} / {}".format(cf+1, len(filenames)))

        """ Raw MIDI file + metadata """
        curr_midi = pretty_midi.PrettyMIDI(curr_file)
        solo_piano_part = curr_midi.instruments[0]
        df_notes = pd.DataFrame([(n.start, n.end, n.pitch, n.velocity, n.duration) for n in solo_piano_part.notes],
                                columns=['start', 'end', 'pitch', 'velocity', 'duration'])
        df_metadata = midi_metadata[midi_metadata['midi_filename']==curr_file_fmt]
        df_metadata = df_metadata.rename(columns={"duration": "total_duration"})
        df_metadata.drop(columns="audio_filename", inplace=True) # don't need audio for now
        curr_filename = df_metadata['midi_filename'].values[0].replace("/", "__")
        df_curr = pd.merge(df_notes, df_metadata, how="cross")
        df_curr = df_curr.sort_values(by='start')
        # df_curr.to_csv(add_step1_path(curr_file_out_seed + "_raw_" + ".csv"))
        all_raw_dfs.append(df_curr)

        """ Time bar file + metadata """
        # Do not adjust start
        EVERY_N_SEC = 1
        df_curr_time = df_curr.copy()
        df_curr_time['start_sec'] =  (df_curr_time['start'] / EVERY_N_SEC).apply(np.floor).astype(int) * EVERY_N_SEC
        tmp = df_curr_time.select_dtypes(include=[np.number])
        df_curr_time.loc[:, tmp.columns] = np.round(tmp, decimals=5)
        def str_concat(x): return(','.join([str(s) for s in x]))
        df_curr_time_agg = df_curr_time.groupby(['start_sec'], as_index=False).agg({
            'start': [len, np.min, np.mean, np.median, np.max, str_concat],
            'end': [np.min, np.mean, np.median, np.max, str_concat],
            'pitch': [np.min, np.mean, np.median, np.max, str_concat],
            'velocity': [np.min, np.mean, np.median, np.max, str_concat],
            'duration': [np.min, np.mean, np.median, np.max, str_concat],
            # metadata features - same for all observations
            'canonical_composer': [pd.Series.mode],
            'canonical_title': [pd.Series.mode],
            'split': [pd.Series.mode],
            'year': [pd.Series.mode],
            'midi_filename': [pd.Series.mode],
            'total_duration': [pd.Series.mode] })
        df_curr_time_agg.columns = ['_'.join([cc for cc in c if len(cc) > 0])
                                    .replace("amin", "min").replace("amax", "max").replace("start_len", "n_notes")
                                    for c in list(df_curr_time_agg.columns)]
        df_curr_time_agg = pd.merge(df_curr_time_agg, # add missing times
                                 pd.DataFrame(range(1, df_curr_time_agg.shape[0], 1), columns=["start_sec"]), on="start_sec",
                                how='right').sort_values(by="start_sec")
        # df_curr_time_agg.to_csv(add_step1_path(curr_file_out_seed + "_agg_" + ".csv"))
        all_agg_dfs.append(df_curr_time_agg)

In [5]:
### Write out big combined dataframes (<3 min)
fp_raw_combined = add_step1_path("maestro_raw.csv")
fp_agg_combined = add_step1_path("maestro_agg.csv")
if RUN_STEP_1:
    pd.concat([df.assign(piece_num=(dx+1)) for dx, df in enumerate(all_raw_dfs)]).to_csv(fp_raw_combined, index=False)
    pd.concat([df.assign(piece_num=(dx+1)) for dx, df in enumerate(all_agg_dfs)]).to_csv(fp_agg_combined, index=False)

# Step 2 - read in combined data

In [6]:
RUN_STEP_2 = True

### Set up folder for step 2
STEP2_FOLDER = add_output_path("step2")
if not os.path.exists(STEP2_FOLDER):
    os.mkdir(STEP2_FOLDER)
add_step2_path = lambda x: "{}/{}".format(STEP2_FOLDER, x)

print(STEP2_FOLDER)

task1_sequential_learning/step2


In [7]:
### <2 min
df_maestro_raw = pd.read_csv(fp_raw_combined)
df_maestro_agg = pd.read_csv(fp_agg_combined)

print(df_maestro_raw.shape)
display(df_maestro_raw.head())
print(df_maestro_agg.shape)
display(df_maestro_agg.head())
# all pieces start at least at start second 1 or later
# (df_maestro_agg.start_min - df_maestro_agg.start_sec).describe()

(7040164, 12)


Unnamed: 0,start,end,pitch,velocity,duration,canonical_composer,canonical_title,split,year,midi_filename,total_duration,piece_num
0,1.092708,1.189583,71,60,0.096875,Johann Sebastian Bach,French Suite No. 5 in G Major,train,2004,2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORI...,968.256759,1
1,1.279167,1.496875,55,44,0.217708,Johann Sebastian Bach,French Suite No. 5 in G Major,train,2004,2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORI...,968.256759,1
2,1.288542,1.79375,71,54,0.505208,Johann Sebastian Bach,French Suite No. 5 in G Major,train,2004,2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORI...,968.256759,1
3,1.463542,1.63125,59,55,0.167708,Johann Sebastian Bach,French Suite No. 5 in G Major,train,2004,2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORI...,968.256759,1
4,1.633333,1.753125,62,52,0.119792,Johann Sebastian Bach,French Suite No. 5 in G Major,train,2004,2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORI...,968.256759,1


(670870, 32)


Unnamed: 0,start_sec,n_notes,start_min,start_mean,start_median,start_max,start_str_concat,end_min,end_mean,end_median,end_max,end_str_concat,pitch_min,pitch_mean,pitch_median,pitch_max,pitch_str_concat,velocity_min,velocity_mean,velocity_median,velocity_max,velocity_str_concat,duration_min,duration_mean,duration_median,duration_max,duration_str_concat,canonical_composer_mode,split_mode,year_mode,total_duration_mode,piece_num
0,1,9.0,1.09271,1.590392,1.63333,1.98333,"1.09271,1.27917,1.28854,1.46354,1.63333,1.7864...",1.18958,1.812616,1.79375,2.52292,"1.18958,1.49688,1.79375,1.63125,1.75312,1.8281...",55.0,65.333333,67.0,74.0,715571596272677457,44.0,58.444444,56.0,76.0,604454555276566861,0.04167,0.222222,0.16771,0.53958,"0.09687,0.21771,0.50521,0.16771,0.11979,0.0416...",Johann Sebastian Bach,train,2004.0,968.25676,1
1,2,11.0,2.0375,2.430494,2.52292,2.8875,"2.0375,2.09792,2.14896,2.17188,2.32812,2.52292...",2.10625,2.691477,2.58958,3.44792,"2.10625,2.18229,2.51979,2.28021,2.50938,2.5895...",59.0,68.454545,71.0,74.0,7274726766716472745962,35.0,57.636364,58.0,77.0,7751605758683547685063,0.06667,0.260985,0.18125,0.76875,"0.06875,0.08438,0.37083,0.10833,0.18125,0.0666...",Johann Sebastian Bach,train,2004.0,968.25676,1
2,3,7.0,3.08021,3.571726,3.61979,3.99271,"3.08021,3.25,3.45312,3.61979,3.79896,3.80729,3...",3.27292,4.028869,3.96667,5.32812,"3.27292,5.32812,3.60208,3.80625,4.02604,3.9666...",59.0,70.571429,71.0,79.0,66677174785979,58.0,67.571429,66.0,78.0,63586678775873,0.14896,0.457143,0.19271,2.07812,"0.19271,2.07812,0.14896,0.18646,0.22708,0.1593...",Johann Sebastian Bach,train,2004.0,968.25676,1
3,4,8.0,4.17292,4.532163,4.521355,4.91979,"4.17292,4.18229,4.34896,4.51875,4.52396,4.6968...",4.33333,4.732423,4.694795,5.30729,"4.34271,4.33333,4.52083,4.68438,4.70521,4.8979...",57.0,69.5,73.0,79.0,7660797459797257,47.0,63.375,65.5,76.0,7654687059637047,0.15104,0.200261,0.17292,0.3875,"0.16979,0.15104,0.17188,0.16563,0.18125,0.2010...",Johann Sebastian Bach,train,2004.0,968.25676,1
4,5,7.0,5.07812,5.449553,5.44062,5.81042,"5.07812,5.27188,5.27812,5.44062,5.63333,5.6343...",5.25833,5.730654,5.63542,6.35833,"5.25833,5.45104,5.61458,5.63542,5.79688,6.3583...",55.0,69.714286,71.0,81.0,71726681725571,61.0,69.428571,70.0,73.0,72736173706770,0.16354,0.281101,0.18958,0.72396,"0.18021,0.17917,0.33646,0.19479,0.16354,0.7239...",Johann Sebastian Bach,train,2004.0,968.25676,1


Raw is basically a dataframe of all the raw irregular time series (concatenated), while agg is basically a processed version that is by every 1 second, 2 second, etc. (still aligned with raw file timestamps, NOT realigned).

# Step 3 - modeling

In [8]:
### Set the time series to predict etc.
ts = df_maestro_agg[['pitch_mean']]

### Can always do multivariate, other time series, etc. later.

### Model 1: Kalman filter + CUSUM for structural breaks

Source:
- Puhm, Martin, et al. "A Near Real-Time Method for Forest Change Detection Based on a Structural Time Series Model and the Kalman Filter." Remote Sensing 12.19 (2020): 3135.

In [10]:
""" Model parameters """

from scipy.stats import multivariate_normal
from statsmodels.tsa.stattools import acovf
from scipy.ndimage.interpolation import shift
np.random.seed(42)
np.set_printoptions(suppress=True) 
def pandas_fill(arr):
    df = pd.DataFrame(arr)
    df = df.fillna(method='ffill', axis=1, inplace=False)
    out = df.to_numpy()
    return out

# for now, keep the noise matrices as constants (can improve later)
# observations x: note, different dimension from Z (unlike sample code)
df_X = None
x_obs = None
[n,d] = [None, None]
x_obs_nonna = None
# number of lags (minus one)
L = 14
### Latent state equation - vector z_t
# z_t = A_t z_{t-1} + <other regressors> + w_t
At = np.eye(L) # (L x L) - paper says to leave this as identity (random walk)
Qt = np.diag(np.ones(L)) # (L x L) covariance of errors in state eq. - TUNE THIS
### Observation equation - scalar x_t
# x_t = C_t z_t + <other regressors> + v_t
Ct = np.ones((1,L)) # (1 x L) - ARIMA lags - WILL OVERRIDE
Rt = np.diag(np.ones(1)) # (1 x 1)
# Simulate some plausible values for Qt and Rt
# (could make Rt self-updating if time)
Qt = np.diag(np.random.gamma(1,0.1,size=L)) # covar. mat is symmetric
Rt = np.random.gamma(3, 4, size=np.diag(np.ones(1)).shape)
# Initial conditions (for the state variable z, stored in mu vars)
# z0 = x_obs[1:(L+1)]
# initialize to equal weight on all lags
z0 = np.repeat(1/L, L).reshape(-1, 1) # make state variable the WEIGHTS on lags NOT the LAGS themselves
P0 = Qt
#Objects to store predictions and filtering locations
Z = None
Zpred = None
Xpred = None
# store predictions
z = z0 # predicted zhat, ()
P = P0 # covariance of zhat, dim(L, L)
Ct = Ct0 = None


""" Train model """

# observations x: note, different dimension from Z (unlike sample code)
df_X = ts
x_obs = np.c_[df_X]
[n,d] = x_obs.shape
# create version of observations forward-filled
x_obs_nonna = x_obs.copy()
x_obs_nonna = pd.DataFrame(x_obs_nonna).ffill()[0].to_numpy().reshape(-1, 1)
#Objects to store predictions and filtering locations
Z = np.zeros((n,L))
Zpred = np.zeros((n,L))
Xpred = np.zeros((n,1))
Ct0 = x_obs_nonna[0:(L)].T
# fill forward if NAs at the start (edge case)
if np.isnan(Ct0).any():
    Ct0 = pandas_fill(Ct0)
Ct = Ct0

######## Train predictive model on historical data so far (will take some time)
timerange = range(L, n-1) # data starts at 0 so start analysis at L for L lags
for i in timerange:
    if (i % 10000 == 0): print("Training observation {} / {} ...".format(i+1, n-1))
    ### Get current observations xt, t
    # https://stats.stackexchange.com/questions/140990/using-kalman-filters-to-impute-missing-values-in-time-series
    x = np.array(x_obs[i])
    missing = any(np.isnan(x))
    ### Prediction step using previous data against new data ---------------------------------------
    # zhat, t|t-1
    z = At.dot(z)
    # Phat, t|t-1
    P = At.dot(P).dot(At.T) + Qt
    Zpred[i,:] = z.T
    xhat = Ct.dot(z)
#     print("{} -----------------------".format(i+1))
#     print(x_obs[(i-L):(i+1)].T) # all values thru i
#     print(">>> Ct: ", Ct)
#     print(">>> z.T: ", z.T)
#     print(Ct.dot(z)) # predicted value for i
    Xpred[i,:] = xhat
    
    ### Measurement update incorporating new data ---------------------------------------------------
    # handle missing
    if (missing):
        Z[i,:] = np.nan
        continue
    ### embed ARIMA within the Kalman filter
    Ct = x_obs_nonna[(i-L):(i)].T
    ### Calculate Kalman gain and update log-likehood
    # Kalman gain Kt
    S = Ct.dot(P).dot(Ct.T) + Rt
    Kt = P.dot(Ct.T).dot( np.linalg.inv(S))
    ### Measurement update step
    z = z + Kt.dot(x - Ct.dot(z))
    Z[i,:] = z.T
    # update P, t|t
    P = P - Kt.dot(Ct).dot(P)
    ### No backward smoothing - intended to work real-time
    
#     print()
#     if (i > 50):
#         raise Exception()

Training observation 10001 / 670869 ...
Training observation 20001 / 670869 ...
Training observation 30001 / 670869 ...
Training observation 40001 / 670869 ...
Training observation 50001 / 670869 ...
Training observation 60001 / 670869 ...
Training observation 70001 / 670869 ...
Training observation 80001 / 670869 ...
Training observation 90001 / 670869 ...
Training observation 100001 / 670869 ...
Training observation 110001 / 670869 ...
Training observation 120001 / 670869 ...
Training observation 130001 / 670869 ...
Training observation 140001 / 670869 ...
Training observation 150001 / 670869 ...
Training observation 160001 / 670869 ...
Training observation 170001 / 670869 ...
Training observation 180001 / 670869 ...
Training observation 190001 / 670869 ...
Training observation 200001 / 670869 ...
Training observation 210001 / 670869 ...
Training observation 220001 / 670869 ...
Training observation 230001 / 670869 ...
Training observation 240001 / 670869 ...
Training observation 2500

In [None]:
""" Model results """

# (because zero-indexed and need lags 0 ... L-1 for the first pred)
PLOT_TS_START = 0 # start from time L
PLOT_TS_END = 100

### Plot predictions
# t=timerange # start from when have enough lags
t = timerange[PLOT_TS_START:PLOT_TS_END]
df_plt = pd.DataFrame({
    "time": t,
    "raw": x_obs[t, 0],
    "pred": Xpred[t, 0]
#     "predicted": Zpred[timerange,0],
#     "filtered": Z[timerange, 0]
})
df_plt['diff'] = df_plt['raw'].shift() # just for debugging
df_plt['diff_error'] = df_plt['raw'] - df_plt['diff']
df_plt['error'] = (x_obs[t,0] - Xpred[t,0])
df_plt['error_sqr'] = (x_obs[t,0] - Xpred[t,0])**2
display(df_plt.head(25))
fig1 = px.line(df_plt, x="time", y=["raw", "pred"], template="plotly_dark")

### Plot prediction errors
fig2 = px.line(df_plt, x="time", y=["error"], template="plotly_dark")

### Plot diff residuals of actual series (for debugging)
### If really just learning the diff, then this should equal prediction error
fig2b = px.line(df_plt, x="time", y=["diff_error"], template="plotly_dark")

### Plot Kalman-estimated coefficient paths (coefs on lags)
df_coefs = pd.DataFrame(Zpred[t,])
df_coefs.columns = ["l{}".format(l) for l in range(1, df_coefs.shape[1]+1)]
df_coefs = df_coefs.reset_index()
fig3 = px.line(df_coefs, x="index",
               y=[c for c in df_coefs.columns if c != "index"], template="plotly_dark")

print("Error stats for {} lags: ".format(L))
print("MSE: {}\n\n".format(df_plt[['error_sqr']].mean()))
print(df_plt[['error']].describe())
display(fig1)
display(fig2)
# display(fig2b)
display(fig3)

So it does look like ARIMA-Kalman is basically just learning the difference of the time series. (I have verified that the values are not exactly the same.) To refine this, should do better tuning of the lags and perhaps of the $R_t, Q_t$ matrices.

CUSUM on randomly generated data

In [97]:
# """ Simple example of sequential CUSUM - working! """
# x = np.random.randn(300)/5
# x[100:200] += np.arange(0, 4, 4/100)

# """ Model setup """

# threshold = 1
# drift = 0
# gp, gn = np.zeros(x_obs.size), np.zeros(x_obs.size)
# ta, tai, taf = np.array([[], [], []], dtype=int)
# tap, tan = 0, 0
# amp = np.array([])

# """ Train sequentially """

# # timerange = range(L, n-1) # data starts at 0 so start analysis at L for L lags
# timerange = range(len(x))
# for i in timerange:
#     if (i % 10000 == 0): print("Training observation {} / {} ...".format(i+1, n-1))

#     s = np.array(x[i]) - np.array(x[i-1])
#     gp[i] = gp[i-1] + s - drift  # cumulative sum for + change
#     gn[i] = gn[i-1] - s - drift  # cumulative sum for - change
#     if gp[i] < 0:
#         gp[i], tap = 0, i
#     if gn[i] < 0:
#         gn[i], tan = 0, i
#     if gp[i] > threshold or gn[i] > threshold:  # change detected!
#         ta = np.append(ta, i)    # alarm index
#         tai = np.append(tai, tap if gp[i] > threshold else tan)  # start
#         gp[i], gn[i] = 0, 0      # reset alarm
#     # Your modeling code here

# # plot
# df_plt = pd.DataFrame({
#     "t": timerange,
#     "x": x
# })
# df_plt = pd.merge(df_plt, pd.DataFrame({"t": ta, "break": 1}), on="t", how="left").fillna(0)
# fig = px.line(df_plt, x="t", y="x", template="plotly_dark")
# fig.add_scatter(x=df_plt['t'], y=df_plt['x'], mode="markers", marker_color=df_plt['break'])
# fig

CUSUM on the raw series

In [100]:

x = x_obs.flatten()

""" Model setup """

threshold = 1
drift = 0
gp, gn = np.zeros(x.size), np.zeros(x.size)
ta, tai, taf = np.array([[], [], []], dtype=int)
tap, tan = 0, 0
amp = np.array([])

""" Train sequentially """
# timerange = range(L, n-1) # data starts at 0 so start analysis at L for L lags
timerange = range(L, L + 90)
for i in timerange:
    if (i % 10000 == 0): print("Training observation {} / {} ...".format(i+1, n-1))
    s = np.array(x[i]) - np.array(x[i-1])
    gp[i] = gp[i-1] + s - drift  # cumulative sum for + change
    gn[i] = gn[i-1] - s - drift  # cumulative sum for - change
    if gp[i] < 0:
        gp[i], tap = 0, i
    if gn[i] < 0:
        gn[i], tan = 0, i
    if gp[i] > threshold or gn[i] > threshold:  # change detected!
        ta = np.append(ta, i)    # alarm index
        tai = np.append(tai, tap if gp[i] > threshold else tan)  # start
        gp[i], gn[i] = 0, 0      # reset alarm
    # Your modeling code here
    
""" Plot """
df_plt = pd.DataFrame({
    "t": timerange,
    "x": x[timerange]
})
df_plt = pd.merge(df_plt, pd.DataFrame({"t": ta, "break": 1}), on="t", how="left").fillna(0)
fig = px.line(df_plt, x="t", y="x", template="plotly_dark")
fig.add_scatter(x=df_plt['t'], y=df_plt['x'], mode="markers", marker_color=df_plt['break'])
fig

So clearly it should be better to do as high-quality prediction (regression) as possible then do CUSUM on the residuals. Focus on the prediction part first w/CUSUM, then do other change detection algos later if time.

### Model 2: Bayesian regression + CUSUM on the residuals

Sequential as usual.

In [98]:
x = x_obs.flatten()

""" Model setup """

### Online Bayesian linear regression with REG_L lags
REG_L = 10 # use last 10 seconds



### CUSUM parameters
threshold = 1
drift = 0
gp, gn = np.zeros(x.size), np.zeros(x.size)
ta, tai, taf = np.array([[], [], []], dtype=int)
tap, tan = 0, 0
amp = np.array([])

""" Train sequentially """
# timerange = range(L, n-1) # data starts at 0 so start analysis at L for L lags
timerange = range(L, L + 90)
for i in timerange:
    if (i % 10000 == 0): print("Training observation {} / {} ...".format(i+1, n-1))
    
    # Bayesian regression
    
    # CUSUM on the (online) residuals
    s = np.array(x[i]) - np.array(x[i-1])
    gp[i] = gp[i-1] + s - drift  # cumulative sum for + change
    gn[i] = gn[i-1] - s - drift  # cumulative sum for - change
    if gp[i] < 0:
        gp[i], tap = 0, i
    if gn[i] < 0:
        gn[i], tan = 0, i
    if gp[i] > threshold or gn[i] > threshold:  # change detected!
        ta = np.append(ta, i)    # alarm index
        tai = np.append(tai, tap if gp[i] > threshold else tan)  # start
        gp[i], gn[i] = 0, 0      # reset alarm
    # Your modeling code here
    
""" Plot """
df_plt = pd.DataFrame({
    "t": timerange,
    "x": x[timerange]
})
df_plt = pd.merge(df_plt, pd.DataFrame({"t": ta, "break": 1}), on="t", how="left").fillna(0)
fig = px.line(df_plt, x="t", y="x", template="plotly_dark")
fig.add_scatter(x=df_plt['t'], y=df_plt['x'], mode="markers", marker_color=df_plt['break'])
fig

### Model 3: Bayesian CUSUM

Directly adjusted.

- http://www.prodsyse.com/Bayes-Adj%20Cusum2.pdf
- https://cran.r-project.org/web/packages/spcadjust/vignettes/CUSUM_LinearRegression.html

In [248]:
""" Model setup """
import scipy as sc

REG_L = 3

#set up prior parameters
mu_0 = np.zeros(REG_L)
XtX = None
Om_0_inv = None
a_0 = 0.01
b_0 = 0.01
model_initialized = False

""" Train sequentially """
# collect data
Xall = []
Yall = []
timerange = range(REG_L, n-1) # data starts at 0 so start analysis at REG_L for REG_L lags
for i in timerange:
    if (i % 10000 == 0): print("Training observation {} / {} ...".format(i+1, n-1))
    x = np.array(x_obs[i])
    xl = np.array(x_obs[(i-REG_L):i])
    
    # collect enough data to set up the unit information prior
    if (i <= 100):
        Xall.append(np.array(x_obs[(i-REG_L):i])) # X matrix is only lags
        Yall.append(np.array(x_obs[i]))
        continue
    
    if not model_initialized:
        Y = np.concatenate(Yall)
        X = np.concatenate(Xall, axis=1).T
        Xt = X.T
        XtX = Xt.dot(X) / n
        Om_0_inv = XtX / n
        #calculate posterior parameters
        Om_n_inv = XtX + Om_0_inv
        Om_n = sc.linalg.inv(Om_n_inv)
        term1 = Om_0_inv.dot(mu_0)+X.T.dot(Y)
        mu_n = Om_n.dot(term1)
        a_n = a_0 + n/2
        term2 = Y.T.dot(Y)+mu_0.dot(Om_0_inv.dot(mu_0))+mu_n.dot(Om_n_inv.dot(mu_n))
        b_n = b_0 + term2/2
    
    # Now update for current data point
    
        
    raise Exception()
    
""" Plot """

# Plot code here

Exception: 

In [247]:
XtX / n

array([[0.00000094, 0.00000094, 0.00000093],
       [0.00000094, 0.00000094, 0.00000094],
       [0.00000093, 0.00000094, 0.00000094]])

In [None]:
""" Model setup """

# Parameters here

""" Train sequentially """
timerange = range(L, n-1) # data starts at 0 so start analysis at L for L lags
for i in timerange:
    if (i % 10000 == 0): print("Training observation {} / {} ...".format(i+1, n-1))
    x = np.array(x_obs[i])
    
    # Your modeling code here
    
""" Plot """

# Plot code here

# Presentation: stream and visualize music data with structural breaks

Steam and visualize music (+audio) in realtime, and tag with detected structural breaks (w/information - e.g. this was a structural break in pitch, velocity etc.) from the above models.

Note, the tagging itself will be on the 1-second level because we are training on the aggregated data with that granularity.

### (my notes)

- Task 1 (detecting structural breaks) is unsupervised learning - tagging structural breaks. Task 2 may use supervised learning.
- For Bayesian models, call R pkgs / methods from Python in order to save time. (No need to reinvent the wheel.)