# Task 1: predicting structural breaks in sequential music data

Uses sequential data + regression. 

Data:
- http://resources.mpi-inf.mpg.de/SMD/SMD_MIDI-Audio-Piano-Music.html

Potentially useful articles:
- Zhao, Kaiguang, et al. "Detecting change-point, trend, and seasonality in satellite time series data to track abrupt changes and nonlinear dynamics: A Bayesian ensemble algorithm." Remote sensing of Environment 232 (2019): 111181.
- Jiang, Yu, Zhe Song, and Andrew Kusiak. "Very short-term wind speed forecasting with Bayesian structural break model." Renewable energy 50 (2013): 637-647.
- Pesaran, M. Hashem, Davide Pettenuzzo, and Allan Timmermann. "Forecasting time series subject to multiple structural breaks." The Review of Economic Studies 73.4 (2006): 1057-1084.
- De Brouwer, Edward, et al. "Gru-ode-bayes: Continuous modeling of sporadically-observed time series." arXiv preprint arXiv:1905.12374 (2019).
- Thies, Sven, and Peter Molnár. "Bayesian change point analysis of Bitcoin returns." Finance Research Letters 27 (2018): 223-227.

In [142]:
from __future__ import division
from more_itertools import peekable
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as scp
import pickle
import magenta
import os, time, re, json, glob
%matplotlib inline
from IPython.core.display import display, HTML
### change width of notebook display
# display(HTML("<style>.container { width:70% !important; }</style>"))
from pathlib import Path
import ipdb;
def debug(): ipdb.set_trace() # debugging starts here
pd.set_option('display.float_format', lambda x: '%.4f' % x)
np.set_printoptions(suppress=True)

import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

# for exposing API
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
from flask import Flask
from flask_restful import Resource, Api

JUPYTER_PICKLE_FILE = "config/shared_jupyter_data.pkl"
def write_shared_jupyter(key, value, path=JUPYTER_PICKLE_FILE, overwrite=False):
    if (os.path.exists(path)):
        with open(path, "rb") as fp:
            shared_jupyter_data = pickle.load(fp)
        if overwrite:
            shared_jupyter_data = {key: value}
        else:
            shared_jupyter_data[key] = value
    else:
        shared_jupyter_data = {key: value}
    with open(path, 'wb') as fp: 
        pickle.dump(shared_jupyter_data, fp)

def read_shared_jupyter(key=None, path=JUPYTER_PICKLE_FILE):
    if (os.path.exists(path)):
        with open(path, "rb") as fp:
            shared_jupyter_data = pickle.load(fp)
            if key is not None:
                if key in shared_jupyter_data:
                    return(shared_jupyter_data[key])
                else:
                    print("Not found!")
                    return(None)
            else:
                return(shared_jupyter_data)
    else:
        print("No data")

def pandasToJson(df):
    return(df.to_json(orient="split"))
def jsonToPandas(json):
    return(pd.read_json(json, orient="split"))

pd.set_option('display.float_format', lambda x: "%.3f" % x)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

FIG_WIDTH = 1200
FIG_HEIGHT = 800

PITCH_MIN = 20
PITCH_MAX = 120
VELOCITY_MIN = 0
VELOCITY_MAX = 120

def hheader(x):
    print("#########################################")
    print("### {}".format(x))
    print("#########################################")

# Magenta dependencies:
# https://github.com/magenta/magenta

# Magenta uses pretty_midi to deal with midi files
import pretty_midi

In [2]:
### Set up overall folder for task 1
INPUT_FOLDER = "data/saarland/"
add_input_folder = lambda x: "{}/{}".format(INPUT_FOLDER, x).replace("//", "/")

OUTPUT_FOLDER = "saarland_results"
if not os.path.exists(OUTPUT_FOLDER):
    os.mkdir(OUTPUT_FOLDER)
OUTPUT_FOLDER = "{}/task1_sequential_learning".format(OUTPUT_FOLDER)
if not os.path.exists(OUTPUT_FOLDER):
    os.mkdir(OUTPUT_FOLDER)
add_output_path = lambda x: "{}/{}".format(OUTPUT_FOLDER, x)

print(OUTPUT_FOLDER)

saarland_results/task1_sequential_learning


# Step 1 - Read in the Saarland dataset

In [3]:
RUN_STEP_1 = True

### Set up folder for step 1
STEP1_FOLDER = add_output_path("step1")
if not os.path.exists(STEP1_FOLDER):
    os.mkdir(STEP1_FOLDER)
add_step1_path = lambda x: "{}/{}".format(STEP1_FOLDER, x)

print(STEP1_FOLDER)

saarland_results/task1_sequential_learning/step1


In [4]:
if RUN_STEP_1:
    filenames = [str(path) for path in Path(INPUT_FOLDER).rglob("*.mid")]

    all_raw_dfs = []
    all_agg_dfs = []
    for cf, curr_file in enumerate(filenames):
        curr_file_fmt = curr_file.replace("\\", "/").replace(INPUT_FOLDER, "")
        curr_file_out_seed = curr_file_fmt.replace("/","__").replace(".midi", "")
        print("Processing file {} / {}".format(cf+1, len(filenames)))
        
        curr_midi = pretty_midi.PrettyMIDI(curr_file)
        solo_piano_part = curr_midi.instruments[0]
        df_notes = pd.DataFrame([(n.start, n.end, n.pitch, n.velocity, n.duration) for n in solo_piano_part.notes],
                                columns=['start', 'end', 'pitch', 'velocity', 'duration'])
        [composer, movement, pianist, date] = curr_file.replace("\\", "/").replace(INPUT_FOLDER, "").split("_")
        midi_filename = curr_file_fmt
        df_meta = pd.DataFrame([composer, movement, pianist, pd.to_datetime(date.replace("-SMD.mid", "")), midi_filename]).T
        df_meta.columns=["composer", "movement", "pianist", "date", "fp"]
        df_curr = pd.merge(df_notes, df_meta, how="cross")
        df_curr = df_curr.sort_values(by='start').reset_index(drop=True)
        # df_curr.to_csv(add_step1_path(curr_file_out_seed + "_raw_" + ".csv"))
        all_raw_dfs.append(df_curr)
        
        """ Time bar file + metadata """
        # Do not adjust start
        EVERY_N_SEC = 1
        df_curr_time = df_curr.copy()
        df_curr_time['start_sec'] =  (df_curr_time['start'] / EVERY_N_SEC).apply(np.floor).astype(int) * EVERY_N_SEC
        tmp = df_curr_time.select_dtypes(include=[np.number])
        df_curr_time.loc[:, tmp.columns] = np.round(tmp, decimals=5)
        def str_concat(x): return(','.join([str(s) for s in x]))
        df_curr_time_agg = df_curr_time.groupby(['start_sec'], as_index=False).agg({
            'start': [len, np.min, np.mean, np.median, np.max, str_concat],
            'end': [np.min, np.mean, np.median, np.max, str_concat],
            'pitch': [np.min, np.mean, np.median, np.max, str_concat],
            'velocity': [np.min, np.mean, np.median, np.max, str_concat],
            'duration': [np.min, np.mean, np.median, np.max, str_concat],
            # metadata features - same for all observations
            'composer': [pd.Series.mode],
            'movement': [pd.Series.mode],
            'pianist': [pd.Series.mode],
            'date': [pd.Series.mode],
            'fp': [pd.Series.mode]})
        df_curr_time_agg.columns = ['_'.join([cc for cc in c if len(cc) > 0])
                                    .replace("amin", "min").replace("amax", "max").replace("start_len", "n_notes")
                                    for c in list(df_curr_time_agg.columns)]
        df_curr_time_agg = pd.merge(df_curr_time_agg, # add missing times
                                # Always start from second 1
                                 pd.DataFrame(range(1, df_curr_time_agg.shape[0], 1), columns=["start_sec"]), on="start_sec",
                                how='right').sort_values(by="start_sec")
        # df_curr_time_agg.to_csv(add_step1_path(curr_file_out_seed + "_agg_" + ".csv"))
        all_agg_dfs.append(df_curr_time_agg)

Processing file 1 / 50
Processing file 2 / 50
Processing file 3 / 50
Processing file 4 / 50
Processing file 5 / 50
Processing file 6 / 50
Processing file 7 / 50
Processing file 8 / 50
Processing file 9 / 50
Processing file 10 / 50
Processing file 11 / 50
Processing file 12 / 50
Processing file 13 / 50
Processing file 14 / 50
Processing file 15 / 50
Processing file 16 / 50
Processing file 17 / 50
Processing file 18 / 50
Processing file 19 / 50
Processing file 20 / 50
Processing file 21 / 50
Processing file 22 / 50
Processing file 23 / 50
Processing file 24 / 50
Processing file 25 / 50
Processing file 26 / 50
Processing file 27 / 50
Processing file 28 / 50
Processing file 29 / 50
Processing file 30 / 50
Processing file 31 / 50
Processing file 32 / 50
Processing file 33 / 50
Processing file 34 / 50
Processing file 35 / 50
Processing file 36 / 50
Processing file 37 / 50
Processing file 38 / 50
Processing file 39 / 50
Processing file 40 / 50
Processing file 41 / 50
Processing file 42 / 50
P

In [5]:
### Write out big combined dataframes (<3 min)
fp_raw_combined = add_step1_path("maestro_raw.csv")
fp_agg_combined = add_step1_path("maestro_agg.csv")
if RUN_STEP_1:
    pd.concat([df.assign(piece_num=(dx+1)) for dx, df in enumerate(all_raw_dfs)]).to_csv(fp_raw_combined, index=False)
    pd.concat([df.assign(piece_num=(dx+1)) for dx, df in enumerate(all_agg_dfs)]).to_csv(fp_agg_combined, index=False)

# Step 2 - read in combined data

In [6]:
RUN_STEP_2 = True

### Set up folder for step 2
STEP2_FOLDER = add_output_path("step2")
if not os.path.exists(STEP2_FOLDER):
    os.mkdir(STEP2_FOLDER)
add_step2_path = lambda x: "{}/{}".format(STEP2_FOLDER, x)

print(STEP2_FOLDER)

saarland_results/task1_sequential_learning/step2


In [7]:
### <2 min
df_maestro_raw = pd.read_csv(fp_raw_combined)
df_maestro_agg = pd.read_csv(fp_agg_combined)

print(df_maestro_raw.shape)
display(df_maestro_raw.head())
print(df_maestro_agg.shape)
display(df_maestro_agg.head())
# all pieces start at least at start second 1 or later
# (df_maestro_agg.start_min - df_maestro_agg.start_sec).describe()

(151207, 11)


Unnamed: 0,start,end,pitch,velocity,duration,composer,movement,pianist,date,fp,piece_num
0,2.5396,3.0156,68,35,0.476,Bach,BWV849-01,1,2009-09-16,Bach_BWV849-01_001_20090916-SMD.mid,1
1,2.55,7.3021,49,11,4.7521,Bach,BWV849-01,1,2009-09-16,Bach_BWV849-01_001_20090916-SMD.mid,1
2,2.799,3.3594,66,44,0.5604,Bach,BWV849-01,1,2009-09-16,Bach_BWV849-01_001_20090916-SMD.mid,1
3,3.0521,3.3031,64,48,0.251,Bach,BWV849-01,1,2009-09-16,Bach_BWV849-01_001_20090916-SMD.mid,1
4,3.3177,3.6021,63,48,0.2844,Bach,BWV849-01,1,2009-09-16,Bach_BWV849-01_001_20090916-SMD.mid,1


(15517, 33)


Unnamed: 0,start_sec,n_notes,start_min,start_mean,start_median,start_max,start_str_concat,end_min,end_mean,end_median,end_max,end_str_concat,pitch_min,pitch_mean,pitch_median,pitch_max,pitch_str_concat,velocity_min,velocity_mean,velocity_median,velocity_max,velocity_str_concat,duration_min,duration_mean,duration_median,duration_max,duration_str_concat,composer_mode,movement_mode,pianist_mode,date_mode,fp_mode,piece_num
0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1
1,2,3.0,2.5396,2.6295,2.55,2.799,"2.53958,2.55,2.79896",3.0156,4.559,3.3594,7.3021,"3.01562,7.30208,3.35938",49.0,61.0,66.0,68.0,684966.0,11.0,30.0,35.0,44.0,351144.0,0.476,1.9295,0.5604,4.7521,"0.47604,4.75208,0.56042",Bach,BWV849-01,1.0,2009-09-16,Bach_BWV849-01_001_20090916-SMD.mid,1
2,3,4.0,3.0521,3.4622,3.4589,3.8792,"3.05208,3.31771,3.6,3.87917",3.3031,3.7557,3.7094,4.301,"3.30312,3.60208,3.81667,4.30104",61.0,63.0,63.5,64.0,64636461.0,48.0,49.0,48.0,52.0,48484852.0,0.2167,0.2935,0.2677,0.4219,"0.25104,0.28437,0.21667,0.42188",Bach,BWV849-01,1.0,2009-09-16,Bach_BWV849-01_001_20090916-SMD.mid,1
3,4,3.0,4.2479,4.2878,4.2823,4.3333,"4.24792,4.28229,4.33333",4.8667,4.9649,4.901,5.1271,"4.90104,5.12708,4.86667",64.0,68.3333,68.0,73.0,646873.0,52.0,55.0,54.0,59.0,525459.0,0.5333,0.6771,0.6531,0.8448,"0.65313,0.84479,0.53333",Bach,BWV849-01,1.0,2009-09-16,Bach_BWV849-01_001_20090916-SMD.mid,1
4,5,3.0,5.2792,5.4872,5.5792,5.6031,"5.27917,5.57917,5.60312",5.3802,5.7632,5.9437,5.9656,"5.38021,5.94375,5.96562",66.0,68.6667,69.0,71.0,716966.0,31.0,40.0,44.0,45.0,454431.0,0.101,0.276,0.3625,0.3646,"0.10104,0.36458,0.3625",Bach,BWV849-01,1.0,2009-09-16,Bach_BWV849-01_001_20090916-SMD.mid,1


Raw is basically a dataframe of all the raw irregular time series (concatenated), while agg is basically a processed version that is by every 1 second, 2 second, etc. (still aligned with raw file timestamps, NOT realigned).

# Step 3 - modeling

In [48]:
### Set the time series to predict etc.
ts = df_maestro_agg['pitch_mean'].ravel()
N = ts.size

### Can always do multivariate, other time series, etc. later.
print(ts.shape)
ts[:10]

(15517,)


array([        nan, 61.        , 63.        , 68.33333333, 68.66666667,
       63.7       , 52.        , 60.        , 60.4       , 71.75      ])

### Model 1: Kalman filter + CUSUM for structural breaks

Source:
- Puhm, Martin, et al. "A Near Real-Time Method for Forest Change Detection Based on a Structural Time Series Model and the Kalman Filter." Remote Sensing 12.19 (2020): 3135.

In [146]:
""" Model setup """

import numpy as np
import bocd
import scipy as sc
from scipy import stats
from scipy.stats import multivariate_normal
from statsmodels.tsa.stattools import acovf
from scipy.ndimage.interpolation import shift

""" Kalman filter parameters """

REG_L = 14
LxL = (REG_L, REG_L)
Lx1 = (REG_L, 1)
Lx1_T = (1, REG_L)

# unobserved
zt = np.zeros(Lx1)
At = np.eye(REG_L)
wt = np.zeros(Lx1)
Rt = np.zeros(LxL)

# observed
xt = np.zeros((1,1))
Ct = np.zeros((1,REG_L)) # will hold the ARIMA lags
vt = np.zeros((1,1))
Qt = np.zeros((1,1))

### Store states and predictions
Zthat = [] # will be NxL
ZthatSize = (N, REG_L)
Xthat = [] # will be Nx1
XthatSize = (N, 1)
df_pred_vs_actual = []

### initialize until REG_L
zt = np.ones(Lx1) * (1/REG_L)
for j in range(REG_L):
    Zhat[j,:] = zt.reshape(-1,)
Rt = np.diag(np.random.gamma(1,0.1,size=REG_L)) # covar. mat is symmetric
Qt = np.random.gamma(3, 4, size=(1,1))
Pt = Rt

""" Train sequentially """
timerange = range(REG_L, N-1) # data starts at 0 so start analysis at L for L lags
timerange = range(REG_L, REG_L + 2000)
for i in timerange:
    if (i % 10000 == 0): print("Training observation {} / {} ...".format(i+1, n-1))
    # will predict x ~ z basically
    xt = ts[i].reshape(xt.shape)
    Ct = np.array(ts[(i-REG_L):i]).reshape(Ct.shape)
    
    # For unobserved (covariates), impute mean if lags are NaN
    # If not possible then put NA and continue
    if np.isnan([Ct]).any():
        # impute mean if possible, if not then do NA
        Ct_nonna = Ct[~np.isnan(Ct)]
        if len(Ct_nonna) < 1:
            df_pred_vs_actual.append([i, y, np.nan])
            continue
        else:
            Ct_impute = np.copy(Ct)
            Ct_impute[np.isnan(Ct_impute)] = np.nanmean(Ct_impute)
            Ct = Ct_impute

    # For missing outcome xt, update and skip
    if (np.isnan([xt]).any()):
        df_pred_vs_actual.append([i, xt, np.nan]) # NAN prediction
        continue

    """ Run Kalman filter """
    
    ### Prediction
    zt_hat = At.dot(zt)
    Pt = At.dot(Pt).dot(At.T) + Rt
    xt_hat = Ct.dot(zt_hat)
    
    # compare against ground truth
    residual = xt - xt_hat
    
    ### Update
    Kt = Pt.dot(Ct.T).dot(sc.linalg.inv(Ct.dot(Pt).dot(Ct.T) + Qt))
    zt_hat = zt_hat + Kt.dot(xt - Ct.dot(zt_hat))
    Pt = Pt - Kt.dot(Ct).dot(Pt)
    
    ### Store predictions
    Zthat.append(zt_hat)
    Xthat.append(xt_hat)
    df_pred_vs_actual.append([i, float(xt.item()), float(xt_hat.item())])
    
    """ Change point detection """

""" Plot """
df_plt = pd.DataFrame(df_pred_vs_actual)
df_plt.columns = ['t', 'actual', 'pred']
df_plt['actual'] = df_plt['actual'].astype(float)
df_plt['resid'] = df_plt['actual'] - df_plt['pred']
fig = px.line(df_plt, x="t", y=['actual', 'pred'], template="plotly_dark")
# fig.add_scatter(x=df_plt['t'], y=df_plt['pred'], mode="markers",
#                 marker_size=df_plt['break']*7, # because binary
#                 marker_color=df_plt['break'])
fig

### Model 2: Bayesian linear regression + CUSUM

Directly adjusted.

- http://www.prodsyse.com/Bayes-Adj%20Cusum2.pdf
- https://cran.r-project.org/web/packages/spcadjust/vignettes/CUSUM_LinearRegression.html
- https://github.com/BMClab/BMC/blob/master/functions/detect_cusum.py

In [None]:
""" Model setup """

import numpy as np
from scipy import stats

# adapted from https://maxhalford.github.io/blog/bayesian-linear-regression/
# + computer class code
# n_features = REG_L
# alpha = 0.1 
# beta = 1
# mean = np.zeros(n_features)
# cov_inv = np.identity(n_features) / alpha

class BayesLinReg:
    def __init__(self, n_features, alpha, beta):
        self.n_features = n_features
        self.alpha = alpha
        self.beta = beta
        self.mean = np.zeros(n_features)
        self.cov_inv = np.identity(n_features) / alpha
    def learn(self, x, y):
        # Update the inverse covariance matrix (Bishop eq. 3.51)
        cov_inv = self.cov_inv + self.beta * np.outer(x, x)
        # Update the mean vector (Bishop eq. 3.50)
        cov = np.linalg.inv(cov_inv)
        mean = cov @ (self.cov_inv @ self.mean + self.beta * y * x)
        self.cov_inv = cov_inv
        self.mean = mean
        return self
    def predict(self, x):
        # Obtain the predictive mean (Bishop eq. 3.58)
        y_pred_mean = x @ self.mean
        # Obtain the predictive variance (Bishop eq. 3.59)
        w_cov = np.linalg.inv(self.cov_inv)
        y_pred_var = 1 / self.beta + x @ w_cov @ x.T
        return stats.norm(loc=y_pred_mean, scale=y_pred_var ** .5)
    @property
    def weights_dist(self):
        cov = np.linalg.inv(self.cov_inv)
        return stats.multivariate_normal(mean=self.mean, cov=cov)

df_X = ts
x_obs = np.c_[df_X]

# Parameters here
REG_L = 15
blr = BayesLinReg(n_features=REG_L, alpha=0.1, beta=1)
df_pred_vs_actual = []
residuals = np.zeros(x_obs.shape)

### CUSUM parameters
threshold = 15 # increase for more strict
drift = 0
gp, gn = np.zeros(x_obs.size), np.zeros(x_obs.size)
ta, tai, taf = np.array([[], [], []], dtype=int)
tap, tan = 0, 0
amp = np.array([])

""" Train sequentially """
timerange = range(REG_L, x_obs.size-1) # data starts at 0 so start analysis at L for L lags
for i in timerange:
    if (i % 10000 == 0): print("Training observation {} / {} ...".format(i+1, n-1))
#     y = np.array(x_obs[i])
    y = x_obs[i].tolist()[0]
    x = np.array(x_obs[(i-REG_L):i]).reshape(-1,)
    
    # impute mean if lags are NaN, if not possible then put NA and continue
    if np.isnan([x]).any():
        # impute mean if possible, if not then do NA
        x_nonna = x[~np.isnan(x)]
        if len(x_nonna) < 1:
            df_pred_vs_actual.append([i, y, np.nan])
            continue
        else:
            x_impute = np.copy(x)
            x_impute[np.isnan(x_impute)] = np.nanmean(x_impute)
            x = x_impute
    
    # skip training if output is NA
    if (np.isnan([y]).any()):
        continue

    if (i <= 100):
        # just train
        blr.learn(x, y)
        continue

    yhat = blr.predict(x).mean()
    if (np.isnan(yhat).any()):
        break
    df_pred_vs_actual.append([i, y, yhat])
    blr.learn(x, y)
    residuals[i] = y - yhat
    
    """ CUSUM """

    # CUSUM on the (online) residuals
    # (only run if already have 1 or more residuals calculated)
    r = residuals
    if (r[i-1] == 0):
        continue
    
    # Auto-estimate drift to be (1/2)*expected change
    # aka mean of the last REG_LAG residuals.
    drift = np.mean(residuals[(i-REG_L):i]) / 2
    
    # Auto-estimate threshold to be 
    
    s = np.array(r[i]) - np.array(r[i-1])
    gp[i] = gp[i-1] + s - drift  # cumulative sum for + change
    gn[i] = gn[i-1] - s - drift  # cumulative sum for - change
    if gp[i] < 0:
        gp[i], tap = 0, i
    if gn[i] < 0:
        gn[i], tan = 0, i
    if gp[i] > threshold or gn[i] > threshold:  # change detected!
        ta = np.append(ta, i)    # alarm index
        tai = np.append(tai, tap if gp[i] > threshold else tan)  # start
        gp[i], gn[i] = 0, 0      # reset alarm
    
    if (i > 400):
        break

""" Plot """
df_plt = pd.DataFrame(df_pred_vs_actual)
df_plt.columns = ['t', 'actual', 'pred']
df_plt['resid'] = df_plt['actual'] - df_plt['pred']
df_plt = pd.merge(df_plt, pd.DataFrame({"t": ta, "break": 1}), on="t", how="left").fillna(0)
fig = px.line(df_plt, x="t", y=['actual', 'pred'], template="plotly_dark")
fig.add_scatter(x=df_plt['t'], y=df_plt['pred'], mode="markers",
                marker_size=df_plt['break']*7, # because binary
                marker_color=df_plt['break'])
display(fig)

# # residuals plot
# fig2 = px.line(df_plt, x='t', y=['resid'], template='plotly_dark')
# fig2.add_scatter(x=df_plt['t'], y=df_plt['resid'], mode="markers", marker_color=df_plt['break'])
# display(fig2)

### Model 3: Bayesian linear regression + online changepoint detection

https://arxiv.org/pdf/0710.3742.pdf

In [None]:
""" Model setup """

import numpy as np
import bocd
from scipy import stats

# adapted from https://maxhalford.github.io/blog/bayesian-linear-regression/
# + computer class code
# n_features = REG_L
# alpha = 0.1 
# beta = 1
# mean = np.zeros(n_features)
# cov_inv = np.identity(n_features) / alpha

class BayesLinReg:
    def __init__(self, n_features, alpha, beta):
        self.n_features = n_features
        self.alpha = alpha
        self.beta = beta
        self.mean = np.zeros(n_features)
        self.cov_inv = np.identity(n_features) / alpha
    def learn(self, x, y):
        # Update the inverse covariance matrix (Bishop eq. 3.51)
        cov_inv = self.cov_inv + self.beta * np.outer(x, x)
        # Update the mean vector (Bishop eq. 3.50)
        cov = np.linalg.inv(cov_inv)
        mean = cov @ (self.cov_inv @ self.mean + self.beta * y * x)
        self.cov_inv = cov_inv
        self.mean = mean
        return self
    def predict(self, x):
        # Obtain the predictive mean (Bishop eq. 3.58)
        y_pred_mean = x @ self.mean
        # Obtain the predictive variance (Bishop eq. 3.59)
        w_cov = np.linalg.inv(self.cov_inv)
        y_pred_var = 1 / self.beta + x @ w_cov @ x.T
        return stats.norm(loc=y_pred_mean, scale=y_pred_var ** .5)
    @property
    def weights_dist(self):
        cov = np.linalg.inv(self.cov_inv)
        return stats.multivariate_normal(mean=self.mean, cov=cov)

df_X = ts
x_obs = np.c_[df_X]

# Parameters here
df_X = ts
x_obs = np.c_[df_X]

REG_L = 15
blr = BayesLinReg(n_features=REG_L, alpha=0.1, beta=1)
df_pred_vs_actual = []
residuals = np.zeros(x_obs.shape)

### BOCD 
bc = bocd.BayesianOnlineChangePointDetection(bocd.ConstantHazard(300), bocd.StudentT(mu=0, kappa=1, alpha=1, beta=1))
rt_mle = np.empty(x_obs.shape)

""" Train sequentially """
timerange = range(REG_L, x_obs.size-1) # data starts at 0 so start analysis at L for L lags
timerange = range(REG_L, REG_L + 400)
for i in timerange:
    if (i % 10000 == 0): print("Training observation {} / {} ...".format(i+1, n-1))
    y = x_obs[i].tolist()[0]
    x = np.array(x_obs[(i-REG_L):i]).reshape(-1,)
    
    # impute mean if lags are NaN, if not possible then put NA and continue
    if np.isnan([x]).any():
        # impute mean if possible, if not then do NA
        x_nonna = x[~np.isnan(x)]
        if len(x_nonna) < 1:
            df_pred_vs_actual.append([i, y, np.nan])
            continue
        else:
            x_impute = np.copy(x)
            x_impute[np.isnan(x_impute)] = np.nanmean(x_impute)
            x = x_impute
    
    # use last non-NA value if outcome is NA
    if (np.isnan([y]).any()):
        # use past value that is not NA
        for j in range(i-1, 0, -1):
            if not (np.isnan(x_obs[j]).any()):
                y = x_obs[j]
                break

    if (i <= 100):
        # just train
        blr.learn(x, y)
        continue

    yhat = blr.predict(x).mean()
    if (np.isnan(yhat).any()):
        break
    df_pred_vs_actual.append([i, y, yhat])
    blr.learn(x, y)
    residuals[i] = y - yhat
    
    """ Change point detection """
    
    r = residuals
    
    bc.update(r[i])
    rt_mle[i] = bc.rt

""" Plot """
df_plt = pd.DataFrame(df_pred_vs_actual)
df_plt.columns = ['t', 'actual', 'pred']
df_plt['actual'] = df_plt['actual'].astype(float)
df_plt['resid'] = df_plt['actual'] - df_plt['pred']
# add change points
index_changes = np.where(np.diff(rt_mle.flatten())<0)[0]
df_plt = pd.merge(df_plt, pd.DataFrame({"t": index_changes, "break": 1}), on="t", how="left").fillna(0)

fig = px.line(df_plt, x="t", y=['actual', 'pred'], template="plotly_dark")
fig.add_scatter(x=df_plt['t'], y=df_plt['pred'], mode="markers",
                marker_size=df_plt['break']*7, # because binary
                marker_color=df_plt['break'])

# Presentation: stream and visualize music data with structural breaks

Steam and visualize music (+audio) in realtime, and tag with detected structural breaks (w/information - e.g. this was a structural break in pitch, velocity etc.) from the above models.

Note, the tagging itself will be on the 1-second level because we are training on the aggregated data with that granularity.

Report table with both (1) predictive MSE for the prediction model, and (2) accuracy of structural breaks, compared to my listening (if time).

### (my notes)

TODO list:
- fix Kalman filter
- better adapt the Bayesian linear regression (don't copy and paste)
- train many more models (packages, don't implement - could be Python or rpy2)
- work on music classification

In [None]:
df_maestro_agg.head()