# Task 1: predicting structural breaks in sequential Google Maestro data

Uses sequential data + regression. 

Potentially useful articles:
- Zhao, Kaiguang, et al. "Detecting change-point, trend, and seasonality in satellite time series data to track abrupt changes and nonlinear dynamics: A Bayesian ensemble algorithm." Remote Sensing of Environment 232 (2019): 111181.
- Jiang, Yu, Zhe Song, and Andrew Kusiak. "Very short-term wind speed forecasting with Bayesian structural break model." Renewable Energy 50 (2013): 637-647.
- Pesaran, M. Hashem, Davide Pettenuzzo, and Allan Timmermann. "Forecasting time series subject to multiple structural breaks." The Review of Economic Studies 73.4 (2006): 1057-1084.
- De Brouwer, Edward, et al. "GRU-ODE-Bayes: Continuous modeling of sporadically-observed time series." arXiv preprint arXiv:1905.12374 (2019).

In [1]:
from __future__ import division
from more_itertools import peekable
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as scp
import pickle
import magenta
import os, time, re, json, glob
%matplotlib inline
from IPython.core.display import display, HTML
### change width of notebook display
# display(HTML("<style>.container { width:70% !important; }</style>"))
from pathlib import Path
    
import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

# for exposing API
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
from flask import Flask
from flask_restful import Resource, Api

JUPYTER_PICKLE_FILE = "config/shared_jupyter_data.pkl"
def write_shared_jupyter(key, value, path=JUPYTER_PICKLE_FILE, overwrite=False):
    """Store ``value`` under ``key`` in a pickled dictionary on disk.

    Parameters
    ----------
    key : hashable
        Dictionary key to write.
    value : object
        Any picklable object.
    path : str
        Location of the pickle file (defaults to JUPYTER_PICKLE_FILE).
    overwrite : bool
        If True, everything previously stored is discarded and the file
        ends up containing only ``{key: value}``. If False, the existing
        dictionary (if any) is loaded and updated.
    """
    if overwrite or not os.path.exists(path):
        # Nothing to merge with: do not read the old file at all, so a
        # corrupt or unreadable pickle cannot block an explicit overwrite.
        shared_jupyter_data = {key: value}
    else:
        # NOTE: pickle.load can execute arbitrary code; only point `path`
        # at files written by this helper.
        with open(path, "rb") as fp:
            shared_jupyter_data = pickle.load(fp)
        shared_jupyter_data[key] = value

    # Create the parent folder (e.g. "config/") on first use instead of
    # failing with FileNotFoundError.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "wb") as fp:
        pickle.dump(shared_jupyter_data, fp)

def read_shared_jupyter(key=None, path=JUPYTER_PICKLE_FILE):
    """Read back data stored in the shared pickle file.

    Parameters
    ----------
    key : hashable or None
        Entry to look up. When None, the whole stored dictionary is returned.
    path : str
        Location of the pickle file (defaults to JUPYTER_PICKLE_FILE).

    Returns
    -------
    The stored value for ``key``, the full dictionary when ``key`` is None,
    or None (after printing a short message) when the file or the key is
    missing.
    """
    if not os.path.exists(path):
        print("No data")
        return None

    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(path, "rb") as fp:
        stored = pickle.load(fp)

    if key is None:
        return stored
    if key not in stored:
        print("Not found!")
        return None
    return stored[key]

def pandasToJson(df):
    """Serialize ``df`` to a JSON string using pandas' "split" orientation."""
    serialized = df.to_json(orient="split")
    return serialized
def jsonToPandas(json):
    """Rebuild a DataFrame from JSON in pandas' "split" orientation.

    Parameters
    ----------
    json : str, path or file-like
        JSON text such as the output of ``df.to_json(orient="split")``.
        Anything that is not JSON text (a path, URL or open file object) is
        handed to ``pd.read_json`` unchanged.
        Note that this parameter name shadows the ``json`` module inside
        the function; it is kept for backward compatibility with keyword
        callers.

    Returns
    -------
    pandas.DataFrame
    """
    from io import StringIO  # local import: only needed by this helper

    source = json
    # Newer pandas versions deprecate passing literal JSON text to read_json
    # and ask for a StringIO wrapper instead. Only wrap strings that look
    # like JSON so that path/URL strings keep working as before.
    if isinstance(json, str) and json.lstrip().startswith(("{", "[")):
        source = StringIO(json)
    return pd.read_json(source, orient="split")

# Raise pandas' display limits so wide/long frames are printed in full.
for _option, _limit in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(_option, _limit)
del _option, _limit  # keep the loop variables out of the notebook namespace

# Figure size; presumably pixels for the plotly figures -- usage is not
# visible in this part of the notebook.
FIG_WIDTH, FIG_HEIGHT = 1200, 800

# Bounds for MIDI pitch and velocity; presumably axis/plot limits -- confirm
# against the cells that use them.
PITCH_MIN, PITCH_MAX = 20, 120
VELOCITY_MIN, VELOCITY_MAX = 0, 120

def hheader(x):
    """Print ``x`` as a three-line banner framed by rows of '#' characters."""
    rule = "#########################################"
    for line in (rule, "### {}".format(x), rule):
        print(line)

# Magenta dependencies:
# https://github.com/magenta/magenta

# Magenta uses pretty_midi to deal with midi files
import pretty_midi

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit


In [2]:
### Set up overall folder for task 1
INPUT_FOLDER = "data/maestro/maestro-v3.0.0/"


def add_input_folder(x):
    """Return ``x`` prefixed with INPUT_FOLDER.

    Every "//" in the result is collapsed to "/", which absorbs the trailing
    slash already present in INPUT_FOLDER.
    """
    return "{}/{}".format(INPUT_FOLDER, x).replace("//", "/")


OUTPUT_FOLDER = "task1_sequential_learning"
# exist_ok=True makes re-running the cell safe and avoids the
# check-then-create gap of os.path.exists() followed by os.mkdir().
os.makedirs(OUTPUT_FOLDER, exist_ok=True)


def add_output_path(x):
    """Return ``x`` placed under OUTPUT_FOLDER."""
    return "{}/{}".format(OUTPUT_FOLDER, x)


print(OUTPUT_FOLDER)

task1_sequential_learning


# Step 1 - Read in the Google Maestro data (MIDI)

In [7]:
# Guard flag for the Step 1 cells below: their bodies only run when this is True.
RUN_STEP_1 = False

In [3]:
if RUN_STEP_1:
    ### Set up folder for step 1
    STEP1_FOLDER = add_output_path("step1")
    # exist_ok=True makes re-running the cell safe; makedirs also creates the
    # parent output folder if it is missing.
    os.makedirs(STEP1_FOLDER, exist_ok=True)

    def add_step1_path(x):
        """Return ``x`` placed under STEP1_FOLDER."""
        return "{}/{}".format(STEP1_FOLDER, x)

    print(STEP1_FOLDER)

task1_sequential_learning/step1


In [5]:
if RUN_STEP_1:
    ### read in MIDI metadata
    midi_metadata = pd.read_csv(add_input_folder("maestro-v3.0.0.csv"))

    ### read in MIDI files
    # sorted() gives a reproducible processing order (and "file i / N" log);
    # rglob itself does not guarantee any ordering.
    filenames = sorted(str(path) for path in Path('data/').rglob('*.midi'))

    # Width, in seconds, of the time bins used for the aggregated output.
    # Must be a positive integer (it is used as a range() step below).
    EVERY_N_SEC = 1

    # NOTE: the function name ends up in the aggregated column names
    # (e.g. "start_str_concat"), so it must not be renamed.
    def str_concat(x):
        """Join the values of ``x`` into one comma-separated string."""
        return ','.join([str(s) for s in x])

    # Per-bin statistics computed for every note attribute. String names are
    # used instead of np.min/np.mean/... so pandas labels the columns
    # "min"/"max" directly and does not emit deprecation warnings.
    NOTE_STATS = ["min", "mean", "median", "max", str_concat]

    all_raw_dfs = []
    all_agg_dfs = []
    for cf, curr_file in enumerate(filenames):
        # Path relative to INPUT_FOLDER with forward slashes; this is the form
        # compared against the metadata's "midi_filename" column below.
        curr_file_fmt = curr_file.replace("\\", "/").replace(INPUT_FOLDER, "")
        # Flat name used as the prefix of the per-file output CSVs.
        curr_file_out_seed = curr_file_fmt.replace("/", "__").replace(".midi", "")
        print("Processing file {} / {}".format(cf+1, len(filenames)))

        # --- Raw MIDI file + metadata ---
        curr_midi = pretty_midi.PrettyMIDI(curr_file)
        solo_piano_part = curr_midi.instruments[0]
        df_notes = pd.DataFrame(
            [(n.start, n.end, n.pitch, n.velocity, n.duration) for n in solo_piano_part.notes],
            columns=['start', 'end', 'pitch', 'velocity', 'duration'])
        df_metadata = midi_metadata[midi_metadata['midi_filename'] == curr_file_fmt]
        if df_metadata.empty:
            # Fail fast with a readable message; the previous code hit a bare
            # IndexError here via `.values[0]` on an otherwise unused variable.
            raise IndexError("No metadata row found for MIDI file {}".format(curr_file_fmt))
        df_metadata = df_metadata.rename(columns={"duration": "total_duration"})
        df_metadata = df_metadata.drop(columns="audio_filename")  # don't need audio for now
        # Cross join: attach the file's metadata row to every note.
        df_curr = pd.merge(df_notes, df_metadata, how="cross")
        df_curr = df_curr.sort_values(by='start')
        df_curr.to_csv(add_step1_path(curr_file_out_seed + "_raw_" + ".csv"))
        all_raw_dfs.append(df_curr)

        # --- Time bar file + metadata ---
        # Do not adjust start
        df_curr_time = df_curr.copy()
        # Start of the EVERY_N_SEC-wide bin each note begins in.
        df_curr_time['start_sec'] = (df_curr_time['start'] / EVERY_N_SEC).apply(np.floor).astype(int) * EVERY_N_SEC
        # Round every numeric column to 5 decimals.
        tmp = df_curr_time.select_dtypes(include=[np.number])
        df_curr_time.loc[:, tmp.columns] = np.round(tmp, decimals=5)
        df_curr_time_agg = df_curr_time.groupby(['start_sec'], as_index=False).agg({
            'start': [len] + NOTE_STATS,
            'end': NOTE_STATS,
            'pitch': NOTE_STATS,
            'velocity': NOTE_STATS,
            'duration': NOTE_STATS,
            # metadata features - same for all observations
            'canonical_composer': [pd.Series.mode],
            'split': [pd.Series.mode],
            'year': [pd.Series.mode],
            'total_duration': [pd.Series.mode]})
        # Flatten the (column, statistic) MultiIndex into "column_statistic";
        # the note count ("start_len") is exposed as "n_notes".
        df_curr_time_agg.columns = [
            '_'.join([cc for cc in c if len(cc) > 0]).replace("start_len", "n_notes")
            for c in df_curr_time_agg.columns]

        # Add missing times: one row per bin from 0 up to the last bin that
        # contains a note. The previous code built the grid from
        # range(1, number_of_rows), which dropped bin 0 and every bin at or
        # beyond the row count whenever the piece contained silent bins.
        if not df_curr_time_agg.empty:
            last_sec = int(df_curr_time_agg["start_sec"].max())
            all_secs = pd.DataFrame(
                {"start_sec": range(0, last_sec + EVERY_N_SEC, EVERY_N_SEC)})
            df_curr_time_agg = pd.merge(df_curr_time_agg, all_secs, on="start_sec",
                                        how='right').sort_values(by="start_sec")
        df_curr_time_agg.to_csv(add_step1_path(curr_file_out_seed + "_agg_" + ".csv"))
        all_agg_dfs.append(df_curr_time_agg)

Processing file 1 / 1276
Processing file 2 / 1276
Processing file 3 / 1276
Processing file 4 / 1276
Processing file 5 / 1276
Processing file 6 / 1276
Processing file 7 / 1276
Processing file 8 / 1276
Processing file 9 / 1276
Processing file 10 / 1276
Processing file 11 / 1276
Processing file 12 / 1276
Processing file 13 / 1276
Processing file 14 / 1276
Processing file 15 / 1276
Processing file 16 / 1276
Processing file 17 / 1276
Processing file 18 / 1276
Processing file 19 / 1276
Processing file 20 / 1276
Processing file 21 / 1276
Processing file 22 / 1276
Processing file 23 / 1276
Processing file 24 / 1276
Processing file 25 / 1276
Processing file 26 / 1276
Processing file 27 / 1276
Processing file 28 / 1276
Processing file 29 / 1276
Processing file 30 / 1276
Processing file 31 / 1276
Processing file 32 / 1276
Processing file 33 / 1276
Processing file 34 / 1276
Processing file 35 / 1276
Processing file 36 / 1276
Processing file 37 / 1276
Processing file 38 / 1276
Processing file 39 / 

Processing file 309 / 1276
Processing file 310 / 1276
Processing file 311 / 1276
Processing file 312 / 1276
Processing file 313 / 1276
Processing file 314 / 1276
Processing file 315 / 1276
Processing file 316 / 1276
Processing file 317 / 1276
Processing file 318 / 1276
Processing file 319 / 1276
Processing file 320 / 1276
Processing file 321 / 1276
Processing file 322 / 1276
Processing file 323 / 1276
Processing file 324 / 1276
Processing file 325 / 1276
Processing file 326 / 1276
Processing file 327 / 1276
Processing file 328 / 1276
Processing file 329 / 1276
Processing file 330 / 1276
Processing file 331 / 1276
Processing file 332 / 1276
Processing file 333 / 1276
Processing file 334 / 1276
Processing file 335 / 1276
Processing file 336 / 1276
Processing file 337 / 1276
Processing file 338 / 1276
Processing file 339 / 1276
Processing file 340 / 1276
Processing file 341 / 1276
Processing file 342 / 1276
Processing file 343 / 1276
Processing file 344 / 1276
Processing file 345 / 1276
P

Processing file 613 / 1276
Processing file 614 / 1276
Processing file 615 / 1276
Processing file 616 / 1276
Processing file 617 / 1276
Processing file 618 / 1276
Processing file 619 / 1276
Processing file 620 / 1276
Processing file 621 / 1276
Processing file 622 / 1276
Processing file 623 / 1276
Processing file 624 / 1276
Processing file 625 / 1276
Processing file 626 / 1276
Processing file 627 / 1276
Processing file 628 / 1276
Processing file 629 / 1276
Processing file 630 / 1276
Processing file 631 / 1276
Processing file 632 / 1276
Processing file 633 / 1276
Processing file 634 / 1276
Processing file 635 / 1276
Processing file 636 / 1276
Processing file 637 / 1276
Processing file 638 / 1276
Processing file 639 / 1276
Processing file 640 / 1276
Processing file 641 / 1276
Processing file 642 / 1276
Processing file 643 / 1276
Processing file 644 / 1276
Processing file 645 / 1276
Processing file 646 / 1276
Processing file 647 / 1276
Processing file 648 / 1276
Processing file 649 / 1276
P

Processing file 917 / 1276
Processing file 918 / 1276
Processing file 919 / 1276
Processing file 920 / 1276
Processing file 921 / 1276
Processing file 922 / 1276
Processing file 923 / 1276
Processing file 924 / 1276
Processing file 925 / 1276
Processing file 926 / 1276
Processing file 927 / 1276
Processing file 928 / 1276
Processing file 929 / 1276
Processing file 930 / 1276
Processing file 931 / 1276
Processing file 932 / 1276
Processing file 933 / 1276
Processing file 934 / 1276
Processing file 935 / 1276
Processing file 936 / 1276
Processing file 937 / 1276
Processing file 938 / 1276
Processing file 939 / 1276
Processing file 940 / 1276
Processing file 941 / 1276
Processing file 942 / 1276
Processing file 943 / 1276
Processing file 944 / 1276
Processing file 945 / 1276
Processing file 946 / 1276
Processing file 947 / 1276
Processing file 948 / 1276
Processing file 949 / 1276
Processing file 950 / 1276
Processing file 951 / 1276
Processing file 952 / 1276
Processing file 953 / 1276
P

Processing file 1213 / 1276
Processing file 1214 / 1276
Processing file 1215 / 1276
Processing file 1216 / 1276
Processing file 1217 / 1276
Processing file 1218 / 1276
Processing file 1219 / 1276
Processing file 1220 / 1276
Processing file 1221 / 1276
Processing file 1222 / 1276
Processing file 1223 / 1276
Processing file 1224 / 1276
Processing file 1225 / 1276
Processing file 1226 / 1276
Processing file 1227 / 1276
Processing file 1228 / 1276
Processing file 1229 / 1276
Processing file 1230 / 1276
Processing file 1231 / 1276
Processing file 1232 / 1276
Processing file 1233 / 1276
Processing file 1234 / 1276
Processing file 1235 / 1276
Processing file 1236 / 1276
Processing file 1237 / 1276
Processing file 1238 / 1276
Processing file 1239 / 1276
Processing file 1240 / 1276
Processing file 1241 / 1276
Processing file 1242 / 1276
Processing file 1243 / 1276
Processing file 1244 / 1276
Processing file 1245 / 1276
Processing file 1246 / 1276
Processing file 1247 / 1276
Processing file 1248

In [6]:
### join MIDI files with the metadata and write out to file