In [100]:
import os, re
import pandas as pd
pd.set_option("display.max_rows", 500)

# Name of this analysis run; it is the first component of ANALYSIS_KEY below.
ANALYSIS_NAME = "core1"

""" 
YOUR PARAMETERS - CHANGE THESE!

STEP1:
Put your MIDI files into the folder final_datasets/

STEP2:
Make sure your MIDI files have standardized naming, specifically, have
a common substring in the filename <MUSIC_FILE_REGEX>.
*If you include structural break tagging, add your tagging MIDI file
as <original_filename>_TAGGED_<time_series_name>.mid.
Example for MUSIC_FILE_REGEX="kapustin_variations":
    - kapustin_variations.mid:
        original MIDI file
    - kapustin_variations.wav:
        WAV file to accompany original MIDI
    - kapustin_variations_TAGGED_pitch_mean.mid:
        structural break tagging information for kapustin_variations.mid
    - kapustin_variations_TAGGED_pitch_mean.wav:
        WAV file to accompany tagging information
        (= original + tagging audio)

STEP3:
Change parameters here. Again, note that the string <MUSIC_FILE_REGEX>
will be used to filter for all MIDI files to analyze.
- corpus: name for corpus (only for output naming)
- music file regex: for searching pieces in final_datasets
- time series variable: time series to analyze (see available below)
"""

CORPUS = "saarland"
# IMPORTANT: should match at most 1 MIDI file in final_datasets/
MUSIC_FILE_REGEX = "Bach_BWV849-01"
TIME_SERIES_VARIABLE = "pitch_mean"

### other metadata
PIECE_NAME = MUSIC_FILE_REGEX
PIANIST = "Saarland"
DATE = "May 1, 2021"

### End of user parameters; everything below is derived from them.

# Unique identifier of this run: the four settings joined by a double underscore.
ANALYSIS_KEY = '__'.join([ANALYSIS_NAME, CORPUS, MUSIC_FILE_REGEX, TIME_SERIES_VARIABLE])
# Human-readable form of the variable name, e.g. "pitch_mean" -> "Pitch Mean".
TIME_SERIES_VARIABLE_FMT = TIME_SERIES_VARIABLE.replace("_", " ").title()

### All structural break tagging MIDI files will have this in the filename.
### For instance, "kapustin_variations_TAGGED_pitch_mean.mid" will be a structural
### break tagging file for the core music file "kapustin_variations.mid".
STRUCTURAL_BREAK_REGEX = "TAGGED"

### Use only the first filename that shows up. This notebook is only
### designed to find and run on 1 piece.
USE_ONLY_FIRST_MIDI_STREAM = True

### Validate time series variable:
### Should be one of the following
time_series_variables = ['n_notes',
 'pitch_min', 'pitch_mean', 'pitch_median', 'pitch_max',
 'velocity_min', 'velocity_mean', 'velocity_median', 'velocity_max',
 'duration_min', 'duration_mean', 'duration_median', 'duration_max']
if TIME_SERIES_VARIABLE not in time_series_variables:
    # ValueError is still an Exception, and the message now names the
    # offending value and the accepted ones so the fix is obvious.
    raise ValueError(
        "Time series variable not found: {!r}. Expected one of: {}".format(
            TIME_SERIES_VARIABLE, ", ".join(time_series_variables)))

### Set up overall folder for task 1
### Analysis will use and concatenate all the MIDI tracks of interest in this folder,
### where tracks of interest have MIDI_STREAM_REGEX in filename.
INPUT_FOLDER = "final_datasets"
MIDI_STREAM_REGEX = MUSIC_FILE_REGEX


def add_input_folder(x):
    """Return the "/"-separated path of ``x`` inside INPUT_FOLDER.

    Every "//" in the resulting string is collapsed to a single "/", so a
    leading slash on ``x`` does not produce a doubled separator.
    """
    return "{}/{}".format(INPUT_FOLDER, x).replace("//", "/")


print(INPUT_FOLDER)

# ### Output will be stored in folder (and subfolders of):
# ### results/<ANALYSIS_NAME>/<CORPUS>/<MUSIC_FILE_REGEX>/<TIME_SERIES_VARIABLE>
# OUTPUT_FOLDER = "results"
# if not os.path.exists(OUTPUT_FOLDER):
#     os.mkdir(OUTPUT_FOLDER)
# for v in [ANALYSIS_NAME, CORPUS, MUSIC_FILE_REGEX, TIME_SERIES_VARIABLE]:
#     OUTPUT_FOLDER = "{}/{}".format(OUTPUT_FOLDER, v)
#     if not os.path.exists(OUTPUT_FOLDER):
#         os.mkdir(OUTPUT_FOLDER)
# add_output_path = lambda x: "{}/{}".format(OUTPUT_FOLDER, x)
# print(OUTPUT_FOLDER)

final_datasets


### Read in all summary and error files

In [125]:
""" Summary of error """
import fnmatch
import os
import numpy as np

# Collect the path of every summary_error.csv found anywhere below results/.
matches = []
for root, dirnames, filenames in os.walk('results'):
    matches.extend(os.path.join(root, hit)
                   for hit in fnmatch.filter(filenames, 'summary_error.csv'))
# One discovered path per line.
print("\n".join(matches))

# Names given to the columns of each summary_error.csv, in file order.
summary_error_columns = ["model_no", "reg_model", "change_model", "n",
                         "MSE", "RMSE", "MAE", "MAPE", "analysis_key"]

if not matches:
    # pd.concat([]) would fail further down with the opaque message
    # "No objects to concatenate"; fail here with an actionable one instead.
    raise ValueError("No summary_error.csv files were found; nothing to summarize.")

all_summary_error = []
for match in matches:
    df_curr = pd.read_csv(match, header=[0,1], skiprows=1)
    df_curr.columns = summary_error_columns
    # Split analysis_key on "__" into its parts (named key, corpus, work and
    # series below) -- presumably it was built like ANALYSIS_KEY; confirm
    # against the code that writes summary_error.csv.
    df_info = df_curr.analysis_key.str.split("__", expand=True)
    # The first two "_"-separated tokens of the work name become composer and
    # piece; any further tokens are dropped.
    df_info2 = df_info[2].str.split("_", expand=True).loc[:,[0,1]]
    df_curr = pd.concat([df_curr, df_info, df_info2], axis=1)
    df_curr.columns = (summary_error_columns +
        ['key', 'corpus', 'work', 'series', 'composer', 'piece'])
    df_curr = df_curr[["model_no", "reg_model", "change_model", "MSE", "RMSE", "MAE", "MAPE",
                      "composer", "piece", "series"]]
    all_summary_error.append(df_curr)

# ignore_index=True gives the combined table one unique 0..N-1 row index
# instead of repeating each file's own 0..k labels.
df_all_summary_error = pd.concat(all_summary_error, ignore_index=True).round(3)
df_all_summary_error = df_all_summary_error[['composer', 'piece', 'reg_model', 'change_model',
                            'MSE', 'RMSE', 'MAE', 'MAPE']]
# agg
def percentile(n):
    """Build an aggregation function that returns the n-th percentile.

    The returned callable takes an array-like ``x`` and evaluates
    ``np.percentile(x, n)``. Its ``__name__`` is set to ``percentile_<n>``
    so that each percentile gets its own distinguishable function name.
    """
    def _nth_percentile(x):
        return np.percentile(x, n)

    label = 'percentile_%s' % n
    _nth_percentile.__name__ = label
    return _nth_percentile
# Mean and median of every error metric per (reg_model, change_model) pair.
# The aggregations are given by name ("mean", "median"): passing np.mean /
# np.median to groupby().agg is deprecated in recent pandas, while the string
# names produce the same values and the same "mean"/"median" labels.
error_metrics = ["MSE", "RMSE", "MAE", "MAPE"]
df_all_summary_error_agg = (
    df_all_summary_error
    .groupby(['reg_model', 'change_model'])
    .agg({metric: ["mean", "median"] for metric in error_metrics})
    .T              # one row per (metric, aggregation), one column per model pair
    .reset_index()  # the two unnamed row levels become level_0 / level_1
    .round(3)
)
# Flatten the two-level column labels into single strings. The reset-index
# columns have an empty second level, so strip() removes the trailing space
# that the join would otherwise leave ("metric " -> "metric").
df_all_summary_error_agg.columns = [
    " ".join(c).replace("level_0", "metric").replace("level_1", "agg").strip()
    for c in df_all_summary_error_agg.columns
]
display(df_all_summary_error_agg)
print(df_all_summary_error_agg.shape)
print(df_all_summary_error_agg.to_latex(index=False))

# Full per-piece table, shown and exported the same way.
display(df_all_summary_error)
print(df_all_summary_error.shape)
print(df_all_summary_error.to_latex(index=False))

results\core1\chow\kapustin_variations\pitch_mean\step3\summary_error.csv
results\core1\saarland\Bach_BWV849-01\pitch_mean\step3\summary_error.csv
results\core1\saarland\Beethoven_Op027No1-01\pitch_mean\step3\summary_error.csv
results\core1\saarland\Chopin_Op010-03\pitch_mean\step3\summary_error.csv
results\core1\saarland\Haydn_HobXVINo52-01\pitch_mean\step3\summary_error.csv
results\core1\saarland\Liszt_AnnesDePelerinage-LectureDante\pitch_mean\step3\summary_error.csv
results\core1\saarland\Mozart_KV265_006\pitch_mean\step3\summary_error.csv
results\core1\saarland\Rachmaninoff_Op036-01\pitch_mean\step3\summary_error.csv
results\core1\saarland\Ravel_JeuxDEau_008\pitch_mean\step3\summary_error.csv


Unnamed: 0,metric,agg,Bayesian LR BOCD,Bayesian LR CUSUM,Gaussian process regression BOCD,Gaussian process regression CUSUM,Kalman Filter BOCD,Kalman Filter CUSUM,Recursive ridge regression BOCD,Recursive ridge regression CUSUM
0,MSE,mean,56.831,56.831,128.631,128.631,66.967,66.967,1063.72,1063.72
1,MSE,median,55.674,55.674,83.495,83.495,64.798,64.798,289.335,289.335
2,RMSE,mean,7.454,7.454,10.509,10.509,8.057,8.057,27.859,27.859
3,RMSE,median,7.461,7.461,9.138,9.138,8.05,8.05,17.01,17.01
4,MAE,mean,5.557,5.557,8.202,8.202,6.304,6.304,20.513,20.513
5,MAE,median,5.667,5.667,7.341,7.341,6.338,6.338,12.939,12.939
6,MAPE,mean,9.022,9.022,13.148,13.148,10.313,10.313,32.139,32.139
7,MAPE,median,9.226,9.226,11.842,11.842,10.586,10.586,18.521,18.521


(8, 10)
\begin{tabular}{llrrrrrrrr}
\toprule
metric  &   agg  &  Bayesian LR BOCD &  Bayesian LR CUSUM &  Gaussian process regression BOCD &  Gaussian process regression CUSUM &  Kalman Filter BOCD &  Kalman Filter CUSUM &  Recursive ridge regression BOCD &  Recursive ridge regression CUSUM \\
\midrule
    MSE &   mean &            56.831 &             56.831 &                           128.631 &                            128.631 &              66.967 &               66.967 &                         1063.720 &                          1063.720 \\
    MSE & median &            55.674 &             55.674 &                            83.495 &                             83.495 &              64.798 &               64.798 &                          289.335 &                           289.335 \\
   RMSE &   mean &             7.454 &              7.454 &                            10.509 &                             10.509 &               8.057 &                8.057 &                   

Unnamed: 0,composer,piece,reg_model,change_model,MSE,RMSE,MAE,MAPE
0,kapustin,variations,Kalman Filter,CUSUM,70.721,8.41,6.515,10.586
1,kapustin,variations,Kalman Filter,BOCD,70.721,8.41,6.515,10.586
2,kapustin,variations,Bayesian LR,CUSUM,89.785,9.475,7.037,11.353
3,kapustin,variations,Bayesian LR,BOCD,89.785,9.475,7.037,11.353
4,kapustin,variations,Recursive ridge regression,CUSUM,241.159,15.529,11.375,17.973
5,kapustin,variations,Recursive ridge regression,BOCD,241.159,15.529,11.375,17.973
6,kapustin,variations,Gaussian process regression,CUSUM,77.017,8.776,7.097,11.795
7,kapustin,variations,Gaussian process regression,BOCD,77.017,8.776,7.097,11.795
0,Bach,BWV849-01,Kalman Filter,CUSUM,40.75,6.384,4.955,8.078
1,Bach,BWV849-01,Kalman Filter,BOCD,40.75,6.384,4.955,8.078


(72, 8)
\begin{tabular}{llllrrrr}
\toprule
    composer &                          piece &                   reg\_model & change\_model &      MSE &   RMSE &    MAE &   MAPE \\
\midrule
    kapustin &                     variations &               Kalman Filter &        CUSUM &   70.721 &  8.410 &  6.515 & 10.586 \\
    kapustin &                     variations &               Kalman Filter &         BOCD &   70.721 &  8.410 &  6.515 & 10.586 \\
    kapustin &                     variations &                 Bayesian LR &        CUSUM &   89.785 &  9.475 &  7.037 & 11.353 \\
    kapustin &                     variations &                 Bayesian LR &         BOCD &   89.785 &  9.475 &  7.037 & 11.353 \\
    kapustin &                     variations &  Recursive ridge regression &        CUSUM &  241.159 & 15.529 & 11.375 & 17.973 \\
    kapustin &                     variations &  Recursive ridge regression &         BOCD &  241.159 & 15.529 & 11.375 & 17.973 \\
    kapustin &        

In [62]:
# """ Summary of error (cumulative) - use error paths"""
# import fnmatch
# import os

# matches = []
# for root, dirnames, filenames in os.walk('results'):
#     for filename in fnmatch.filter(filenames, 'summary_error_cumulative.csv'):
#         matches.append(os.path.join(root, filename))
# print("\n".join(matches))

# all_summary_error = []
# for match in matches:
#     raise Exception()
#     df_curr = pd.read_csv(match, header=[0,1], skiprows=1)
#     df_curr.columns = ["model_no", "reg_model", "change_model", "n", "MSE", "RMSE", "MAE", "MAPE", "analysis_key"]
#     all_summary_error.append(df_curr)

# df_all_summary_error = pd.concat(all_summary_error)
# display(df_all_summary_error.head(25))
# print(df_all_summary_error.shape)