In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import librosa
from tqdm import tqdm_notebook as tqdm
import json
import pandas as pd
import pickle
import os
import h5py
import collections as cl
from ast import literal_eval
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
import random
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
# Candidate inverse regularisation strengths for the grid search:
# 10**-10, 10**-8, ..., 10**4 (exponents -10 to 4 in steps of 2).
param_grid = {'C': list(map(lambda exponent: 10 ** exponent, range(-10, 6, 2)))}

# Model-selection settings shared by the training cells below.
scoring = 'roc_auc'
cv = 3
random_state = 42

# Seed Python's built-in random module with the same value.
random.seed(random_state)

In [3]:
# Genre mapping read from disk; aligned_genre keeps every entry of the
# mapping except the last one.
with open("genre_map.json", "r") as genre_file:
    genre_map = json.load(genre_file)
aligned_genre = [*genre_map][:-1]

# Instrument mapping read from disk; the training cells iterate over it to
# obtain the instrument classes.
with open("instrument_map.json", "r") as instrument_file:
    instrument_map = json.load(instrument_file)

In [4]:
# Empty results table: one row per (instrument, train set, test set) is added
# by the training cells and the table is written to a csv file at the end.
result_columns = [
    'instrument', 'train_set', 'test_set',
    'precision', 'recall', 'f1-score', 'support',
    'accuracy', 'roc_auc', 'ap', 'embedding',
]
result_all = pd.DataFrame({column: [] for column in result_columns})

# Load embeddings and align classes

In [5]:
# Name of the embedding to evaluate and an optional debiasing tag; both are
# appended to the names of the model and result files written further down.
embedding, debias_method = 'vggish', ''

# Open the embeddings store read-only. The handle stays open because the
# following cell reads the feature and key datasets from it.
embeddings = h5py.File("embeddings.h5", mode="r")

In [6]:
###### IRMAS data ######
# Pull the feature rows and their keys for the selected embedding into memory.
irmas_group = embeddings["irmas"][embedding]
feature = np.array(irmas_group["features"])
keys_ori = np.array(irmas_group["keys"])

# One entry per distinct key (np.unique also sorts them).
key_clip = np.unique(keys_ori)

# Average all feature rows that share a key, giving one vector per key;
# row i of feature_clip corresponds to key_clip[i].
feature_clip = np.array([
    np.mean(feature[keys_ori == key, :], axis=0)
    for key in tqdm(key_clip)
])
print(feature_clip.shape, key_clip.shape)

  0%|          | 0/6705 [00:00<?, ?it/s]

(6705, 128) (6705,)


In [7]:
def load_feature(meta, feature_clip, key_clip):
    """Partition clip-level features and labels into train and test sets.

    Parameters
    ----------
    meta : pandas.DataFrame
        Table with the columns 'split', 'file_name', 'instrument' and 'genre'.
        Rows whose 'split' is 'train' or 'test' are used; other rows are ignored.
    feature_clip : numpy.ndarray
        Feature matrix whose row i belongs to key_clip[i].
    key_clip : sequence
        Keys identifying the rows of feature_clip. If a key occurs more than
        once, the first occurrence is used.

    Returns
    -------
    tuple
        ((X_train, Y_train), (X_test, Y_test), (genre_train, genre_test)) where
        X_* are rows of feature_clip selected through 'file_name', Y_* are the
        'instrument' values and genre_* the 'genre' values of each split.

    Raises
    ------
    ValueError
        If a 'file_name' of a train/test row is not present in key_clip.
    """
    # Build the key -> row lookup once instead of scanning key_clip for every
    # file name. setdefault keeps the first occurrence, like list.index does.
    keys = key_clip.tolist() if isinstance(key_clip, np.ndarray) else list(key_clip)
    row_of_key = {}
    for row, key in enumerate(keys):
        row_of_key.setdefault(key, row)

    def _rows_for(file_names):
        # Integer dtype keeps an empty split usable as an index array.
        try:
            return np.asarray([row_of_key[name] for name in file_names], dtype=int)
        except KeyError as err:
            raise ValueError(f"{err.args[0]!r} is not in key_clip") from None

    # Select each split once and reuse it for keys, labels and genres.
    meta_train = meta.loc[meta['split'] == 'train']
    meta_test = meta.loc[meta['split'] == 'test']

    idx_train = _rows_for(meta_train['file_name'])
    idx_test = _rows_for(meta_test['file_name'])

    # use the split indices to partition the features
    X_train = feature_clip[idx_train, :]
    X_test = feature_clip[idx_test]

    Y_train = meta_train['instrument'].values
    Y_test = meta_test['instrument'].values

    genre_train = meta_train['genre'].values
    genre_test = meta_test['genre'].values

    return (X_train, Y_train), (X_test, Y_test), (genre_train, genre_test)

In [8]:
# Metadata tables of the two subsets.
A_meta = pd.read_csv("A_meta.csv")
B_meta = pd.read_csv("B_meta.csv")

# Train/test features, instrument labels and genres of subset A ...
(X_train_A, Y_train_A), (X_test_A, Y_test_A), (genre_train_A, genre_test_A) = load_feature(
    A_meta, feature_clip, key_clip
)
# ... and of subset B.
(X_train_B, Y_train_B), (X_test_B, Y_test_B), (genre_train_B, genre_test_B) = load_feature(
    B_meta, feature_clip, key_clip
)

# A -> A

In [9]:
# This part of the code follows the baseline model for instrument recognition on the openmic dataset:
# https://github.com/cosmir/openmic-2018/blob/master/examples/modeling-baseline.ipynb
#
# Train one binary classifier per instrument on the train split of A, evaluate
# it on the test split of A, record the metrics in result_all and pickle the
# fitted models.
train_set_name, test_set_name = 'A', 'A'

# Fitted classifiers keyed by instrument; also published under the global name
# 'models_' + train_set_name (here: models_A).
trained_models = dict()
globals()['models_' + train_set_name] = trained_models

# We'll iterate over all instrument classes, and fit a model for each one.
# After training, we'll print a classification report for each instrument.
for instrument in instrument_map:

    # binary targets: True where the label equals the current instrument
    X_train_inst = X_train_A
    Y_train_inst = Y_train_A == instrument
    X_test_inst = X_test_A
    Y_test_inst = Y_test_A == instrument

    # initialize a logistic regression model
    LRmodel = LogisticRegression(random_state=random_state, penalty='l2', solver='liblinear', class_weight='balanced')

    # hyperparameter tuning for the logistic regression model
    clf = GridSearchCV(LRmodel, param_grid=param_grid, cv=cv, scoring=scoring)

    # fit the model
    clf.fit(X_train_inst, Y_train_inst)

    # predict
    Y_pred_inst = clf.predict(X_test_inst)
    # Get prediction scores for the positive class
    Y_pred_scores = clf.predict_proba(X_test_inst)[:, 1]

    # store the classifier in the model dictionary
    trained_models[instrument] = clf

    # print result for each instrument
    print('-' * 52)
    print(instrument)
    print('\tTEST')
    print(classification_report(Y_test_inst, Y_pred_inst))

    model_auc = roc_auc_score(Y_test_inst, Y_pred_scores)
    model_ap = average_precision_score(Y_test_inst, Y_pred_scores)
    print(f'ROC-AUC = {model_auc:.3f}\t\tAP = {model_ap:.3f}')

    # record the result for each instrument; the report is computed once and
    # the positive-class metrics and the accuracy are read from it by key
    report_dict = classification_report(Y_test_inst, Y_pred_inst, output_dict=True)
    report = report_dict['True']
    report_accuracy = report_dict['accuracy']
    # float(): support was previously read from a float column, keep it a float
    result_inst = [instrument, train_set_name, test_set_name, report['precision'], report['recall'],
                   report['f1-score'], float(report['support']), report_accuracy, model_auc, model_ap,
                   embedding + debias_method]
    # DataFrame.append no longer exists in pandas 2.x; concat a one-row frame
    # instead (built from a list of rows so the metrics stay numeric)
    result_all = pd.concat([result_all, pd.DataFrame([result_inst], columns=result_all.columns)],
                           ignore_index=True)

# make sure the output directory exists before pickling the models
os.makedirs('models', exist_ok=True)
with open('models/models_' + train_set_name + '_' + embedding + debias_method + '.pickle', 'wb') as fdesc:
    pickle.dump(trained_models, fdesc)

----------------------------------------------------
cel
	TEST
              precision    recall  f1-score   support

       False       0.99      0.91      0.95       728
        True       0.25      0.70      0.37        30

    accuracy                           0.91       758
   macro avg       0.62      0.81      0.66       758
weighted avg       0.96      0.91      0.93       758

ROC-AUC = 0.923		AP = 0.466
----------------------------------------------------
cla
	TEST
              precision    recall  f1-score   support

       False       0.99      0.86      0.92       723
        True       0.23      0.86      0.37        35

    accuracy                           0.86       758
   macro avg       0.61      0.86      0.65       758
weighted avg       0.96      0.86      0.90       758

ROC-AUC = 0.935		AP = 0.543
----------------------------------------------------
flu
	TEST
              precision    recall  f1-score   support

       False       0.98      0.86      0.92   

# A -> B

In [10]:
# This part of the code follows the baseline model for instrument recognition on the openmic dataset:
# https://github.com/cosmir/openmic-2018/blob/master/examples/modeling-baseline.ipynb
#
# Train one binary classifier per instrument on the train split of A, evaluate
# it on the test split of B and record the metrics in result_all.
train_set_name, test_set_name = 'A', 'B'

# We'll iterate over all instrument classes, and fit a model for each one.
# After training, we'll print a classification report for each instrument.
for instrument in instrument_map:

    # binary targets: True where the label equals the current instrument
    X_train_inst = X_train_A
    Y_train_inst = Y_train_A == instrument
    X_test_inst = X_test_B
    Y_test_inst = Y_test_B == instrument

    # initialize a logistic regression model
    LRmodel = LogisticRegression(random_state=random_state, penalty='l2', solver='liblinear', class_weight='balanced')

    # hyperparameter tuning for the logistic regression model
    clf = GridSearchCV(LRmodel, param_grid=param_grid, cv=cv, scoring=scoring)

    # fit the model
    clf.fit(X_train_inst, Y_train_inst)

    # predict
    Y_pred_inst = clf.predict(X_test_inst)
    # Get prediction scores for the positive class
    Y_pred_scores = clf.predict_proba(X_test_inst)[:, 1]

    # print result for each instrument
    print('-' * 52)
    print(instrument)
    print('\tTEST')
    print(classification_report(Y_test_inst, Y_pred_inst))

    model_auc = roc_auc_score(Y_test_inst, Y_pred_scores)
    model_ap = average_precision_score(Y_test_inst, Y_pred_scores)
    print(f'ROC-AUC = {model_auc:.3f}\t\tAP = {model_ap:.3f}')

    # record the result for each instrument; the report is computed once and
    # the positive-class metrics and the accuracy are read from it by key
    report_dict = classification_report(Y_test_inst, Y_pred_inst, output_dict=True)
    report = report_dict['True']
    report_accuracy = report_dict['accuracy']
    # float(): support was previously read from a float column, keep it a float
    result_inst = [instrument, train_set_name, test_set_name, report['precision'], report['recall'],
                   report['f1-score'], float(report['support']), report_accuracy, model_auc, model_ap,
                   embedding + debias_method]
    # DataFrame.append no longer exists in pandas 2.x; concat a one-row frame
    # instead (built from a list of rows so the metrics stay numeric)
    result_all = pd.concat([result_all, pd.DataFrame([result_inst], columns=result_all.columns)],
                           ignore_index=True)

----------------------------------------------------
cel
	TEST
              precision    recall  f1-score   support

       False       0.98      0.88      0.93       867
        True       0.33      0.78      0.46        68

    accuracy                           0.87       935
   macro avg       0.66      0.83      0.70       935
weighted avg       0.93      0.87      0.89       935

ROC-AUC = 0.910		AP = 0.545
----------------------------------------------------
cla
	TEST
              precision    recall  f1-score   support

       False       0.97      0.62      0.76       843
        True       0.19      0.83      0.31        92

    accuracy                           0.64       935
   macro avg       0.58      0.72      0.53       935
weighted avg       0.89      0.64      0.71       935

ROC-AUC = 0.795		AP = 0.351
----------------------------------------------------
flu
	TEST
              precision    recall  f1-score   support

       False       0.99      0.80      0.88   

# B -> B

In [11]:
# This part of the code follows the baseline model for instrument recognition on the openmic dataset:
# https://github.com/cosmir/openmic-2018/blob/master/examples/modeling-baseline.ipynb
#
# Train one binary classifier per instrument on the train split of B, evaluate
# it on the test split of B, record the metrics in result_all and pickle the
# fitted models.
train_set_name, test_set_name = 'B', 'B'

# Fitted classifiers keyed by instrument; also published under the global name
# 'models_' + train_set_name (here: models_B).
trained_models = dict()
globals()['models_' + train_set_name] = trained_models

# We'll iterate over all instrument classes, and fit a model for each one.
# After training, we'll print a classification report for each instrument.
for instrument in instrument_map:

    # binary targets: True where the label equals the current instrument
    X_train_inst = X_train_B
    Y_train_inst = Y_train_B == instrument
    X_test_inst = X_test_B
    Y_test_inst = Y_test_B == instrument

    # initialize a logistic regression model
    LRmodel = LogisticRegression(random_state=random_state, penalty='l2', solver='liblinear', class_weight='balanced')

    # hyperparameter tuning for the logistic regression model
    clf = GridSearchCV(LRmodel, param_grid=param_grid, cv=cv, scoring=scoring)

    # fit the model
    clf.fit(X_train_inst, Y_train_inst)

    # predict
    Y_pred_inst = clf.predict(X_test_inst)
    # Get prediction scores for the positive class
    Y_pred_scores = clf.predict_proba(X_test_inst)[:, 1]

    # store the classifier in the model dictionary
    trained_models[instrument] = clf

    # print result for each instrument
    print('-' * 52)
    print(instrument)
    print('\tTEST')
    print(classification_report(Y_test_inst, Y_pred_inst))

    model_auc = roc_auc_score(Y_test_inst, Y_pred_scores)
    model_ap = average_precision_score(Y_test_inst, Y_pred_scores)
    print(f'ROC-AUC = {model_auc:.3f}\t\tAP = {model_ap:.3f}')

    # record the result for each instrument; the report is computed once and
    # the positive-class metrics and the accuracy are read from it by key
    report_dict = classification_report(Y_test_inst, Y_pred_inst, output_dict=True)
    report = report_dict['True']
    report_accuracy = report_dict['accuracy']
    # float(): support was previously read from a float column, keep it a float
    result_inst = [instrument, train_set_name, test_set_name, report['precision'], report['recall'],
                   report['f1-score'], float(report['support']), report_accuracy, model_auc, model_ap,
                   embedding + debias_method]
    # DataFrame.append no longer exists in pandas 2.x; concat a one-row frame
    # instead (built from a list of rows so the metrics stay numeric)
    result_all = pd.concat([result_all, pd.DataFrame([result_inst], columns=result_all.columns)],
                           ignore_index=True)

# make sure the output directory exists before pickling the models
os.makedirs('models', exist_ok=True)
with open('models/models_' + train_set_name + '_' + embedding + debias_method + '.pickle', 'wb') as fdesc:
    pickle.dump(trained_models, fdesc)

----------------------------------------------------
cel
	TEST
              precision    recall  f1-score   support

       False       0.99      0.87      0.93       867
        True       0.35      0.88      0.50        68

    accuracy                           0.87       935
   macro avg       0.67      0.88      0.71       935
weighted avg       0.94      0.87      0.90       935

ROC-AUC = 0.927		AP = 0.686
----------------------------------------------------
cla
	TEST
              precision    recall  f1-score   support

       False       0.96      0.76      0.85       843
        True       0.25      0.74      0.37        92

    accuracy                           0.75       935
   macro avg       0.61      0.75      0.61       935
weighted avg       0.89      0.75      0.80       935

ROC-AUC = 0.833		AP = 0.393
----------------------------------------------------
flu
	TEST
              precision    recall  f1-score   support

       False       0.98      0.88      0.93   

# B -> A

In [12]:
# This part of the code follows the baseline model for instrument recognition on the openmic dataset:
# https://github.com/cosmir/openmic-2018/blob/master/examples/modeling-baseline.ipynb
#
# Train one binary classifier per instrument on the train split of B, evaluate
# it on the test split of A and record the metrics in result_all.
train_set_name, test_set_name = 'B', 'A'

# We'll iterate over all instrument classes, and fit a model for each one.
# After training, we'll print a classification report for each instrument.
for instrument in instrument_map:

    # binary targets: True where the label equals the current instrument
    X_train_inst = X_train_B
    Y_train_inst = Y_train_B == instrument
    X_test_inst = X_test_A
    Y_test_inst = Y_test_A == instrument

    # initialize a logistic regression model
    LRmodel = LogisticRegression(random_state=random_state, penalty='l2', solver='liblinear', class_weight='balanced')

    # hyperparameter tuning for the logistic regression model
    clf = GridSearchCV(LRmodel, param_grid=param_grid, cv=cv, scoring=scoring)

    # fit the model
    clf.fit(X_train_inst, Y_train_inst)

    # predict
    Y_pred_inst = clf.predict(X_test_inst)
    # Get prediction scores for the positive class
    Y_pred_scores = clf.predict_proba(X_test_inst)[:, 1]

    # print result for each instrument
    print('-' * 52)
    print(instrument)
    print('\tTEST')
    print(classification_report(Y_test_inst, Y_pred_inst))

    model_auc = roc_auc_score(Y_test_inst, Y_pred_scores)
    model_ap = average_precision_score(Y_test_inst, Y_pred_scores)
    print(f'ROC-AUC = {model_auc:.3f}\t\tAP = {model_ap:.3f}')

    # record the result for each instrument; the report is computed once and
    # the positive-class metrics and the accuracy are read from it by key
    report_dict = classification_report(Y_test_inst, Y_pred_inst, output_dict=True)
    report = report_dict['True']
    report_accuracy = report_dict['accuracy']
    # float(): support was previously read from a float column, keep it a float
    result_inst = [instrument, train_set_name, test_set_name, report['precision'], report['recall'],
                   report['f1-score'], float(report['support']), report_accuracy, model_auc, model_ap,
                   embedding + debias_method]
    # DataFrame.append no longer exists in pandas 2.x; concat a one-row frame
    # instead (built from a list of rows so the metrics stay numeric)
    result_all = pd.concat([result_all, pd.DataFrame([result_inst], columns=result_all.columns)],
                           ignore_index=True)

----------------------------------------------------
cel
	TEST
              precision    recall  f1-score   support

       False       0.97      0.97      0.97       728
        True       0.18      0.17      0.17        30

    accuracy                           0.94       758
   macro avg       0.57      0.57      0.57       758
weighted avg       0.93      0.94      0.94       758

ROC-AUC = 0.731		AP = 0.123
----------------------------------------------------
cla
	TEST
              precision    recall  f1-score   support

       False       0.98      0.94      0.96       723
        True       0.32      0.54      0.40        35

    accuracy                           0.93       758
   macro avg       0.65      0.74      0.68       758
weighted avg       0.95      0.93      0.93       758

ROC-AUC = 0.761		AP = 0.319
----------------------------------------------------
flu
	TEST
              precision    recall  f1-score   support

       False       0.98      0.90      0.93   

# Save results

In [13]:
# Save all results into a csv file named after the embedding and debias tag.
# result_all.to_csv('results/result_' + embedding + debias_method + '_trainboth.csv', index=False)
# to_csv does not create missing directories, so make sure 'results/' exists.
os.makedirs('results', exist_ok=True)
result_all.to_csv('results/result_' + embedding + debias_method + '.csv', index=False)

In [14]:
# # save all result into a csv file
# debias_method = ''
# df_1 = pd.read_csv('results/result_' + 'vggish' + '.csv')
# df_2 = pd.read_csv('results/result_' + 'openl3' + '.csv')
# df_3 = pd.read_csv('results/result_' + 'yamnet' + '.csv')

# df = df_1.append(df_2).append(df_3)
# df.to_csv('results/result_' + '3embedding' + debias_method + '.csv', index=False)

# df = pd.read_csv('results/result_' + '3embedding' + debias_method + '.csv')

# sns.set_theme(context='notebook', style='whitegrid', palette='deep')

# sns.catplot(data=df, row='train_set', col='test_set', y='instrument', x='roc_auc', hue='embedding', kind='bar', 
#             height=4, aspect=1.2)
# plt.xlim([.4,1])
# plt.savefig('results/' + '3embedding' + debias_method + '_ori_sameratio.pdf')