# PureMic Dataset Baseline

In [12]:
import json
import numpy as np
import pandas as pd
import os
import pickle


from tqdm import tqdm
import collections
import matplotlib.pyplot as plt
import matplotlib



import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam,SGD

from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve,average_precision_score, precision_recall_curve, auc, make_scorer, recall_score, precision_score, confusion_matrix

# Root folder holding the datasets, the trained models and the cached scores.
# Raw string: the Windows path contains "\C" and "\P", which are not valid
# escape sequences and raise a warning in an ordinary string literal.
DATA_ROOT = r'D:\CastelBranco\PAPER'
# When True, the model cells load saved weights/scores instead of re-training.
all_files = True

## Loading the data
The PureMic data is provided in a Python-friendly format (a NumPy `.npz` archive), just like OpenMIC-2018:

In [13]:
# Open the PureMic archive stored under DATA_ROOT.
puremic_path = os.path.join(DATA_ROOT, 'PureMic.npz')
PUREMIC = np.load(puremic_path)

In [14]:
# Show the names of the arrays stored in the archive.
print([key for key in PUREMIC.keys()])

['X', 'Y', 'sample_key']


## What's included in the data?
- X: 1050 × 91 × 128 array of VGGish features
    - First index (0..1049) corresponds to the sample key
    - Second index (0..90) corresponds to the 91 time frames, taken with a 100 ms hop (each time slice is 960 ms long).
    - Third index (0..127) corresponds to the VGGish features at each point in the 10 s clip
    - Example: X[40, 8] is the 128-dimensional feature vector for the 9th time slice in the 41st example

- Y: 1050 × 21 one-hot encoded label array
    - First index corresponds to sample key, as above
    - Second index corresponds to the label class (accordion, ..., zilence)
    - Example: Y[40, 4] is 1 if example #41 belongs to the 5th class (clarinet), and 0 otherwise

- sample_key: 1050 array of sample key strings
    - Example: sample_key[40] is the sample key for example #41

In [16]:
# Pull the three stored arrays out of the archive, one at a time.
X_PureMic = PUREMIC['X']
Y_PureMic = PUREMIC['Y']
sample_key_PureMic = PUREMIC['sample_key']

In [19]:
# Read the class-name -> class-index mapping shipped with the dataset.
with open(os.path.join(DATA_ROOT, 'class-map.json'), 'r') as f:
    class_map = json.load(f)

#%%
# classes[i] has to be the name of class index i: it is indexed with label
# positions and passed as `target_names` further down. Order the names by their
# mapped index instead of trusting the key order of the JSON file / dict.
classes = np.array(sorted(class_map, key=class_map.get))
print(class_map)

{'accordion': 0, 'banjo': 1, 'bass': 2, 'cello': 3, 'clarinet': 4, 'cymbals': 5, 'drums': 6, 'flute': 7, 'guitar': 8, 'mallet_percussion': 9, 'mandolin': 10, 'organ': 11, 'piano': 12, 'saxophone': 13, 'synthesizer': 14, 'trombone': 15, 'trumpet': 16, 'ukulele': 17, 'violin': 18, 'voice': 19, 'zilence': 20}


# Loading train and test splits
PureMic also provides a pre-defined train-test split. The sets are perfectly balanced.

In [20]:
# Sample keys of the pre-defined train / test partitions.
# `read_csv(..., squeeze=True)` is deprecated (and gone in pandas 2.0); squeezing
# the single-column frame afterwards gives the same Series.
split_train = pd.read_csv(os.path.join(DATA_ROOT, 'train_split.csv'),
                          header=None).squeeze("columns")

split_test = pd.read_csv(os.path.join(DATA_ROOT, 'test_split.csv'),
                         header=None).squeeze("columns")

# Sets give constant-time membership tests in the loop below.
train_set = set(split_train)
test_set = set(split_test)


idx_train, idx_test = [], []

# Route every clip position to the partition that lists its sample key.
for idx, n in enumerate(sample_key_PureMic):
    if n in train_set:
        idx_train.append(idx)
    elif n in test_set:
        idx_test.append(idx)
    else:
        # This should never happen, but better safe than sorry.
        # `n` already is the sample key, so report it directly (indexing the
        # key array with it would fail before the message could be built).
        raise RuntimeError('Unknown sample key={}! Abort!'.format(n))

# Finally, cast the idx_* arrays to numpy structures
idx_train = np.asarray(idx_train)
idx_test = np.asarray(idx_test)


# Finally, we use the split indices to partition the features and labels
X_train_pm = X_PureMic[idx_train]
X_test_pm = X_PureMic[idx_test]

Y_train_pm = Y_PureMic[idx_train]
Y_test_pm = Y_PureMic[idx_test]

In [22]:
# Size of each partition (80% train / 20% test)
n_train, n_test = len(split_train), len(split_test)
print('# Train: {},  # Test: {}'.format(n_train, n_test))

# Train: 840,  # Test: 210


In [21]:
# Shape of the train features, then of the test features
for split_features in (X_train_pm, X_test_pm):
    print(split_features.shape)

(840, 91, 128)
(210, 91, 128)


In [24]:
# Average over axis 1 (the frame axis) so every clip becomes a single vector.
X_PM_MEAN_TRAIN = X_train_pm.mean(axis=1)
X_PM_MEAN_TEST = X_test_pm.mean(axis=1)

In [25]:
# Build (and either load or fit) NN_MEAN: a dense 128 -> 512 -> 256 -> 21 softmax
# classifier fed with the time-averaged clip features computed above.
# Reset the Keras / TensorFlow state before building a new model.
# NOTE(review): tf.reset_default_graph() is a TF 1.x call — confirm the TF
# version this notebook targets.
tf.keras.backend.clear_session()
tf.reset_default_graph()

num_classes=21
img_input = Input(shape=(128,))
n=Dense(512, activation='sigmoid')(img_input)
n=Dense(256, activation='sigmoid')(n)
n=Dense(num_classes, activation='softmax')(n)
NN_MEAN=Model(img_input,n)

opt= SGD(lr=0.001)
NN_MEAN.compile(loss='categorical_crossentropy', optimizer=opt,metrics=['accuracy'])


# Stop once val_loss has not improved by at least 1e-3 for 8 epochs.
monitor = EarlyStopping(monitor='val_loss', 
                        min_delta=1e-3, 
                        patience=8, verbose=2, mode='auto')
# NOTE(review): this callback waits 10 stagnant epochs (with a looser min_delta)
# while early stopping above fires after 8, so the learning rate presumably never
# gets reduced before training stops — confirm this is intended.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, 
                                            verbose=0, mode='auto', 
                                            min_delta=0.0001, cooldown=0, min_lr=0)
# all_files=True reuses the saved weights; otherwise train and save them.
if all_files:
    NN_MEAN.load_weights(os.path.join(DATA_ROOT,'models','NN_MEAN.h5'))
else:
    # NOTE(review): the test split is passed as validation data, so early
    # stopping is driven by the same clips that are reported below.
    NN_MEAN.fit(X_PM_MEAN_TRAIN,Y_train_pm,validation_data=(X_PM_MEAN_TEST,Y_test_pm),callbacks=[monitor,reduce_lr],verbose=2,epochs=2000)
    NN_MEAN.save_weights(os.path.join(DATA_ROOT,'models','NN_MEAN.h5'))    

# Class scores for the averaged test clips, computed in a single batch.
scores_pm_mean=NN_MEAN.predict_on_batch(X_PM_MEAN_TEST)

# Compare the arg-max of the one-hot labels with the arg-max of the scores.
print(classification_report(np.argmax(Y_test_pm,axis=1), np.argmax(scores_pm_mean,axis=1),target_names=classes))

                   precision    recall  f1-score   support

        accordion       1.00      1.00      1.00        10
            banjo       1.00      1.00      1.00        10
             bass       1.00      1.00      1.00        10
            cello       1.00      0.90      0.95        10
         clarinet       1.00      0.90      0.95        10
          cymbals       1.00      0.70      0.82        10
            drums       0.80      0.80      0.80        10
            flute       1.00      1.00      1.00        10
           guitar       0.91      1.00      0.95        10
mallet_percussion       0.91      1.00      0.95        10
         mandolin       1.00      0.90      0.95        10
            organ       1.00      1.00      1.00        10
            piano       1.00      0.90      0.95        10
        saxophone       0.90      0.90      0.90        10
      synthesizer       0.91      1.00      0.95        10
         trombone       1.00      1.00      1.00       

In [26]:
#%% Create PM_ALL
# Explode every clip into its individual frames. Each frame inherits the label
# of the clip it came from and is tagged with [sample key, frame position].
frame_owner = [(clip_idx, frame_idx)
               for clip_idx, clip in enumerate(X_PureMic)
               for frame_idx in range(len(clip))]

X_PM_ALL = np.array([X_PureMic[clip_idx][frame_idx]
                     for clip_idx, frame_idx in frame_owner])
Y_PM_ALL = np.array([Y_PureMic[clip_idx] for clip_idx, _ in frame_owner])
samp_info_all_pm = np.array([[sample_key_PureMic[clip_idx], frame_idx]
                             for clip_idx, frame_idx in frame_owner])

print('Size of PM_ALL: {}'.format(len(X_PM_ALL)))

Size of PM_ALL: 95550


In [29]:
#%% Turn each one-hot row back into its class name to inspect the class distribution
pm_labels = [classes[np.flatnonzero(one_hot == 1)[0]] for one_hot in Y_PM_ALL]

collections.Counter(pm_labels)

Counter({'accordion': 4550,
         'banjo': 4550,
         'bass': 4550,
         'cello': 4550,
         'clarinet': 4550,
         'cymbals': 4550,
         'drums': 4550,
         'flute': 4550,
         'guitar': 4550,
         'mallet_percussion': 4550,
         'mandolin': 4550,
         'organ': 4550,
         'piano': 4550,
         'saxophone': 4550,
         'synthesizer': 4550,
         'trombone': 4550,
         'trumpet': 4550,
         'ukulele': 4550,
         'violin': 4550,
         'voice': 4550,
         'zilence': 4550})

In [30]:
# Score every single frame of PM_ALL with the clip-level NN_MEAN model
scores_pm_all = NN_MEAN.predict(x=X_PM_ALL)

In [32]:
#create PM_1
# Frame-level relabelling driven by the NN_MEAN frame scores:
#   * a frame whose silence score exceeds thresh_sil is relabelled as silence,
#     whatever the class of the clip it came from;
#   * otherwise the frame keeps its clip label, but only if the score of that
#     class exceeds thresh_up;
#   * every other frame is dropped.
thresh_sil = 0.5
thresh_up = 0.2

# One-hot vector of the silence class, derived from the class map instead of
# hard-coding its position and the number of classes.
sil_idx = class_map['zilence']
sil_label = np.zeros(len(class_map), dtype=np.int32)
sil_label[sil_idx] = 1

X_PM1 = []
Y_PM1 = []
N_PM1 = []

for idx in range(len(X_PM_ALL)):

    if scores_pm_all[idx, sil_idx] > thresh_sil:  # look for silence instances in all classes
        X_PM1.append(X_PM_ALL[idx])
        Y_PM1.append(sil_label)
        N_PM1.append(samp_info_all_pm[idx])

    else:
        classe = np.where(Y_PM_ALL[idx] == 1)[0][0]  # class index of the current instance
        proba = scores_pm_all[idx, classe]           # score of that class only
        if proba > thresh_up:  # even within its own class the activation must exceed thresh_up
            X_PM1.append(X_PM_ALL[idx])
            Y_PM1.append(Y_PM_ALL[idx])
            N_PM1.append(samp_info_all_pm[idx])


X_PM1 = np.array(X_PM1)
Y_PM1 = np.array(Y_PM1)
N_PM1 = np.array(N_PM1)

# Report the size of the filtered set (the label used to say PM_ALL by mistake).
print('Size of PM1: {}'.format(len(X_PM1)))

Size of PM_ALL: 84020


In [33]:
# Map the one-hot rows of PM1 back to class names to inspect the distribution.
# Compared with PM_ALL, silence should gain instances and every other class lose some.
pm1_labels = [classes[np.flatnonzero(encoded == 1)[0]] for encoded in Y_PM1]

collections.Counter(pm1_labels)

Counter({'accordion': 4273,
         'zilence': 4809,
         'banjo': 4126,
         'bass': 4020,
         'cello': 3673,
         'clarinet': 3691,
         'cymbals': 3781,
         'drums': 4156,
         'flute': 4145,
         'guitar': 4052,
         'mallet_percussion': 4036,
         'mandolin': 4188,
         'organ': 3995,
         'piano': 4053,
         'saxophone': 3273,
         'synthesizer': 3945,
         'trombone': 3901,
         'trumpet': 3683,
         'ukulele': 3932,
         'violin': 3981,
         'voice': 4307})

In [34]:
#split again in train-test, this time with labels per instance and not per clip
idx_train, idx_test = [], []

# Each row of N_PM1 is [sample key, frame position]; the key decides the split.
for idx, n in enumerate(N_PM1):
    if n[0] in train_set:
        idx_train.append(idx)
    elif n[0] in test_set:
        idx_test.append(idx)
    else:
        # Report the offending key itself: n[0] already is the sample key, and
        # indexing the key array with the whole row would fail first.
        raise RuntimeError('Unknown sample key={}! Abort!'.format(n[0]))

idx_train_pm1 = np.asarray(idx_train)
idx_test_pm1 = np.asarray(idx_test)

X_PM1_TRAIN = X_PM1[idx_train_pm1]
Y_PM1_TRAIN = Y_PM1[idx_train_pm1]

X_PM1_TEST = X_PM1[idx_test_pm1]
Y_PM1_TEST = Y_PM1[idx_test_pm1]

In [35]:
# Build (and either load or fit) NN_ALL: a dense 128 -> 4096 -> 2048 -> 21
# softmax classifier working on single frames of PM1.
tf.keras.backend.clear_session()
tf.reset_default_graph()

num_classes=21
img_input = Input(shape=(128,))
n=Dense(4096, activation='sigmoid')(img_input)
n=Dense(2048, activation='sigmoid')(n)
n=Dense(num_classes, activation='softmax')(n)
NN_ALL=Model(img_input,n)

# Plain SGD is the optimizer actually used; the Adam instance that used to be
# created here was overwritten before it was ever passed to compile().
opt = SGD(lr=0.001)
NN_ALL.compile(loss='categorical_crossentropy', optimizer=opt,metrics=['accuracy'])


monitor = EarlyStopping(monitor='val_loss',
                        min_delta=1e-3,
                        patience=8, verbose=2, mode='auto')

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10,
                              verbose=0, mode='auto',
                              min_delta=0.0001, cooldown=0, min_lr=0)

# NOTE(review): both branches rebind `scores_pm_all`, which until this cell held
# the NN_MEAN scores of every PM_ALL frame used to build PM1. Re-running the PM1
# cell after this one would silently pick up these test-set scores instead.
if all_files:
    NN_ALL.load_weights(os.path.join(DATA_ROOT,'models','NN_ALL.h5'))
    # NOTE(security): unpickling can execute arbitrary code; only load score
    # files that you produced yourself.
    with open(os.path.join(DATA_ROOT,'scores','scores_pm_all'), "rb") as fp:   # Unpickling
        scores_pm_all = pickle.load(fp)
else:
    NN_ALL.fit(X_PM1_TRAIN,Y_PM1_TRAIN,validation_data=(X_PM1_TEST,Y_PM1_TEST),callbacks=[monitor,reduce_lr],verbose=2,epochs=1000)
    NN_ALL.save_weights(os.path.join(DATA_ROOT,'models','NN_ALL.h5'))
    scores_pm_all=NN_ALL.predict(X_PM1_TEST)
    with open(os.path.join(DATA_ROOT,'scores','scores_pm_all'), "wb") as fp:   # Pickling
        pickle.dump(scores_pm_all, fp,protocol=4)




# Frame-level report on the PM1 test frames.
print(classification_report(np.argmax(Y_PM1_TEST,axis=1), np.argmax(scores_pm_all,axis=1),target_names=classes))



                   precision    recall  f1-score   support

        accordion       0.96      0.99      0.98       887
            banjo       1.00      0.99      1.00       799
             bass       0.96      0.97      0.96       793
            cello       0.94      0.94      0.94       682
         clarinet       0.94      0.89      0.91       703
          cymbals       0.98      0.93      0.96       567
            drums       0.95      0.98      0.96       699
            flute       0.97      0.99      0.98       870
           guitar       0.99      0.86      0.92       735
mallet_percussion       0.99      0.96      0.98       776
         mandolin       0.96      0.94      0.95       822
            organ       0.96      1.00      0.98       736
            piano       0.96      0.98      0.97       678
        saxophone       0.85      0.94      0.89       578
      synthesizer       0.96      0.97      0.97       799
         trombone       0.93      0.95      0.94       

## PM2
PM2 is the final dataset, as explained in (ref paper).
PM2 contains examples from AudioSet (excluding the clips already in PureMic) and from the training set of OpenMIC. As those datasets are very large, we only release the final result of PM2:

In [51]:

# Read the released PM2 bundle, a pickled (features, labels) pair.
# NOTE: unpickling runs code embedded in the file, so only open a PM2 file
# obtained from a source you trust.
pm2_path = os.path.join(DATA_ROOT,'PM2')
with open(pm2_path, "rb") as fp:
    X_PM2, Y_PM2 = pickle.load(fp)

In [52]:
# How many PM2 instances carry each class label
collections.Counter(label for label in Y_PM2)

Counter({'accordion': 50000,
         'banjo': 50000,
         'bass': 27536,
         'cello': 50000,
         'clarinet': 28355,
         'cymbals': 35886,
         'drums': 50000,
         'flute': 50000,
         'guitar': 50000,
         'mallet_percussion': 50000,
         'mandolin': 50000,
         'organ': 50000,
         'piano': 50000,
         'saxophone': 50000,
         'synthesizer': 50000,
         'trombone': 50000,
         'trumpet': 50000,
         'ukulele': 50000,
         'violin': 50000,
         'voice': 50000,
         'zilence': 50000})

In [53]:



from sklearn.preprocessing import LabelBinarizer

# One-hot encode the string labels of PM2.
# NOTE(review): LabelBinarizer orders its columns by sorted label value; this
# presumably lines up with the class_map indices (alphabetical, 'zilence'
# last) — verify before changing any class name.
encoder = LabelBinarizer()
PM2_y_binary = encoder.fit_transform(Y_PM2)

# Build (and either load or fit) NN_FINAL: same 128 -> 4096 -> 2048 -> 21
# architecture as NN_ALL, but trained on PM2 and validated on the PM1 test frames.
tf.keras.backend.clear_session()
tf.reset_default_graph()

num_classes=21
img_input = Input(shape=(128,))
n=Dense(4096, activation='sigmoid')(img_input)
n=Dense(2048, activation='sigmoid')(n)
n=Dense(num_classes, activation='softmax')(n)
NN_FINAL=Model(img_input,n)

# Plain SGD is the optimizer actually used; the Adam instance that used to be
# created here was overwritten before it was ever passed to compile().
opt = SGD(lr=0.001)
NN_FINAL.compile(loss='categorical_crossentropy', optimizer=opt,metrics=['accuracy'])


monitor = EarlyStopping(monitor='val_loss',
                        min_delta=1e-3,
                        patience=5, verbose=2, mode='auto')

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10,
                              verbose=0, mode='auto',
                              min_delta=0.0001, cooldown=0, min_lr=0)

if all_files:
    NN_FINAL.load_weights(os.path.join(DATA_ROOT,'models','NN_FINAL.h5'))
    # NOTE(security): unpickling can execute arbitrary code; only load score
    # files that you produced yourself.
    with open(os.path.join(DATA_ROOT,'scores','scores_pm_final'), "rb") as fp:   # Unpickling
        scores_pm_final = pickle.load(fp)

else:
    NN_FINAL.fit(X_PM2,PM2_y_binary,validation_data=(X_PM1_TEST,Y_PM1_TEST),callbacks=[monitor,reduce_lr],verbose=2,epochs=1000)
    NN_FINAL.save_weights(os.path.join(DATA_ROOT,'models','NN_FINAL.h5'))
    scores_pm_final=NN_FINAL.predict(X_PM1_TEST)
    with open(os.path.join(DATA_ROOT,'scores','scores_pm_final'), "wb") as fp:   # Pickling
        pickle.dump(scores_pm_final, fp,protocol=4)


# Frame-level report on the PM1 test frames.
print(classification_report(np.argmax(Y_PM1_TEST,axis=1),np.argmax(scores_pm_final,axis=1),target_names=classes))


                   precision    recall  f1-score   support

        accordion       0.97      0.99      0.98       887
            banjo       0.99      1.00      1.00       799
             bass       0.96      0.96      0.96       793
            cello       0.95      0.93      0.94       682
         clarinet       0.92      0.89      0.91       703
          cymbals       0.98      0.93      0.96       567
            drums       0.94      0.98      0.96       699
            flute       0.97      0.99      0.98       870
           guitar       0.99      0.91      0.95       735
mallet_percussion       0.99      0.98      0.99       776
         mandolin       0.96      0.94      0.95       822
            organ       0.97      1.00      0.99       736
            piano       0.96      0.99      0.97       678
        saxophone       0.87      0.91      0.89       578
      synthesizer       0.98      0.97      0.97       799
         trombone       0.94      0.95      0.95       

In [47]:
#Openmic missing labels
OPENMIC = np.load(os.path.join(DATA_ROOT,'OpenMIC', 'openmic-2018.npz'))
X_om_final, Y_true_om_final, Y_mask_om_final, sample_key_om_final = OPENMIC['X'], OPENMIC['Y_true'], OPENMIC['Y_mask'], OPENMIC['sample_key']

# `read_csv(..., squeeze=True)` is deprecated (and gone in pandas 2.0); squeezing
# the single-column frame afterwards gives the same Series.
split_train = pd.read_csv(os.path.join(DATA_ROOT, 'OpenMIC','split01_train.csv'),
                          header=None).squeeze("columns")
split_test = pd.read_csv(os.path.join(DATA_ROOT, 'OpenMIC','split01_test.csv'),
                         header=None).squeeze("columns")

train_set = set(split_train)
test_set = set(split_test)

idx_train, idx_test = [], []

# Walk over the OpenMIC sample keys loaded above. The loop used to read `OM[2]`,
# a name that is never defined in this notebook and fails on a fresh kernel.
# Keys that appear in neither split are skipped.
for idx, n in enumerate(sample_key_om_final):
    if n in train_set:
        idx_train.append(idx)
    elif n in test_set:
        idx_test.append(idx)

idx_train = np.asarray(idx_train)
idx_test = np.asarray(idx_test)

X_train_om = X_om_final[idx_train]
X_test_om = X_om_final[idx_test]


Y_train_om = Y_true_om_final[idx_train]
Y_test_om = Y_true_om_final[idx_test]

# Count the (clip, instrument) entries by label value: exactly 0.5 is treated
# as "no label", above 0.5 as positive, below 0.5 as negative.
unlabelled = int(np.sum(Y_true_om_final == 0.5))
positives = int(np.sum(Y_true_om_final > 0.5))
negatives = int(np.sum(Y_true_om_final < 0.5))


print('{} missing labels, {} negatives, {} positives'.format(unlabelled,negatives,positives))

358732 missing labels, 23654 negatives, 17614 positives


In [41]:
# Load the pre-computed, already averaged scores and the proposed thresholds
scores_dir = os.path.join(DATA_ROOT,'scores')
test_scores = np.load(os.path.join(scores_dir,'omic_test_mean_scores.npy'))
train_scores = np.load(os.path.join(scores_dir,'omic_train_mean_scores.npy'))
proposed_thresholds = np.load(os.path.join(scores_dir,'proposed_thresholds.npy'))

In [49]:
# Build new training labels from the proposed thresholds.
# An existing label is kept as is; a missing one (0.5) becomes positive when the
# clip score exceeds the threshold of that instrument, and stays 0.5 otherwise.
# You can try different thresholds; to generate negative labels too, add a
# branch that appends 0 (and increments count_neg) when the score is below a
# value of your choice.
new_omic_y = []
count_pos = 0
count_neg = 0
for clip_idx, clip_scores in enumerate(train_scores):
    clip_labels = []
    for inst in range(20):
        known = Y_train_om[clip_idx, inst]
        if known != 0.5:
            clip_labels.append(known)
        elif clip_scores[inst] > proposed_thresholds[inst]:
            clip_labels.append(1)
            count_pos += 1
        else:
            clip_labels.append(0.5)
    new_omic_y.append(clip_labels)
new_omic_y = np.array(new_omic_y)

print('{} new positive labels for train_set'.format(count_pos))

1646 new positive labels


## OpenMIC baseline
The following code runs the OpenMIC baseline with the new labels. The original baseline is available at https://github.com/cosmir/openmic-2018/blob/master/examples/modeling-baseline.ipynb so you can compare the results.
The mAP (REF PAPER) of the OpenMIC baseline is **0.66**.

The newly proposed labels improve on the **original** OpenMIC baseline. To validate this, we only changed the training-set labels and kept the test set exactly the same, so the results are directly comparable. The new mAP is **0.68** with **1946** new positive labels.

In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score,recall_score,f1_score,average_precision_score

# One binary random forest per OpenMIC instrument, trained on the time-averaged
# features of the clips that carry a label for that instrument. The fitted
# classifiers are kept in `models`, keyed by instrument name.
models = dict()
AP_TOT = 0
n_instruments = 20

for inst in range(n_instruments):

    instrument = classes[inst]

    ###TRAIN: keep the clips whose (new) label for this instrument is not 0.5
    score_indexes = new_omic_y[:, inst] != 0.5
    inst_y_train = new_omic_y[score_indexes, inst]
    inst_x_train = X_train_om[score_indexes]

    # Binarize: 1.0 where the label is above 0.5, 0.0 otherwise.
    inst_binary_scores_train = (inst_y_train > 0.5).astype(np.float64)

    ###TEST: same selection, using the untouched original test labels
    score_indexes = Y_test_om[:, inst] != 0.5
    inst_y_test = Y_test_om[score_indexes, inst]
    inst_x_test = X_test_om[score_indexes]

    inst_binary_scores_test = (inst_y_test > 0.5).astype(np.float64)

    # Average over axis 1 (the frame axis) to get one vector per clip.
    X_train = np.mean(inst_x_train, axis=1)
    X_test = np.mean(inst_x_test, axis=1)

    clf = RandomForestClassifier(max_depth=8, n_estimators=100, random_state=0)

    clf.fit(X_train, inst_binary_scores_train)

    # Keep the fitted classifier; the dict was previously created but left empty.
    models[instrument] = clf

    Y_pred_train = clf.predict(X_train)
    Y_pred_test = clf.predict(X_test)

    print('-' * 52)
    print(instrument)
    print('\tTRAIN')
    print(classification_report(inst_binary_scores_train, Y_pred_train))
    print('\tTEST')
    print(classification_report(inst_binary_scores_test, Y_pred_test))

    # NOTE(review): AP is computed from hard 0/1 predictions. Ranking metrics are
    # normally fed continuous scores (e.g. clf.predict_proba(X_test)[:, 1]);
    # left unchanged so the mAP stays comparable with the figures quoted in this
    # notebook — confirm which variant the reference numbers used.
    AP = average_precision_score(inst_binary_scores_test, Y_pred_test, average=None)
    AP_TOT = AP_TOT + AP

print('mAP: {}'.format(AP_TOT / n_instruments))

----------------------------------------------------
accordion
	TRAIN
              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95      1159
         1.0       1.00      0.73      0.84       462

   micro avg       0.92      0.92      0.92      1621
   macro avg       0.95      0.86      0.90      1621
weighted avg       0.93      0.92      0.92      1621

	TEST
              precision    recall  f1-score   support

         0.0       0.85      0.97      0.91       423
         1.0       0.78      0.35      0.48       115

   micro avg       0.84      0.84      0.84       538
   macro avg       0.82      0.66      0.69       538
weighted avg       0.83      0.84      0.81       538


----------------------------------------------------
banjo
	TRAIN
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      1148
         1.0       0.97      0.93      0.95       614

   micro avg       0.97      0.97      


	TEST
              precision    recall  f1-score   support

         0.0       0.78      0.94      0.85       310
         1.0       0.67      0.31      0.43       121

   micro avg       0.76      0.76      0.76       431
   macro avg       0.72      0.63      0.64       431
weighted avg       0.75      0.76      0.73       431


----------------------------------------------------
piano
	TRAIN
              precision    recall  f1-score   support

         0.0       0.99      0.94      0.96       420
         1.0       0.98      1.00      0.99      1191

   micro avg       0.98      0.98      0.98      1611
   macro avg       0.99      0.97      0.98      1611
weighted avg       0.98      0.98      0.98      1611

	TEST
              precision    recall  f1-score   support

         0.0       0.99      0.78      0.88       130
         1.0       0.91      1.00      0.95       285

   micro avg       0.93      0.93      0.93       415
   macro avg       0.95      0.89      0.91     