In [44]:
import os

# TF_CPP_MIN_LOG_LEVEL is read when TensorFlow's native runtime is loaded, so
# it must be set before `import tensorflow` (and before any package that
# imports TensorFlow) for the C++ log filtering to take effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import sys
import math
import pickle
import librosa
import warnings
import numpy as np
import pandas as pd
import tensorflow as tf
import soundfile as sf
import vggish.vggish as vggish
import IPython.display as ipd
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Python-side TensorFlow logger level. NOTE(review): 'INFO' still lets the
# "Restoring parameters" messages through; raise to 'ERROR' to silence them.
tf.get_logger().setLevel('INFO')
# Hide Python warnings for the whole notebook session.
warnings.filterwarnings('ignore')

# OPENMIC

In [3]:
# Aggregated OpenMIC-2018 labels. The preview below shows one row per
# (sample_key, instrument) pair with a relevance score and a response count.
df_openmic = pd.read_csv('openmic/openmic-2018-aggregated-labels.csv')
df_openmic.head()

Unnamed: 0,sample_key,instrument,relevance,num_responses
0,000046_3840,clarinet,0.17105,3
1,000046_3840,flute,0.0,3
2,000046_3840,trumpet,0.0,3
3,000135_483840,saxophone,0.14705,3
4,000135_483840,voice,1.0,3


In [4]:
# (rows, columns) of the OpenMIC label table.
df_openmic.shape

(41534, 4)

In [5]:
# Number of label rows per instrument, most frequent first.
df_openmic['instrument'].value_counts()

instrument
trumpet              2928
trombone             2783
mandolin             2477
ukulele              2437
clarinet             2398
saxophone            2377
banjo                2229
flute                2095
accordion            2093
violin               2047
cello                1960
organ                1901
bass                 1900
mallet_percussion    1814
drums                1759
cymbals              1749
piano                1733
guitar               1664
synthesizer          1614
voice                1576
Name: count, dtype: int64

In [50]:
# Sorted array of the distinct OpenMIC instrument names. Later cells use each
# name to open the matching pickled classifier, openmic/models/<name>.pkl.
instruments_openmic = np.unique(df_openmic['instrument'])

In [6]:
# OpenMIC partition files (split01), read without a header row. squeeze()
# collapses a single-column frame into a Series (presumably one sample key per
# line — TODO confirm against the files).
split_train = pd.read_csv('openmic/partitions/split01_train.csv', header=None).squeeze()
split_test = pd.read_csv('openmic/partitions/split01_test.csv', header=None).squeeze()

# Report the size of each partition.
print(f'Train: {len(split_train)}\nTest: {len(split_test)}')

Train: 14915
Test: 5085


# MTG-Jamendo

In [7]:
# Selected MTG-Jamendo tracks. The preview below shows a track id, a relative
# audio path and one instrument label per row.
df_jamendo = pd.read_csv('mtg-jamendo/selected-instruments.csv')
df_jamendo.head()

Unnamed: 0,track_id,path,instrument
0,track_0151154,54/151154.mp3,piano
1,track_1142627,27/1142627.mp3,piano
2,track_1400502,02/1400502.mp3,piano
3,track_0491052,52/491052.mp3,piano
4,track_0944140,40/944140.mp3,piano


In [8]:
# (rows, columns) of the MTG-Jamendo selection table.
df_jamendo.shape

(1240, 3)

In [9]:
# Number of selected tracks per instrument label, most frequent first.
df_jamendo['instrument'].value_counts()

instrument
acousticguitar    305
electricguitar    270
bass              246
drums             157
piano             150
voice             112
Name: count, dtype: int64

In [10]:
# Pre-computed MTG-Jamendo feature archive.
# NOTE: allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code — only use it on archives you produced or trust.
JAMENDO = np.load('mtg-jamendo/features.npz', allow_pickle=True)

In [11]:
# Unpack the archive: X holds the features, Y_true the 'Y_mask' label array and
# track_id one identifier per row. The shapes printed further down show
# (10, 128) per X row and 6 columns per label row.
X, Y_true, track_id = JAMENDO['X'], JAMENDO['Y_mask'], JAMENDO['track_id']

In [13]:
# Keep only the first 6151 rows of X (as an independent copy).
# NOTE(review): 6151 equals the number of rows later selected through track_id
# (4922 train + 1229 test), so this presumably aligns X with Y_true/track_id —
# confirm, and consider len(track_id) instead of the literal.
X = X[:6151].copy()

In [14]:
# Distinct MTG-Jamendo instrument names.
# NOTE(review): np.unique returns its values sorted, but the saved output below
# lists them unsorted — the stored output may come from an earlier version of
# this cell. class_map and the training loop depend on this order; confirm it
# matches the column order of Y_true.
instruments = np.unique(df_jamendo['instrument'])
instruments

array(['piano', 'voice', 'bass', 'electricguitar', 'drums',
       'acousticguitar'], dtype=object)

In [15]:
# Map each instrument name to its position in `instruments`. The training loop
# enumerates this dict and uses the position as the column index into the
# label arrays.
class_map = {inst: i for i, inst in enumerate(instruments)}
class_map

{'piano': 0,
 'voice': 1,
 'bass': 2,
 'electricguitar': 3,
 'drums': 4,
 'acousticguitar': 5}

## Training MTG-Jamendo

In [16]:
# Split the rows of df_jamendo 80/20 with a fixed seed; only the track ids
# (x_train / x_test) are used later to partition the feature rows.
# NOTE(review): the split is over table rows, so a track_id that appears in
# several rows could land in both partitions; the index-building cell checks
# train membership first, so such a track would be treated as train.
x_split = df_jamendo['track_id']
y_split = df_jamendo['instrument']
x_train, x_test, y_train, y_test = train_test_split(x_split, y_split, test_size=0.2, random_state=42)

In [17]:
# Number of table rows assigned to each partition.
print(f'Train: {len(x_train)}\nTest: {len(x_test)}')

Train: 992
Test: 248


In [18]:
# Sets of track ids per partition, for fast membership tests in the next cell.
train_set = set(x_train)
test_set = set(x_test)

In [19]:
# Assign every feature row to a partition through its parent track id.
# n[:-2] removes the last two characters of the row id (presumably a
# per-track segment suffix — TODO confirm). Train membership takes precedence
# when an id is present in both sets.
parent_ids = [n[:-2] for n in track_id]

idx_train = np.asarray(
    [row for row, parent in enumerate(parent_ids) if parent in train_set]
)
idx_test = np.asarray(
    [
        row
        for row, parent in enumerate(parent_ids)
        if parent not in train_set and parent in test_set
    ]
)

In [21]:
# Select the feature rows of each partition by position.
X_train = X[idx_train]
X_test = X[idx_test]

# Select the matching label rows; this assumes Y_true is row-aligned with X.
Y_train = Y_true[idx_train]
Y_test = Y_true[idx_test]

In [22]:
# Sanity check: features and labels must have the same number of rows per
# partition.
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'Y_train shape: {Y_train.shape}')
print(f'Y_test shape: {Y_test.shape}')

X_train shape: (4922, 10, 128)
X_test shape: (1229, 10, 128)
Y_train shape: (4922, 6)
Y_test shape: (1229, 6)


In [25]:
# Train one binary RandomForest per instrument (one-vs-rest) on the
# MTG-Jamendo features and keep the fitted classifiers in `models`, keyed by
# instrument name.
models = dict()

# idx is the position of the instrument in class_map and is used as the label
# column. NOTE(review): this assumes the label columns follow class_map order.
for idx, instrument in enumerate(class_map):
    train_inst = Y_train[:, idx]
    test_inst = Y_test[:, idx]
    
    # Rows labelled 1 are positives; every other value counts as a negative.
    X_train_inst = X_train[train_inst == 1]  
    X_train_other = X_train[train_inst != 1] 
    # Average over axis 1 (the size-10 axis) to get one 128-d vector per row.
    X_train_inst_sklearn = np.mean(X_train_inst, axis=1)
    X_train_other_sklearn = np.mean(X_train_other, axis=1)
    
    # Boolean targets, ordered positives first to match the concatenation below.
    Y_true_train_inst = np.ones(len(X_train_inst), dtype=bool)  
    Y_true_train_other = np.zeros(len(X_train_other), dtype=bool)  
    Y_true_train = np.concatenate((Y_true_train_inst, Y_true_train_other))
    
    # Same preparation for the test partition.
    X_test_inst = X_test[test_inst == 1]  
    X_test_other = X_test[test_inst != 1]  
    X_test_inst_sklearn = np.mean(X_test_inst, axis=1)
    X_test_other_sklearn = np.mean(X_test_other, axis=1)
    
    Y_true_test_inst = np.ones(len(X_test_inst), dtype=bool)  
    Y_true_test_other = np.zeros(len(X_test_other), dtype=bool)  
    Y_true_test = np.concatenate((Y_true_test_inst, Y_true_test_other))

    # Fixed random_state keeps the fitted forest reproducible.
    clf = RandomForestClassifier(max_depth=8, n_estimators=100, random_state=0)
    
    # Positives first, then negatives — the same order as Y_true_train.
    X_train_combined = np.concatenate((X_train_inst_sklearn, X_train_other_sklearn))
    clf.fit(X_train_combined, Y_true_train)

    Y_pred_train = clf.predict(X_train_combined)
    Y_pred_test = clf.predict(np.concatenate((X_test_inst_sklearn, X_test_other_sklearn)))
    
    # Per-instrument report on both partitions.
    print('-' * 52)
    print(instrument)
    print('\tTRAIN')
    print(classification_report(Y_true_train, Y_pred_train))
    print('\tTEST')
    print(classification_report(Y_true_test, Y_pred_test))
    
    models[instrument] = clf

----------------------------------------------------
piano
	TRAIN
              precision    recall  f1-score   support

       False       0.97      1.00      0.98      4316
        True       1.00      0.75      0.86       606

    accuracy                           0.97      4922
   macro avg       0.98      0.88      0.92      4922
weighted avg       0.97      0.97      0.97      4922

	TEST
              precision    recall  f1-score   support

       False       0.95      1.00      0.98      1095
        True       1.00      0.61      0.76       134

    accuracy                           0.96      1229
   macro avg       0.98      0.81      0.87      1229
weighted avg       0.96      0.96      0.95      1229

----------------------------------------------------
voice
	TRAIN
              precision    recall  f1-score   support

       False       0.96      1.00      0.98      4492
        True       1.00      0.51      0.68       430

    accuracy                           0.96 

In [26]:
# Disabled draft of the per-instrument training loop, kept for reference only.
# It indexes the features with the raw label values (X_train[train_inst])
# instead of a comparison against 1. Nothing in this cell is executed.

# models = dict()

# for idx, instrument in enumerate(class_map):
#     train_inst = Y_train[:, idx]
#     test_inst = Y_test[:, idx]
    
#     X_train_inst = X_train[train_inst]
#     X_train_inst_sklearn = np.mean(X_train_inst, axis=1)
#     Y_true_train_inst = np.where(train_inst == 1, True, False)#Y_train[train_inst, idx] >= 0.5
    
#     X_test_inst = X_test[test_inst]
#     X_test_inst_sklearn = np.mean(X_test_inst, axis=1)
#     Y_true_test_inst = Y_test[test_inst, idx] >= 0.5

#     clf = RandomForestClassifier(max_depth=8, n_estimators=100, random_state=0)
    
#     clf.fit(X_train_inst_sklearn, Y_true_train_inst)

#     Y_pred_train = clf.predict(X_train_inst_sklearn)
#     Y_pred_test = clf.predict(X_test_inst_sklearn)
    
#     print('-' * 52)
#     print(instrument)
#     print('\tTRAIN')
#     print(classification_report(Y_true_train_inst, Y_pred_train))
#     print('\tTEST')
#     print(classification_report(Y_true_test_inst, Y_pred_test))
    
#     models[instrument] = clf

In [27]:
def remove_silence(y, sr):
    """Return `y` with the intervals that librosa classifies as silent removed.

    The silence threshold is derived from the signal itself: the mean absolute
    dB level of the waveform is passed as `top_db` to `librosa.effects.split`,
    and the non-silent intervals it returns are joined back to back.

    Parameters
    ----------
    y : np.ndarray
        Audio samples, sliced along the first axis.
    sr : int
        Sampling rate. Not used by the computation; kept so that existing
        callers keep working.

    Returns
    -------
    np.ndarray
        Concatenation of the non-silent intervals, or an empty array when no
        interval is found.
    """
    db = librosa.core.amplitude_to_db(y)
    mean_db = np.abs(db).mean()
    intervals = librosa.effects.split(y=y, top_db=mean_db)

    # np.concatenate rejects an empty sequence, so handle "nothing found" here.
    if len(intervals) == 0:
        return np.array([])

    # Join the slices in one array operation instead of growing a Python list
    # one sample at a time, which is slow and memory-hungry for long audio.
    return np.concatenate([y[start:end] for start, end in intervals])

In [28]:
def get_segment(segment, split_duration, y, sr, duration):
    """Slice the `segment`-th window of `split_duration` seconds out of `y`.

    Parameters
    ----------
    segment : int
        Zero-based window index.
    split_duration : float
        Window length in seconds.
    y : sequence
        Audio samples; anything that supports slicing.
    sr : float
        Samples per second, used to convert times to sample positions.
    duration : float
        Total duration in seconds; the window end is clipped to it.

    Returns
    -------
    The slice of `y` between the window's start and (clipped) end sample.
    """
    window_start = segment * split_duration
    window_end = (segment + 1) * split_duration
    # Never read past the end of the audio.
    if window_end > duration:
        window_end = duration

    first, last = int(window_start * sr), int(window_end * sr)
    return y[first:last]

In [29]:
def split(y, sr, duration, split_duration=10):
    """Cut `y` into consecutive, complete windows of `split_duration` seconds.

    A trailing window shorter than `split_duration` is discarded, so every
    returned segment covers the full window length.

    Parameters
    ----------
    y : sequence
        Audio samples; anything that supports slicing.
    sr : float
        Samples per second.
    duration : float
        Total duration of `y` in seconds.
    split_duration : float, optional
        Window length in seconds (default 10).

    Returns
    -------
    list
        One slice of `y` per complete window, in order. Empty when
        `duration` is shorter than `split_duration`.

    Raises
    ------
    ValueError
        If `split_duration` is not positive.
    """
    if split_duration <= 0:
        raise ValueError('split_duration must be positive')

    # Floor division counts complete windows only. Using ceil(...) - 1 here
    # would also drop the last window when `duration` is an exact multiple of
    # `split_duration`, even though that window is complete.
    num_full_segments = int(duration // split_duration)

    return [
        get_segment(segment, split_duration, y, sr, duration)
        for segment in range(num_full_segments)
    ]

## Avaliando modelos MTG-Jamendo em áudios com separação de instrumentos

#### Áudios separados usando Demucs (Facebook)

In [32]:
# Load the separated drums stem, strip silence and cut the remainder into
# fixed-length windows (split() defaults to 10 s).
# NOTE: `segments` and `dur` are reused by every audio-loading cell, so the
# evaluation cell must run right after the matching loading cell.
audio_drums, sr_drums = librosa.load('raw/drums.mp3')
audio_splitted_drums = remove_silence(audio_drums, sr_drums)
dur = librosa.get_duration(y=audio_splitted_drums, sr=sr_drums)
segments = split(audio_splitted_drums, sr_drums, dur)

In [46]:
# Score every drums segment with each MTG-Jamendo classifier and count how
# often 'drums' gets the highest class-1 probability.
correct = 0

for segment in segments:
    # Only the second value returned by waveform_to_features is used.
    _, features = vggish.waveform_to_features(segment, sr_drums)
    # Average over axis 0, keeping a leading axis so predict_proba gets one row.
    feature_mean = np.mean(features, axis=0, keepdims=True)

    probs = []
    for instrument, clf in models.items():
        # Column 1 of predict_proba is the second class known to the classifier.
        prob = clf.predict_proba(feature_mean)[0, 1]
        print('P({:18s}=1) = {:.3f}'.format(instrument, prob))
        probs.append((instrument, prob))

    # The prediction for the segment is the instrument with the top probability.
    max_prob = max(probs, key=lambda pair: pair[1])
    correct += int(max_prob[0] == 'drums')

    print(max_prob)
    print('-' * 15)

print(f'{correct} correct assignments and {len(segments) - correct} wrong.')

INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt
P(piano             =1) = 0.098
P(voice             =1) = 0.029
P(bass              =1) = 0.334
P(electricguitar    =1) = 0.232
P(drums             =1) = 0.255
P(acousticguitar    =1) = 0.064
('bass', 0.3341668657438588)
---------------
INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt
P(piano             =1) = 0.040
P(voice             =1) = 0.026
P(bass              =1) = 0.399
P(electricguitar    =1) = 0.135
P(drums             =1) = 0.227
P(acousticguitar    =1) = 0.035
('bass', 0.39854688599326954)
---------------
INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt
P(piano             =1) = 0.047
P(voice             =1) = 0.036
P(bass              =1) = 0.413
P(electricguitar    =1) = 0.124
P(drums             =1) = 0.232
P(acousticguitar    =1) = 0.027
('bass', 0.4133

In [47]:
# Load the separated bass stem, strip silence and cut the remainder into
# fixed-length windows (split() defaults to 10 s).
# NOTE: this overwrites `segments` and `dur` from the drums cell.
audio_bass, sr_bass = librosa.load('raw/bass.mp3')
audio_splitted_bass = remove_silence(audio_bass, sr_bass)
dur = librosa.get_duration(y=audio_splitted_bass, sr=sr_bass)
segments = split(audio_splitted_bass, sr_bass, dur)

In [48]:
# Score every bass segment with each MTG-Jamendo classifier and count how
# often 'bass' gets the highest class-1 probability.
correct = 0

for segment in segments:
    # Only the second value returned by waveform_to_features is used.
    _, features = vggish.waveform_to_features(segment, sr_bass)
    # Average over axis 0, keeping a leading axis so predict_proba gets one row.
    feature_mean = np.mean(features, axis=0, keepdims=True)

    probs = []
    for instrument, clf in models.items():
        # Column 1 of predict_proba is the second class known to the classifier.
        prob = clf.predict_proba(feature_mean)[0, 1]
        print('P({:18s}=1) = {:.3f}'.format(instrument, prob))
        probs.append((instrument, prob))

    # The prediction for the segment is the instrument with the top probability.
    max_prob = max(probs, key=lambda pair: pair[1])
    correct += int(max_prob[0] == 'bass')

    print(max_prob)
    print('-' * 15)

print(f'{correct} correct assignments and {len(segments) - correct} wrong.')

INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt
P(piano             =1) = 0.017
P(voice             =1) = 0.024
P(bass              =1) = 0.272
P(electricguitar    =1) = 0.452
P(drums             =1) = 0.169
P(acousticguitar    =1) = 0.109
('electricguitar', 0.4519467530455954)
---------------
INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt
P(piano             =1) = 0.037
P(voice             =1) = 0.030
P(bass              =1) = 0.213
P(electricguitar    =1) = 0.392
P(drums             =1) = 0.151
P(acousticguitar    =1) = 0.105
('electricguitar', 0.3923828644316695)
---------------
INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt
P(piano             =1) = 0.074
P(voice             =1) = 0.044
P(bass              =1) = 0.190
P(electricguitar    =1) = 0.400
P(drums             =1) = 0.156
P(acousticguitar    =1) = 0.

## Avaliando modelos Openmic em áudios com separação de instrumentos

In [52]:
# Reload the drums stem for the OpenMIC evaluation. This repeats the earlier
# drums loading cell because `segments` was overwritten by the bass stem in
# between.
audio_drums, sr_drums = librosa.load('raw/drums.mp3')
audio_splitted_drums = remove_silence(audio_drums, sr_drums)
dur = librosa.get_duration(y=audio_splitted_drums, sr=sr_drums)
segments = split(audio_splitted_drums, sr_drums, dur)

In [54]:
# Score every drums segment with each pre-trained OpenMIC classifier and count
# how often 'drums' gets the highest class-1 probability.

# Load each pickled classifier once, instead of re-reading every file for
# every segment.
# NOTE: pickle.load can execute arbitrary code — only load model files that
# you created or trust.
openmic_models = {}
for instrument in instruments_openmic:
    with open(f'openmic/models/{instrument}.pkl', 'rb') as f:
        openmic_models[instrument] = pickle.load(f)

correct = 0

for segment in segments:
    # Only the second value returned by waveform_to_features is used.
    _, features = vggish.waveform_to_features(segment, sr_drums)
    # Average over axis 0, keeping a leading axis so predict_proba gets one row.
    feature_mean = np.mean(features, axis=0, keepdims=True)
    probs = []

    for instrument, clf in openmic_models.items():
        # Column 1 of predict_proba is the second class known to the classifier.
        prob = clf.predict_proba(feature_mean)[0,1]
        probs.append((instrument, prob))
        print('P({:18s}=1) = {:.3f}'.format(instrument, prob))

    # The prediction for the segment is the instrument with the top probability.
    max_prob = max(probs, key=lambda x: x[1])

    if max_prob[0] == 'drums':
        correct += 1

    print(max_prob)
    print('-' * 15)

print(f'{correct} correct assignments and {len(segments) - correct} wrong.')

INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt
P(accordion         =1) = 0.039
P(banjo             =1) = 0.085
P(bass              =1) = 0.627
P(cello             =1) = 0.027
P(clarinet          =1) = 0.054
P(cymbals           =1) = 0.912
P(drums             =1) = 0.930
P(flute             =1) = 0.068
P(guitar            =1) = 0.389
P(mallet_percussion =1) = 0.216
P(mandolin          =1) = 0.112
P(organ             =1) = 0.070
P(piano             =1) = 0.200
P(saxophone         =1) = 0.314
P(synthesizer       =1) = 0.564
P(trombone          =1) = 0.238
P(trumpet           =1) = 0.259
P(ukulele           =1) = 0.089
P(violin            =1) = 0.084
P(voice             =1) = 0.101
('drums', 0.9301910340249472)
---------------
INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt
P(accordion         =1) = 0.039
P(banjo             =1) = 0.060
P(bass              =1) = 0.539
P(cell

In [55]:
# Reload the bass stem for the OpenMIC evaluation; this overwrites `segments`
# and `dur` from the drums cell above.
audio_bass, sr_bass = librosa.load('raw/bass.mp3')
audio_splitted_bass = remove_silence(audio_bass, sr_bass)
dur = librosa.get_duration(y=audio_splitted_bass, sr=sr_bass)
segments = split(audio_splitted_bass, sr_bass, dur)

In [56]:
# Score every bass segment with each pre-trained OpenMIC classifier and count
# how often 'bass' gets the highest class-1 probability.

# Load each pickled classifier once, instead of re-reading every file for
# every segment.
# NOTE: pickle.load can execute arbitrary code — only load model files that
# you created or trust.
openmic_models = {}
for instrument in instruments_openmic:
    with open(f'openmic/models/{instrument}.pkl', 'rb') as f:
        openmic_models[instrument] = pickle.load(f)

correct = 0

for segment in segments:
    # Use the sampling rate of the bass stem these segments were cut from
    # (not sr_drums, which belongs to a different file).
    _, features = vggish.waveform_to_features(segment, sr_bass)
    # Average over axis 0, keeping a leading axis so predict_proba gets one row.
    feature_mean = np.mean(features, axis=0, keepdims=True)
    probs = []

    for instrument, clf in openmic_models.items():
        # Column 1 of predict_proba is the second class known to the classifier.
        prob = clf.predict_proba(feature_mean)[0,1]
        probs.append((instrument, prob))
        print('P({:18s}=1) = {:.3f}'.format(instrument, prob))

    # The prediction for the segment is the instrument with the top probability.
    max_prob = max(probs, key=lambda x: x[1])

    if max_prob[0] == 'bass':
        correct += 1

    print(max_prob)
    print('-' * 15)

print(f'{correct} correct assignments and {len(segments) - correct} wrong.')

INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt
P(accordion         =1) = 0.019
P(banjo             =1) = 0.129
P(bass              =1) = 0.793
P(cello             =1) = 0.061
P(clarinet          =1) = 0.069
P(cymbals           =1) = 0.698
P(drums             =1) = 0.579
P(flute             =1) = 0.015
P(guitar            =1) = 0.885
P(mallet_percussion =1) = 0.100
P(mandolin          =1) = 0.145
P(organ             =1) = 0.022
P(piano             =1) = 0.111
P(saxophone         =1) = 0.099
P(synthesizer       =1) = 0.559
P(trombone          =1) = 0.124
P(trumpet           =1) = 0.168
P(ukulele           =1) = 0.229
P(violin            =1) = 0.090
P(voice             =1) = 0.140
('guitar', 0.8850771053485441)
---------------
INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt
P(accordion         =1) = 0.009
P(banjo             =1) = 0.117
P(bass              =1) = 0.827
P(cel

## Avaliando modelos Openmic em áudios do MTG-Jamendo

In [58]:
# Run the pre-trained OpenMIC classifiers over the MTG-Jamendo feature rows and
# print the top-scoring instrument for every 100th row.

# Load each pickled classifier once; reloading inside the loop would repeat
# the file reads for every row of X.
# NOTE: pickle.load can execute arbitrary code — only load model files that
# you created or trust.
openmic_models = {}
for instrument in instruments_openmic:
    with open(f'openmic/models/{instrument}.pkl', 'rb') as f:
        openmic_models[instrument] = pickle.load(f)

for idx, sample in enumerate(X):
    probs = []
    # Average over axis 0, keeping a leading axis so predict_proba gets one row.
    feature_mean = np.mean(sample, axis=0, keepdims=True)
    for instrument, clf in openmic_models.items():
        probs.append((instrument, clf.predict_proba(feature_mean)[0,1]))
    # idx is the row position in X; only every 100th row is reported.
    if idx % 100 == 0:
        print(f'track {idx}: {max(probs, key=lambda x: x[1])}')

track 0: ('piano', 0.9819445785447907)
track 100: ('cymbals', 0.5920993466658403)
track 200: ('piano', 0.8643403439229549)
track 300: ('piano', 0.9940327501613011)
track 400: ('synthesizer', 0.864822353034603)
track 500: ('piano', 0.994858591704815)
track 600: ('piano', 0.9948073971655659)
track 700: ('piano', 0.9947824246173812)
track 800: ('voice', 0.9280267046538466)
track 900: ('synthesizer', 0.9476000206876471)
track 1000: ('voice', 0.8778806001227599)
track 1100: ('voice', 0.5869546830646616)
track 1200: ('synthesizer', 0.7289733561604167)
track 1300: ('guitar', 0.9881444147002345)
track 1400: ('drums', 0.8375779812204818)
track 1500: ('cymbals', 0.7576966583998561)
track 1600: ('drums', 0.8027051484851206)
track 1700: ('drums', 0.9359719082508033)
track 1800: ('guitar', 0.9235576093604213)
track 1900: ('guitar', 0.9868575932639908)
track 2000: ('guitar', 0.962082590706353)
track 2100: ('guitar', 0.9124725820366361)
track 2200: ('cymbals', 0.643591259233755)
track 2300: ('guitar'