In [1]:
import os
import sys
import math
import librosa
import warnings
import numpy as np
import pandas as pd
import tensorflow as tf
import soundfile as sf
import vggish.vggish as vggish
import IPython.display as ipd
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

2023-06-02 11:08:17.209543: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-02 11:08:18.740338: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-02 11:08:18.744914: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
warnings.filterwarnings('ignore')

# OPENMIC

In [99]:
df = pd.read_csv('raw/openmic-2018-aggregated-labels.csv')
df.head()

Unnamed: 0,sample_key,instrument,relevance,num_responses
0,000046_3840,clarinet,0.17105,3
1,000046_3840,flute,0.0,3
2,000046_3840,trumpet,0.0,3
3,000135_483840,saxophone,0.14705,3
4,000135_483840,voice,1.0,3


In [100]:
df.shape

(41534, 4)

In [101]:
instruments = ['bass', 'drums', 'guitar', 'piano', 'synthesizer', 'voice']

In [102]:
df_sample = df[df['instrument'].isin(instruments)]
df_sample.drop(columns=['num_responses'], inplace=True)
df_sample.reset_index(inplace=True)
df_sample.head()

Unnamed: 0,index,sample_key,instrument,relevance
0,4,000135_483840,voice,1.0
1,13,000178_3840,voice,1.0
2,14,000182_145920,piano,0.0
3,15,000182_145920,voice,1.0
4,16,000189_207360,guitar,0.16665


In [103]:
df_sample['instrument'].value_counts()

instrument
bass           1900
drums          1759
piano          1733
guitar         1664
synthesizer    1614
voice          1576
Name: count, dtype: int64

In [104]:
OPENMIC = np.load('raw/openmic-2018.npz', allow_pickle=True)

In [105]:
X, Y_true, Y_mask, sample_key = OPENMIC['X'], OPENMIC['Y_true'], OPENMIC['Y_mask'], OPENMIC['sample_key']

In [106]:
X.shape

(20000, 10, 128)

In [107]:
instruments.sort()

In [108]:
class_map = {inst: i for i, inst in enumerate(instruments)}
class_map

{'bass': 0, 'drums': 1, 'guitar': 2, 'piano': 3, 'synthesizer': 4, 'voice': 5}

In [109]:
split_train = pd.read_csv('raw/partitions/split01_train.csv', header=None).squeeze()
split_test = pd.read_csv('raw/partitions/split01_test.csv', header=None).squeeze()

split_train = split_train[split_train.isin(df_sample['sample_key'])]
split_test = split_test[split_test.isin(df_sample['sample_key'])]

In [110]:
print(f'Train: {len(split_train)}\nTest: {len(split_test)}')

Train: 6189
Test: 2028


In [111]:
train_set = set(split_train)
test_set = set(split_test)

In [112]:
idx_train, idx_test = [], []

for idx, n in enumerate(sample_key):
    if n in train_set:
        idx_train.append(idx)
    elif n in test_set:
        idx_test.append(idx)
        
idx_train = np.asarray(idx_train)
idx_test = np.asarray(idx_test)

In [113]:
X_train = X[idx_train]
X_test = X[idx_test]

Y_true_train = Y_true[idx_train]
Y_true_test = Y_true[idx_test]

Y_mask_train = Y_mask[idx_train]
Y_mask_test = Y_mask[idx_test]

In [114]:
print(X_train.shape)
print(X_test.shape)
print(Y_true_train.shape)
print(Y_true_test.shape)
print(Y_mask_train.shape)
print(Y_mask_test.shape)

(6189, 10, 128)
(2028, 10, 128)
(6189, 20)
(2028, 20)
(6189, 20)
(2028, 20)


In [115]:
models = dict()

for instrument in class_map:
    
    inst_num = class_map[instrument]

    train_inst = Y_mask_train[:, inst_num]
    test_inst = Y_mask_test[:, inst_num]
    
    X_train_inst = X_train[train_inst]
    X_train_inst_sklearn = np.mean(X_train_inst, axis=1)
    Y_true_train_inst = Y_true_train[train_inst, inst_num] >= 0.8
    
    X_test_inst = X_test[test_inst]
    X_test_inst_sklearn = np.mean(X_test_inst, axis=1)
    Y_true_test_inst = Y_true_test[test_inst, inst_num] >= 0.8

    clf = RandomForestClassifier(max_depth=8, n_estimators=100, random_state=0)
    
    clf.fit(X_train_inst_sklearn, Y_true_train_inst)

    Y_pred_train = clf.predict(X_train_inst_sklearn)
    Y_pred_test = clf.predict(X_test_inst_sklearn)
    
    print('-' * 52)
    print(instrument)
    print('\tTRAIN')
    print(classification_report(Y_true_train_inst, Y_pred_train))
    print('\tTEST')
    print(classification_report(Y_true_test_inst, Y_pred_test))
    
    models[instrument] = clf

----------------------------------------------------
bass
	TRAIN
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       427
        True       1.00      1.00      1.00        52

    accuracy                           1.00       479
   macro avg       1.00      1.00      1.00       479
weighted avg       1.00      1.00      1.00       479

	TEST
              precision    recall  f1-score   support

       False       0.93      0.99      0.96       143
        True       0.93      0.58      0.72        24

    accuracy                           0.93       167
   macro avg       0.93      0.79      0.84       167
weighted avg       0.93      0.93      0.93       167

----------------------------------------------------
drums
	TRAIN
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       328
        True       1.00      1.00      1.00        89

    accuracy                           1.00  

# MTG-Jamendo

In [120]:
df_jamendo = pd.read_csv('mtg-jamendo/selected-instruments.csv')
df_jamendo.head()

Unnamed: 0,track_id,path,instrument
0,track_0151154,54/151154.mp3,piano
1,track_1142627,27/1142627.mp3,piano
2,track_1400502,02/1400502.mp3,piano
3,track_0491052,52/491052.mp3,piano
4,track_0944140,40/944140.mp3,piano


In [121]:
df_jamendo['instrument'].value_counts()

instrument
acousticguitar    305
electricguitar    270
bass              246
drums             157
piano             150
voice             112
Name: count, dtype: int64

In [122]:
JAMENDO = np.load('mtg-jamendo/new_features.npz', allow_pickle=True)

In [123]:
X, y, track_id = JAMENDO['X'], JAMENDO['Y_mask'], JAMENDO['track_id']

In [124]:
X = copied_array = X[:6151].copy()

In [125]:
instruments = df_jamendo['instrument'].unique()
instruments

array(['piano', 'voice', 'bass', 'electricguitar', 'drums',
       'acousticguitar'], dtype=object)

In [126]:
class_map = {inst: i for i, inst in enumerate(instruments)}
class_map

{'piano': 0,
 'voice': 1,
 'bass': 2,
 'electricguitar': 3,
 'drums': 4,
 'acousticguitar': 5}

## Training MTG-Jamendo

In [127]:
x_split = df_jamendo['track_id']
y_split = df_jamendo['instrument']
x_train, x_test, y_train, y_test = train_test_split(x_split, y_split, test_size=0.2, random_state=42)

In [128]:
print(f'Train: {len(x_train)}\nTest: {len(x_test)}')

Train: 992
Test: 248


In [129]:
train_set = set(x_train)
test_set = set(x_test)

In [130]:
idx_train, idx_test = [], []

for idx, n in enumerate(track_id):
    if n[:-2] in train_set:
        idx_train.append(idx)
    elif n[:-2] in test_set:
        idx_test.append(idx)
        
idx_train = np.asarray(idx_train)
idx_test = np.asarray(idx_test)

In [131]:
X_train = X[idx_train]
X_test = X[idx_test]

Y_train = Y_mask[idx_train]
Y_test = Y_mask[idx_test]

In [132]:
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'Y_train shape: {Y_mask_train.shape}')
print(f'Y_test shape: {Y_mask_test.shape}')

X_train shape: (4922, 10, 128)
X_test shape: (1229, 10, 128)
Y_train shape: (6189, 20)
Y_test shape: (2028, 20)


In [133]:
Y_train[0]

array([False, False, False, False,  True, False, False,  True, False,
       False, False, False, False, False, False, False,  True, False,
       False, False])

In [134]:
print(X_train.shape)
print(X_test.shape)

(4922, 10, 128)
(1229, 10, 128)


In [135]:
models = dict()

for idx, instrument in enumerate(class_map):
    train_inst = Y_train[:, idx]
    test_inst = Y_test[:, idx]
    
    X_train_inst = X_train[train_inst == 1]  
    X_train_other = X_train[train_inst != 1] 
    X_train_inst_sklearn = np.mean(X_train_inst, axis=1)
    X_train_other_sklearn = np.mean(X_train_other, axis=1)
    
    Y_true_train_inst = np.ones(len(X_train_inst), dtype=bool)  
    Y_true_train_other = np.zeros(len(X_train_other), dtype=bool)  
    Y_true_train = np.concatenate((Y_true_train_inst, Y_true_train_other))
    
    X_test_inst = X_test[test_inst == 1]  
    X_test_other = X_test[test_inst != 1]  
    X_test_inst_sklearn = np.mean(X_test_inst, axis=1)
    X_test_other_sklearn = np.mean(X_test_other, axis=1)
    
    Y_true_test_inst = np.ones(len(X_test_inst), dtype=bool)  
    Y_true_test_other = np.zeros(len(X_test_other), dtype=bool)  
    Y_true_test = np.concatenate((Y_true_test_inst, Y_true_test_other))

    clf = RandomForestClassifier(max_depth=8, n_estimators=100, random_state=0)
    
    X_train_combined = np.concatenate((X_train_inst_sklearn, X_train_other_sklearn))
    clf.fit(X_train_combined, Y_true_train)

    Y_pred_train = clf.predict(X_train_combined)
    Y_pred_test = clf.predict(np.concatenate((X_test_inst_sklearn, X_test_other_sklearn)))
    
    print('-' * 52)
    print(instrument)
    print('\tTRAIN')
    print(classification_report(Y_true_train, Y_pred_train))
    print('\tTEST')
    print(classification_report(Y_true_test, Y_pred_test))
    
    models[instrument] = clf

----------------------------------------------------
piano
	TRAIN
              precision    recall  f1-score   support

       False       0.90      1.00      0.94      4403
        True       1.00      0.01      0.02       519

    accuracy                           0.90      4922
   macro avg       0.95      0.50      0.48      4922
weighted avg       0.91      0.90      0.85      4922

	TEST
              precision    recall  f1-score   support

       False       0.88      1.00      0.94      1086
        True       0.00      0.00      0.00       143

    accuracy                           0.88      1229
   macro avg       0.44      0.50      0.47      1229
weighted avg       0.78      0.88      0.83      1229

----------------------------------------------------
voice
	TRAIN
              precision    recall  f1-score   support

       False       0.86      1.00      0.93      4238
        True       1.00      0.02      0.05       684

    accuracy                           0.86 

In [147]:
# models = dict()

# for idx, instrument in enumerate(class_map):
#     train_inst = Y_train[:, idx]
#     test_inst = Y_test[:, idx]
    
#     X_train_inst = X_train[train_inst]
#     X_train_inst_sklearn = np.mean(X_train_inst, axis=1)
#     Y_true_train_inst = np.where(train_inst == 1, True, False)#Y_train[train_inst, idx] >= 0.5
    
#     X_test_inst = X_test[test_inst]
#     X_test_inst_sklearn = np.mean(X_test_inst, axis=1)
#     Y_true_test_inst = Y_test[test_inst, idx] >= 0.5

#     clf = RandomForestClassifier(max_depth=8, n_estimators=100, random_state=0)
    
#     clf.fit(X_train_inst_sklearn, Y_true_train_inst)

#     Y_pred_train = clf.predict(X_train_inst_sklearn)
#     Y_pred_test = clf.predict(X_test_inst_sklearn)
    
#     print('-' * 52)
#     print(instrument)
#     print('\tTRAIN')
#     print(classification_report(Y_true_train_inst, Y_pred_train))
#     print('\tTEST')
#     print(classification_report(Y_true_test_inst, Y_pred_test))
    
#     models[instrument] = clf

In [117]:
def remove_silence(y, sr):
    db = librosa.core.amplitude_to_db(y)
    mean_db = np.abs(db).mean()
    splitted_audio = librosa.effects.split(y=y, top_db=mean_db)

    silence_removed = []

    for inter in splitted_audio:
        silence_removed.extend(y[inter[0]:inter[1]])

    return np.array(silence_removed)

In [118]:
def get_segment(segment, split_duration, y, sr, duration):
    start_time = segment * split_duration
    end_time = min((segment + 1) * split_duration, duration)
    start_sample = int(start_time * sr)
    end_sample = int(end_time * sr)
    return y[start_sample:end_sample]

In [119]:
def split(y, sr, duration, split_duration=10):
    num_segments = math.ceil(duration/split_duration)
    segments = []

    for segment in range(num_segments - 1):
        splitted_track = get_segment(segment, split_duration, y, sr, duration)
        segments.append(splitted_track)

    return segments

In [137]:
audio, sr = librosa.load('raw/drums.mp3')

In [139]:
audio_splitted = remove_silence(audio, sr)
dur = librosa.get_duration(y=audio_splitted, sr=sr)
segments = split(audio_splitted, sr, dur)
time_points, features = vggish.waveform_to_features(segments[0], sr)

2023-06-02 12:59:19.252776: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt


2023-06-02 12:59:22.661459: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:353] MLIR V1 optimization pass is not enabled
2023-06-02 12:59:28.901089: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 201326592 exceeds 10% of free system memory.
2023-06-02 12:59:36.555863: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 67108864 exceeds 10% of free system memory.


In [141]:
ipd.Audio(data=segments[0], rate=sr)

In [142]:
feature_mean = np.mean(features, axis=0, keepdims=True)

for instrument in models:
    clf = models[instrument]
    print('P[{:18s}=1] = {:.3f}'.format(instrument, clf.predict_proba(feature_mean)[0,1]))

P[piano             =1] = 0.135
P[voice             =1] = 0.111
P[bass              =1] = 0.091
P[electricguitar    =1] = 0.070
P[drums             =1] = 0.126
P[acousticguitar    =1] = 0.086


In [143]:
audio, sr = librosa.load('raw/bass.mp3')

In [144]:
audio_splitted = remove_silence(audio, sr)
dur = librosa.get_duration(y=audio_splitted, sr=sr)
segments = split(audio_splitted, sr, dur)
time_points, features = vggish.waveform_to_features(segments[0], sr)

INFO:tensorflow:Restoring parameters from /home/gabs/Documents/tcc/vggish/vggish/_model/vggish_model.ckpt


2023-06-02 13:04:01.067545: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 201326592 exceeds 10% of free system memory.
2023-06-02 13:04:09.052362: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 67108864 exceeds 10% of free system memory.


In [145]:
ipd.Audio(data=segments[0], rate=sr)

In [146]:
feature_mean = np.mean(features, axis=0, keepdims=True)

for instrument in models:
    
    clf = models[instrument]
    
    print('P[{:18s}=1] = {:.3f}'.format(instrument, clf.predict_proba(feature_mean)[0,1]))

P[piano             =1] = 0.093
P[voice             =1] = 0.122
P[bass              =1] = 0.082
P[electricguitar    =1] = 0.078
P[drums             =1] = 0.112
P[acousticguitar    =1] = 0.072
