**Importing libraries**

In [2]:
import random
import os
import numpy as np
from scipy.io import wavfile as wav
from scipy.signal import spectrogram
from librosa.feature import melspectrogram, mfcc
# had to uses pip to install librosa
import matplotlib.pyplot as plt
import itertools

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

**Defining path** 

In [3]:
path = os.getcwd() + '\\data\\'
print(path)

c:\Users\Emili\DSIM_project\ID\data\


In [4]:
def load_data_feature_selection(feature_extractor, train_size, val_split):
  
  # Note: train size must be the same as in load_data that will be used in ML.ipynb

  X_train2 = []
  y_train2 = []

  X_val = []
  y_val = []

  random.seed(10) # For reproducibility

  for speaker in os.listdir(path):
    tracks = os.listdir(path + speaker)
    random.shuffle(tracks) # We don't want the first seconds to systematically
                           # be train and the last to be test
    track_num = 0
    for track in tracks:
      track_num = track_num + 1
      _, signal = wav.read(path + speaker + '/' + track)

      if track_num <=np.floor(train_size*len(tracks)):
        if track_num <=np.floor(train_size*len(tracks)*(1-val_split)):
          X_train2.append(feature_extractor(signal))
          y_train2.append(speaker)
        else:
          X_val.append(feature_extractor(signal))
          y_val.append(speaker)
          
  eps = 0.001
  X_train2 = np.array(X_train2)
  X_train2_mean = X_train2.mean(axis=0)
  X_train2_std = X_train2.std(axis=0)
  X_train2 = (X_train2 - X_train2_mean + eps)/(X_train2_std + eps)
  X_train2 = [row for row in X_train2]
  X_val = [row for row in (np.array(X_val) - X_train2_mean + eps)/(X_train2_std + eps)]


  return X_train2, X_val, y_train2, y_val

## **Features**

**Temporal features**

We are going to use all these features since they are all scalar (-> they don't slow the training down compared to the frequency features)

In [5]:
def energy(input):
    return np.sum((input*1.0)**2, keepdims=True)

In [6]:
def standard_dev(input):
    return np.std(input, keepdims=True)

In [7]:
def zcr (input):
  k=0
  for i in range(0, len(input)-1):
    if input[i]*input[i+1]<0:
      k=k+1

  return np.array(k, ndmin = 1)

**Frequency features**

We are going to apply a selection process for these features since they can be pretty "heavy" computationally-wise

In [8]:
def feats_spectrogram(input, rate = 8000):
  _, _, spec = spectrogram(input, fs = rate)
  out_spec = spec.flatten()
  return out_spec

In [9]:
def feats_melspec(input, rate = 8000):
  input = melspectrogram(y = input*1.0, sr = rate)
  output = input.flatten()
  return output

In [10]:
def feats_mfcc(input, rate = 8000):
  input = mfcc(y = input*1.0, sr = rate)
  output = input.flatten()
  return output

## **Feature selection**

We are going to use the random forest classifier to select the most important features because it is generally a fast and robust method

In [11]:
function_set = {feats_melspec, feats_mfcc, feats_spectrogram}

for combo in range(len(function_set) + 1):
    for subset in itertools.combinations(function_set, combo):
        
        def combo(input):
            return np.concatenate([standard_dev(input),energy(input), zcr(input)] +
                                  [f(input) for f in subset])
        
        X_train2, X_val, y_train2, y_val = load_data_feature_selection(feature_extractor = combo, train_size = 0.8,
                                                     val_split = 0.3)
        
        
        model = RandomForestClassifier(random_state=10)
        model.fit(X_train2, y_train2)

        predictions = model.predict(X_val)
        

        print('\n\nFrequency features used:', [element.__name__ for element in subset])

        # Accuracy (on train2 set)
        print(f"\nAccuracy on train set: {model.score(X_train2, y_train2)}")


        # Accuracy (on val set)
        print(f"\nAccuracy on val set: {model.score(X_val, y_val)}")



Frequency features used: []

Accuracy on train set: 0.9998114274938714

Accuracy on val set: 0.26359649122807016


Frequency features used: ['feats_spectrogram']

Accuracy on train set: 0.9998114274938714

Accuracy on val set: 0.7469298245614036


Frequency features used: ['feats_mfcc']

Accuracy on train set: 0.9998114274938714

Accuracy on val set: 0.8714912280701754


Frequency features used: ['feats_melspec']

Accuracy on train set: 0.9998114274938714

Accuracy on val set: 0.8793859649122807


Frequency features used: ['feats_spectrogram', 'feats_mfcc']

Accuracy on train set: 0.9998114274938714

Accuracy on val set: 0.8114035087719298


Frequency features used: ['feats_spectrogram', 'feats_melspec']

Accuracy on train set: 0.9998114274938714

Accuracy on val set: 0.8460526315789474


Frequency features used: ['feats_mfcc', 'feats_melspec']

Accuracy on train set: 0.9998114274938714

Accuracy on val set: 0.8833333333333333


Frequency features used: ['feats_spectrogram', 'feats_m

Frequency features are essential (without them validation accuracy is only 0.26). Best results are obtained by only using **feats_melspec** as frequency feature

In [13]:
def combo(input):
  return np.concatenate((standard_dev(input),energy(input), zcr(input), feats_melspec(input)))

The number of features exceeds the number of instances, that can cause overfitting. Let's try **cross-validation** (still with the random forest classifier) to make sure that the model is not overfitting (doesn't look like it by using the train-test split but it could happen that the test set randomly has a similar distribution to the train set)

In [14]:
def load_data_cv(feature_extractor, k_folds):

  X= [[] for i in range(k_folds)]
  y= [[] for i in range(k_folds)]
  
  output = []

  random.seed(10) # For reproducibility

  random.shuffle(os.listdir(path))
  for speaker in os.listdir(path):
    tracks = os.listdir(path + speaker)
    random.shuffle(tracks) # We don't want the first seconds to systematically
                           # be train and the last to be test
                           
                           
    split = int(len(tracks)//k_folds)
    k_splits = [split*i for i in range(k_folds)]
    k_splits = k_splits + [len(tracks)]
    track_num = 0
    for track in tracks:
      
      _, signal = wav.read(path + speaker + '/' + track)

      for j in range(k_folds):
        if track_num in range(k_splits[j],k_splits[j+1]):
          X[j].append(feature_extractor(signal))
          y[j].append(speaker)
          
      track_num = track_num + 1
          
  eps = 0.001
  
  for j in range(k_folds):
    
    # Normalizing  input
    X_test = np.array(X[j])
    X_train_list_of_lists = [X[h] for h in range(k_folds) if h!=j]
    X_train = np.array([item for sublist in X_train_list_of_lists for item in sublist])
    X_train_mean = X_train.mean(axis=0)
    X_train_std = X_train.std(axis=0)
    X_train = (X_train - X_train_mean + eps)/(X_train_std + eps)
    X_train = [row for row in X_train]
    X_test = [row for row in (np.array(X_test) - X_train_mean + eps)/(X_train_std + eps)]
    
    y_test = np.array(y[j])
    y_train_list_of_lists = [y[h] for h in range(k_folds) if h!=j]
    y_train = [item for sublist in y_train_list_of_lists for item in sublist]
    
    print(j, '-th split', len(X_train), len(X_test), '\n\n')
    
    output.append([X_train, X_test, y_train, y_test])


  return output

In [15]:
k_folds = 5

cv_data = load_data_cv(combo, k_folds)

print('Loading completed')

train_acc = 0
test_acc = 0

for i in range(k_folds):

    model = RandomForestClassifier(random_state=10)
    model.fit(cv_data[i][0], cv_data[i][2])

    predictions = model.predict(cv_data[i][1])

    train_acc = train_acc + model.score(cv_data[i][0], cv_data[i][2])


    test_acc = test_acc + model.score(cv_data[i][1], cv_data[i][3])
    

print(f"\nAccuracy on train set: {train_acc/k_folds}")
print(f"\nAccuracy on test set: {test_acc/k_folds}")

0 -th split 7607 1895 


1 -th split 7607 1895 


2 -th split 7607 1895 


3 -th split 7607 1895 


4 -th split 7580 1922 


Loading completed

Accuracy on train set: 0.9998947400550735

Accuracy on test set: 0.8885702283516237


As we can see an accuracy of **0.89** is achieved, meaning that 'curse of dimensionality' is not a problem here