In [1]:
# %pip install scikit-learn  # the PyPI package is "scikit-learn"; "sklearn" is only the import name

In [2]:
### TO RUN
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
from scipy import signal
import sounddevice as sd

"Machine learning tools"
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.metrics import recall_score 

"Self created functions"
from utils_ import getclass, getname, gen_allpath, plot_audio, plot_specgram, get_accuracy, show_confusion_matrix, plot_decision_boundaries
from AudioUtil_And_Dataset_student import AudioUtil, SoundDS

In [3]:
# Seed NumPy's global random number generator so that repeated runs draw the same random numbers.
np.random.seed(0)

Useful functions to select, read and play the dataset sounds are provided in the ``utils_`` and ``AudioUtil_And_Dataset`` folder. <br>

As for the H1, you will have to fill some short pieces of code, as well as answer some questions. We already created cells for you to answer the questions to ensure you don't forget it ;). <br>
You will find the zones to be briefly filled  with a ``### TO COMPLETE`` in the cells below.

<font size=6 color=#009999> 2. Training and Evaluating models on audio signals [~1h30-2h] </font> <br>

In [4]:
### TO COMPLETE
# Location of the ESC-50 dataset. Set the ESC50_PATH environment variable to point
# to your own copy without editing the notebook; otherwise the default below is used.
path2dataset = os.environ.get("ESC50_PATH", "C:/Users/MaudTS/Documents/ESC-50") # Write your path to the dataset here!
allclassnames, allpath_mat = gen_allpath(path2dataset)

"Select only some classes for the classification"
sel_class = [12,14,40,41,49]
nonsel_class = np.delete(np.arange(allpath_mat.shape[0]), sel_class)

# allpath_mat holds one row of sound paths per class: keep the selected rows on one
# side and all the remaining rows on the other.
allpath_sel = allpath_mat[sel_class, :]
allpath_nonsel = allpath_mat[nonsel_class, :]
classnames = np.array([allclassnames[idx] for idx in sel_class])

# Selected classes are re-indexed 0..len(sel_class)-1.
sel_class_ids = np.arange(len(sel_class))
# One class id per selected sound, in the same (class-major) order as data_path.
# The number of sounds per class is read from the data instead of being hard-coded.
all_sel_class_ids = np.repeat(sel_class_ids, allpath_sel.shape[1])
data_path = allpath_sel.reshape(-1)

print('The selected classes are {}'.format(classnames))

FileNotFoundError: [WinError 3] Le chemin d’accès spécifié est introuvable: 'C:/Users/MaudTS/Documents/ESC-50'

In H1, it was not made explicit what we choose as input for the classification model, a.k.a. ``feature vector`` (it was shown in the illustration). <br>
The objective is, on the transmitter side, to compute a feature vector containing enough information about the audio signal we want to classify, but not too much in order to limit the data which has to be transmitted wirelessly. This is why in H1 we implemented the ``Hz2Mel`` conversion, a very simple compression of the frequency content. <br>
The feature vector we will use here simply consists in taking the first 10 columns of the melspectrogram, corresponding to ~0.5s, then reshaping it as a vector. This means each feature vector contains ``200`` coefficients, with 20 mels for 10 columns.  <br>

Once the feature vector has been recovered on the receiver side, we can apply any computation on it to guess the right class this sound belongs to. For today, we will simply reuse the simple KNN and LDA classifiers and look at what we already get.  

<font size=5 color=#009999> 2.1. Creation of the dataset </font> <br>

``SoundDS`` is a class defined in ``AudioUtil_And_Dataset.py``. <br>
The functions ``__len__`` and ``__getitem__`` are implemented, meaning you can call :
- ``len(myds)`` to get the number of sounds in it.
- ``myds[i][j]`` to get the melspectrogram of the ``j``-th sound from class ``i``. <br>

Two other useful functions are provided:
- ``get_audiosignal`` returning the temporal audiosignal at the specified index.
- ``display`` playing the sound and showing the associated mel-spectrogram at the specified index.

<font size=3 color=#FF0000> Important :</font> <br>
Before being able to run the cells below, you will have to reuse your functions from H1 to fill the missing lines in ``AudioUtil_And_Dataset.py`` at ``###TO COMPLETE`` locations.

In [None]:
### TO RUN
"Creation of the dataset"
myds = SoundDS(
    all_sel_class_ids,
    Nft=512,
    nmel=20,
    duration=750,
    shift_pct=0.0,
    data_path=data_path,
    allpath_mat=allpath_mat,
)

"Some attributes..."
# Evaluated only to list the attributes that exist on the dataset; the values are discarded.
(
    myds.nmel,
    myds.allpath_mat,
    myds.class_ids,
    myds.data_path,
    myds.duration,
    myds.shift_pct,
    myds.sr,
    myds.data_aug,
    myds.ncol,
)

# Play one sound of the dataset and show its mel-spectrogram.
idx = 51
myds.display(idx)

Running the cell above many times, you should notice that it is always the beginning of the sound that is taken to create the feature vector. ``shift_pct``, meaning *shift percentage*, rolls the audio signal by a random factor upper bounded by this value. Change ``shift_pct`` to ``0.2`` and observe what happens.
### ???

In [None]:
### TO RUN
"Random split of 70:30 between training and validation"
train_pct = 0.7

featveclen = len(myds[0][0]) # number of items in a feature vector
num_items = len(myds) # number of sounds in the dataset
num_sounds = len(myds.allpath_mat[0,:]) # number of sounds in each class
num_classes = int(num_items/num_sounds) # number of classes
num_learn = round(num_sounds * train_pct) # number of sounds among num_sounds for training

data_aug_factor = 1
sel_class_ids = np.arange(len(sel_class))
# X is filled pass by pass: row s*num_classes*num_sounds + i holds item i of pass s.
# The labels of one pass (num_sounds consecutive items per class) are therefore tiled
# once per pass, which keeps rows and labels aligned for any data_aug_factor.
sel_class_ids_aug = np.tile(np.repeat(sel_class_ids, num_sounds), data_aug_factor)

"Compute the matrixed dataset, this takes some seconds, but you can then reload it by commenting this loop and decommenting the np.load below"
X = np.zeros((data_aug_factor*num_classes*num_sounds, featveclen))
for s in range(data_aug_factor):
    for i, featvec in enumerate(myds):
        # featvec[0] is the feature vector of the i-th item of the dataset
        X[s*num_classes*num_sounds+i,:] = featvec[0]
np.save("feature_matrix_2D.npy", X)

# X = np.load("feature_matrix_2D.npy")

"Labels"
y = sel_class_ids_aug.copy()
assert len(y) == X.shape[0], "each row of the feature matrix needs exactly one label"

print('Shape of the feature matrix : {}'.format(X.shape))
print('Number of labels : {}'.format(len(y)))

print('Remember the convention shown for the toy example, the feature vectors are arranged on the rows.')

<font size=5 color=#009999> 2.2. Classification </font> <br>

For now we have only prepared the dataset, it remains to feed it to the classifiers. <br>

In [None]:
### TO RUN
K = 10 # Number of neighbours for the KNN

# The parameters are written out explicitly once so that you know they exist and can be changed.
model_knn = KNeighborsClassifier(
    n_neighbors=K,
    weights='distance',
    algorithm='auto',
    metric='minkowski',
)

# Same idea for the LDA classifier: every parameter is spelled out once.
model_lda = LDA(
    solver='svd',
    shrinkage=None,
    priors=None,
    n_components=None,
    store_covariance=False,
    tol=0.0001,
    covariance_estimator=None,
)

As for the toy example, we keep the ``accuracy`` and ``confusion matrix`` as performance metrics.

In [None]:
### TO RUN
"Shuffle then split the dataset into training and testing subsets"
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y
) # random_state=1
print('Shape of the training matrix : {}'.format(X_train.shape))
print('Number of training labels : {}'.format(len(y_train)))

# KNN: fit on the training subset, then score on the held-out subset.
model_knn.fit(X_train, y_train)
prediction_knn = model_knn.predict(X_test)
accuracy_knn = get_accuracy(prediction_knn, y_test)

# LDA: same procedure.
model_lda.fit(X_train, y_train)
prediction_lda = model_lda.predict(X_test)
accuracy_lda = get_accuracy(prediction_lda, y_test)

# Report each classifier: accuracy first, then its confusion matrix.
print('Accuracy of KNN with fixed train/validation sets : {:.1f}%'.format(100*accuracy_knn))
show_confusion_matrix(prediction_knn, y_test, classnames)
print('Accuracy of LDA with fixed train/validation sets : {:.1f}%'.format(100*accuracy_lda))
show_confusion_matrix(prediction_lda, y_test, classnames)

**Questions**: 
- What would be the expected accuracy if the label predictions were picked at random?
- What do you observe in this confusion matrix? Reapply the ``train_test_split`` and tell if your observations are robust.

### TO COMPLETE
1. 20% 
2. The diagonal values are not high compared to the other values. The results obtained are not very good. When we reapply the ``train_test_split``, the accuracy obtained varies a lot. Our observations are thus not robust. ``train_test_split`` permet de séparer les données sélectionnées pour le LEARNING en training set et validation set et le fait de manière random. Cela a donc comme conséquence qu'on fait le training sur des données différentes à chaque fois, résultant en des résultats de précision différente. 
ATTENTION : KNN ne choisit presque jamais 'HANDSAW'. Raison : voir la division spatiale des classes, la partie handsaw est toute petite ET au centre de toutes les autres. Le classifier va donc facilement classifier les points handsaw dans de mauvaises catégories. 
On peut d'ailleurs voir plus bas que la précison des deux classificateurs KNN et LDA peut varier fortement. En effet, elles ont une grande variance (std)
Une fois que nous faisons une cross validation, on ne devrait plus avoir ce problème. 
On peut voir que la précision au KNN ne varie plus beaucoup avec le hyperparamètre K. En effet les clusters sont bien répartis (20 dimensions). Dépend de la nature du data.

Play with the index ``ind`` to pick feature vectors in the dataset ``myds``, listen to the audio associated to the feature vector, and check if you would have been able to predict the right class by your own. Then compare with the prediction given by your classifier.

In [None]:
### TO RUN
# Index of the dataset item to inspect.
ind = 1
myds.display(ind)
# Flatten the first element of the item into a 1-D feature vector.
thisfv = myds[ind][0].reshape(-1)
# predict() is given a list holding this single vector; note that this overwrites
# the prediction_knn computed on the whole test set in the previous cell.
prediction_knn = model_knn.predict([thisfv])
print('Class predicted by the model:', classnames[prediction_knn][0]) 

Furthermore, when training a model and comparing different settings, there is a risk that we end up choosing optimal parameters that only give good results on our specific training and validation sets, but ``do not generalize well to additional data``. This is called ``overfitting on the validation set``. To alleviate this, we can perform ``cross-validation (CV)``. A basic approach named ``K-fold CV`` involves partitioning the dataset into ``K`` "folds" (subsets) and repeatedly applying the following procedure:

- Train the model using `K-1` folds as the training data.
- Test the model using the last fold as the validation data.

The overall performance on each fold is then averaged to obtain the final performance metrics.

In [None]:
### TO RUN
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits,shuffle=True)

# One accuracy per fold and per classifier.
accuracy_knn = np.zeros((n_splits,))
accuracy_lda = np.zeros((n_splits,))
# idx_learn / idx_val of the last fold stay defined after the loop; later cells reuse them.
for k, (idx_learn, idx_val) in enumerate(kf.split(X_train, y_train)):
    model_knn.fit(X_train[idx_learn], y_train[idx_learn])
    prediction_knn = model_knn.predict(X_train[idx_val])
    accuracy_knn[k] = get_accuracy(prediction_knn, y_train[idx_val])

    model_lda.fit(X_train[idx_learn], y_train[idx_learn])
    prediction_lda = model_lda.predict(X_train[idx_val])
    accuracy_lda[k] = get_accuracy(prediction_lda, y_train[idx_val])

# The number of folds in the messages follows n_splits instead of being hard-coded.
print('Mean accuracy of KNN with {}-Fold CV: {:.1f}%'.format(n_splits, 100*accuracy_knn.mean()))
print('Std deviation in accuracy of KNN with {}-Fold CV: {:.1f}%'.format(n_splits, 100*accuracy_knn.std()))

print('Mean accuracy of LDA with {}-Fold CV: {:.1f}%'.format(n_splits, 100*accuracy_lda.mean()))
print('Std deviation in accuracy of LDA with {}-Fold CV: {:.1f}%'.format(n_splits, 100*accuracy_lda.std()))

In the analysis above, we fixed ``K`` for the KNN. This is called a ``hyperparameter`` of the classification model. Let us now have a look at the effect of this hyperparameter!  <br>

In [None]:
### TO RUN
Ks = np.arange(6, 50, 1)
# accuracies_knn[i, k] is the accuracy obtained with Ks[i] neighbours on fold k.
accuracies_knn = np.zeros((len(Ks), n_splits))
for i, K in enumerate(Ks):
    model_knn = KNeighborsClassifier(n_neighbors=K, weights='distance')
    for k, (idx_learn, idx_val) in enumerate(kf.split(X_train, y_train)):
        model_knn.fit(X_train[idx_learn], y_train[idx_learn])
        prediction_knn = model_knn.predict(X_train[idx_val])
        accuracies_knn[i, k] = get_accuracy(prediction_knn, y_train[idx_val])

# Aggregate over the folds.
means_knn = np.mean(accuracies_knn, axis=1)
stds_knn = np.std(accuracies_knn, axis=1)

"Plot"
fig, ax = plt.subplots(figsize=(6,3))
ax.plot(Ks, means_knn, '.-b', label='KNN')
# Shaded band: mean accuracy plus/minus one standard deviation across folds.
ax.fill_between(Ks, means_knn-stds_knn, means_knn+stds_knn, alpha=0.2, color='b')
ax.set_ylim(0, 1)
ax.set_xlabel('K')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

<font size=5 color=#009999> 2.3. Scale mismatch and countermeasure </font> <br>

In real conditions, you will most probably have a different scale between the feature vectors used for training (in simulation) and the ones you feed in your model to make predictions.
This scale mismatch between model training and prediction is difficult to prevent because it depends on multiple factors such as the audio source power, its distance to the microphone, the telecommunication distance. <br>

Play with the ``dB_mismatch`` variable here below and observe its effect on the confusion matrix.

In [None]:
### TO RUN
dB_mismatch = -10 # Play with this value default : 0 
# dB_mismatch lets us check whether a change in amplitude (increase or decrease) affects the classification performance.
# The answer is YES, so the feature vectors have to be normalised.

# Train on the unscaled learning fold ...
model_knn = KNeighborsClassifier(n_neighbors=10)
model_knn.fit(X_train[idx_learn], y_train[idx_learn])

# ... and predict on the validation fold multiplied by the amplitude factor 10**(-dB_mismatch/20).
X_val_scaled = X_train[idx_val]*10**(-dB_mismatch/20)
prediction_knn = model_knn.predict(X_val_scaled)

show_confusion_matrix(prediction_knn, y_train[idx_val], classnames)
accuracy_knn = get_accuracy(prediction_knn, y_train[idx_val])
print('Accuracy of KNN: {:.1f}%'.format(100*accuracy_knn))

The simplest countermeasure we can think of is to normalise the feature vector (i.e. unitize its norm) prior to use, both for training and testing. Remember how this normalization could be visualized in ``H3a_toy_student.ipynb`` <br>
Play again with the ``dB_mismatch`` variable here below and observe its effect on the confusion matrix.

In [None]:
### TO RUN
dB_mismatch = -20 # Play with this value --> does NOT change anything

# Validation fold: apply the amplitude mismatch, then bring every row back to unit norm.
X_val_scaled = X_train[idx_val]*10**(-dB_mismatch/20)
X_val_normalised = X_val_scaled/np.linalg.norm(X_val_scaled, axis=1, keepdims=True)

# Learning fold: every row (feature vector) is divided by its own norm before training.
X_learn_normalised = X_train[idx_learn]/np.linalg.norm(X_train[idx_learn], axis=1, keepdims=True)
model_knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
model_knn.fit(X_learn_normalised, y_train[idx_learn])

prediction_knn = model_knn.predict(X_val_normalised)
show_confusion_matrix(prediction_knn, y_train[idx_val], classnames)
accuracy_knn = get_accuracy(prediction_knn, y_train[idx_val])
print('Accuracy of KNN: {:.1f}%'.format(100*accuracy_knn))

**Question**: 
- What will happen with this normalisation countermeasure when there is no sound around the microphone? Is this desirable? How could you deal with it?

### TO COMPLETE
Le bruit peut être considéré comme un signal. On va normaliser le bruit, ce qui n'est pas désirable. Comment résoudre ce problème ? Threshold (seuil).
Si on estime qu'on entend rien (énergie, amplitude : seuil sur l'amplitude (chance non nulle que le bruit ait une grande valeur à un moment donné donc travailler avec l'énergie est une meilleure idée)) : alors on ne normalize pas. 

<font size=5 color=#009999> 2.4. Dimensionality reduction </font> <br>

It is sometimes good practice to reduce the dimensionality of a signal in order to get the main components of its distribution. A motivation is that usual norms behave counter-intuitively in high dimension. To reduce the dimensionality, we will use the ``Principal component analysis (PCA)`` proposed by sklearn. See the [associated Wikipedia page](https://en.wikipedia.org/wiki/Principal_component_analysis). Recall: the PCA consists in reducing the dimensionality of data vectors encoded in $\boldsymbol X \in \mathbb R^{d\times N}$ to only $p \ll d$ dimensions as

$$
    \boldsymbol Y = \boldsymbol V_p^\top \boldsymbol X \in \mathbb R^{p\times N},
$$

where the SVD of the empirical covariance matrix (of the centred data, averaged over the $N$ samples) writes as $\hat{\boldsymbol\Sigma}_{\boldsymbol X} = \frac{1}{N} \boldsymbol{XX}^\top = \boldsymbol{U\Sigma V}^\top$, and $\boldsymbol V_p$ is the subselection of the first $p$ columns of $\boldsymbol V$. 

For our application, reducing the dimensionality of the data can be helpful for compressing the packet size to be transmitted wirelessly. Indeed, once learned during training, $\boldsymbol V_p$ can be hardcoded on the transmitter side.

Starting with a PCA to 2D for visualization, see how hard it is to separate the classes.

In [None]:
### TO RUN
n = 2 # Number of principal components kept

# The projection is learned on the learning fold only and then applied to the validation fold.
pca = PCA(n_components=n, whiten=True)
X_learn_reduced = pca.fit_transform(X_train[idx_learn])
X_val_reduced = pca.transform(X_train[idx_val])

print('Shape of the reduced training matrix : {}'.format(X_learn_reduced.shape))

# KNN on the reduced features
K = 10
model_knn = KNeighborsClassifier(n_neighbors=K)
model_knn.fit(X_learn_reduced, y_train[idx_learn])
prediction_knn = model_knn.predict(X_val_reduced)
accuracy_knn = get_accuracy(prediction_knn, y_train[idx_val])

# LDA on the reduced features
model_lda = LDA()
model_lda.fit(X_learn_reduced, y_train[idx_learn])
prediction_lda = model_lda.predict(X_val_reduced)
accuracy_lda = get_accuracy(prediction_lda, y_train[idx_val])

# Two side-by-side panels: KNN on the left, LDA on the right.
fig = plt.figure()
axs = [fig.add_axes([left, 0.0, 0.4, 0.9]) for left in (0.0, 0.6)]
plot_decision_boundaries(
    X_learn_reduced, y_train[idx_learn],
    ax=axs[0], model=model_knn, legend=classnames, title='KNN',
)
plot_decision_boundaries(
    X_learn_reduced, y_train[idx_learn],
    ax=axs[1], model=model_lda, legend=classnames, title='LDA',
)
plt.show()

**Question**: 
- From the decision boundaries shown here above, can you explain why the ``handsaw`` class is less often chosen than the other classes for the ``KNN`` classifier?

Parce que la classe Handsaw se trouve au centre des autres classes. Ce qui n'est pas le cas en LDA. EN 2D

In [None]:
### TO RUN
n = 5 # Number of principal components kept

# The projection is learned on the learning fold only and then applied to the validation fold.
pca = PCA(n_components=n, whiten=True)
X_learn_reduced = pca.fit_transform(X_train[idx_learn])
X_val_reduced = pca.transform(X_train[idx_val])

print('Shape of the reduced learning matrix : {}'.format(X_learn_reduced.shape))

# KNN on the reduced features
K = 10
model_knn = KNeighborsClassifier(n_neighbors=K, weights='distance')
model_knn.fit(X_learn_reduced, y_train[idx_learn])
prediction_knn = model_knn.predict(X_val_reduced)
accuracy_knn = get_accuracy(prediction_knn, y_train[idx_val])

# LDA on the reduced features
model_lda = LDA()
model_lda.fit(X_learn_reduced, y_train[idx_learn])
prediction_lda = model_lda.predict(X_val_reduced)
accuracy_lda = get_accuracy(prediction_lda, y_train[idx_val])

# Report each classifier: accuracy first, then its confusion matrix.
print('Accuracy of the KNN : {:.1f}%'.format(100*accuracy_knn))
show_confusion_matrix(prediction_knn, y_train[idx_val], classnames)
print('Accuracy of the LDA : {:.1f}%'.format(100*accuracy_lda))
show_confusion_matrix(prediction_lda, y_train[idx_val], classnames)

<font size=5 color=#009999> 2.5. Analysis of the hyperparameters </font> <br>

Finally, we can inspect the influence of ``hyperparameters`` as we did for the toy example. <br>
Here we consider both ``K`` and the number of principal components ``n``.

In [None]:
### TO RUN
Ks = np.arange(1,10)
n_comps = np.arange(2, 15) # number of principal components kept for the PCA
# accuracies_knn[i, j]: accuracy for Ks[i] neighbours and n_comps[j] components.
# accuracies_lda[j]: accuracy of the LDA for n_comps[j] components.
# Both are summed over the folds in the loop and averaged afterwards.
accuracies_knn = np.zeros( (len(Ks), len(n_comps)) )
accuracies_lda = np.zeros(len(n_comps))

for j, n in enumerate(n_comps):
    for idx_learn, idx_val in kf.split(X_train,y_train):
        # The PCA is learned on the learning fold only, then applied to the validation fold.
        pca = PCA(n_components=n,whiten=True)
        X_learn_reduced = pca.fit_transform(X_train[idx_learn])
        X_val_reduced = pca.transform(X_train[idx_val])

        # Both classifiers must be trained and evaluated on the REDUCED features,
        # otherwise the number of components has no influence on the results.
        for i,K in enumerate(Ks):
            model_knn = KNeighborsClassifier(n_neighbors=K)
            model_knn.fit(X_learn_reduced, y_train[idx_learn])
            prediction_knn = model_knn.predict(X_val_reduced)
            accuracies_knn[i,j] += get_accuracy(prediction_knn, y_train[idx_val])

        model_lda = LDA()
        model_lda.fit(X_learn_reduced, y_train[idx_learn])
        prediction_lda = model_lda.predict(X_val_reduced)
        accuracies_lda[j] += get_accuracy(prediction_lda, y_train[idx_val])

# Turn the per-fold sums into means.
accuracies_knn /= n_splits
accuracies_lda /= n_splits

fig = plt.figure(figsize=(10,4))
axs = [fig.add_axes([0.0, 0.0, 0.4, 0.9]), fig.add_axes([0.6, 0.0, 0.4, 0.9])]

im0 = axs[0].imshow(100*accuracies_knn, cmap='jet', origin='lower')
cbar = fig.colorbar(im0, ax=axs[0])
cbar.set_label('Accuracy (%)')
axs[0].set_xlabel('n_PCA')
axs[0].set_ylabel('K')
# imshow indexes pixels from 0: relabel the ticks with the actual parameter values.
axs[0].set_xticks(list(np.arange(len(n_comps))))
axs[0].set_xticklabels(list(n_comps))
axs[0].set_yticks(list(np.arange(len(Ks))))
axs[0].set_yticklabels(list(Ks))
axs[0].set_title('KNN')

# Plot against the actual number of components so that the x axis matches its label.
axs[1].plot(n_comps, accuracies_lda*100)
axs[1].set_xlabel('n_PCA')
axs[1].set_ylabel('Accuracy (%)')
axs[1].set_title('LDA')
plt.show()

**Question**: 
- Do you observe some dependency of the accuracy on these parameters? If so, which one(s)? If not, discuss what it tells about the considered model. 

Les points des différentes categories sont initialement répartis d'une manière si random (tous mélangés les uns aux autres) que si le k augmente de trop, le classificateur prendra juste des éléments au hasard et ca sera pas concluant. Mais du coup au lieu de vouloir optimiser le classificateur en lui meme, mieux vaut d'abord "nettoyer" nos données initiales (améliorer la répartition des points au sein de l'espace). 

<font size=5 color=#009999> 2.6. Augmenting the data </font> <br>

In order to make our classifier more robust to some common transformations of the audio signal such as ``time shift``, ``scaling`` or ``AWGN``, we need to feed it with such transformations. A popular approach is to create new feature vectors based on transformed versions of the sounds from the original dataset; this is called ``data augmentation``. Data augmentation is also often used when there is little data to train a model. <br>

The functions to augment your data are written in ``AudioUtil_And_Dataset.py``, we already implemented ``time_shift``, ``echo`` and ``spectro_aug_timefreq_masking`` for you. Try to implement ``scaling``, ``add_noise``, ``filter``, ``add_bg`` and even more data augmentation techniques if you want, and check their working in the cell below. <br>

<u>Tip</u>: to avoid restarting the notebook kernel for each modification, you can temporarily insert the ``AudioUtil`` class in a new cell and make your tests until it is working as expected. 

In [None]:
### TO RUN
myds.data_aug = None # Ensure

sound = allpath_mat[14,29]
audio = AudioUtil.open(sound)

AudioUtil.play(audio)
# Resample, then pad/truncate the result: this is the reference signal for every transformation below.
audio2 = AudioUtil.pad_trunc(AudioUtil.resample(audio, 11025), 5000)

# One transformed copy of the reference signal per augmentation technique.
audio3 = AudioUtil.time_shift(audio2, 0.4)
audio4 = AudioUtil.scaling(audio2)
audio5 = AudioUtil.add_noise(audio2, sigma=1e-2)
audio6 = AudioUtil.echo(audio2, 3)
audio7 = AudioUtil.add_bg(audio2, allpath_nonsel)

melspec = AudioUtil.melspectrogram(audio2, fs2=11025)
melspec2 = AudioUtil.spectro_aug_timefreq_masking(melspec, max_mask_pct=0.1)

"Plot"
fig = plt.figure(figsize=(15,4))
ax1, ax2, ax3 = (
    fig.add_axes([left, 0.05, 0.28, 0.9]) for left in (0.05, 0.38, 0.7)
)

# Each curve is shifted up by one more unit than the previous one so that they do not overlap.
curves = [
    (audio2, 'Original'),
    (audio3, 'Time shifted'),
    (audio4, 'Rescaled'),
    (audio5, 'Noisy'),
    (audio6, 'With echos'),
    (audio7, 'With background sound'),
]
for offset, (curve, curve_label) in enumerate(curves):
    ax1.plot(curve[0] + offset, label=curve_label)
ax1.legend()

# Value passed as tf to both spectrogram plots.
tf_plot = len(audio2[0])/audio2[1]
plot_specgram(melspec, ax2, is_mel=True, title=getname(sound), tf=tf_plot)
ax2.set_title('Melspectrogram')
plot_specgram(melspec2, ax3, is_mel=True, title=getname(sound), tf=tf_plot)
ax3.set_title('Corrupted melspectrogram')
plt.show()

We can now create a new augmented dataset and observe if the classification results improve. 

In [None]:
### TO RUN
myds.mod_data_aug(['add_bg'])
sel_class_ids = np.arange(len(sel_class))
# X_aug is filled pass by pass: row s*num_classes*num_sounds + i holds item i of pass s.
# The labels of one pass (num_sounds consecutive items per class) must therefore be
# TILED once per pass. Repeating each class id num_sounds*data_aug_factor times would
# attach wrong labels to most rows as soon as data_aug_factor > 1.
sel_class_ids_aug = np.tile(np.repeat(sel_class_ids, num_sounds), myds.data_aug_factor)

"Compute the matrixed dataset, this takes some seconds, but you can then reload it by commenting this loop and decommenting the np.load below"
X_aug = np.zeros((myds.data_aug_factor*num_classes*num_sounds, featveclen))
for s in range(myds.data_aug_factor):
    for i, featvec in enumerate(myds):
        # featvec[0] is the feature vector of the i-th item of the dataset
        X_aug[s*num_classes*num_sounds+i,:] = featvec[0]
np.save("feature_matrix_2D_aug.npy", X_aug)

# X_aug = np.load("feature_matrix_2D_aug.npy")

"Labels"
y_aug = sel_class_ids_aug
assert len(y_aug) == X_aug.shape[0], "each row of the feature matrix needs exactly one label"

print('Shape of the feature matrix : {}'.format(X_aug.shape))
print('Number of labels : {}'.format(len(y_aug)))

In [None]:
### TO RUN
K = 10 # Number of neighbours
model = KNeighborsClassifier(n_neighbors=K)

# NOTE(review): when the augmented matrix holds several passes over the same sounds,
# shuffled folds can put versions of one sound on both the training and the test side,
# which would make the accuracy optimistic - confirm, and consider splitting per sound.
accuracy_aug = np.zeros((n_splits,))
for k, (idx_train, idx_test) in enumerate(kf.split(X_aug, y_aug)):
    model.fit(X_aug[idx_train], y_aug[idx_train])
    prediction_aug = model.predict(X_aug[idx_test])
    accuracy_aug[k] = get_accuracy(prediction_aug, y_aug[idx_test])

# The number of folds in the messages follows n_splits instead of being hard-coded.
print('Mean accuracy with {}-Fold CV: {:.1f}%'.format(n_splits, 100*accuracy_aug.mean()))
print('Std deviation in accuracy with {}-Fold CV: {:.1f}%'.format(n_splits, 100*accuracy_aug.std()))
# Confusion matrix of the LAST fold only.
show_confusion_matrix(prediction_aug, y_aug[idx_test], classnames)

**Question**:
Can you see an improvement of the classification result compared to the non augmented dataset? <br>
Try to interpret your answer by thinking about the distribution of points in a data space (as with the toy example), what does it imply to augment the data in terms of distribution of points in the data space?

In [None]:
### TO COMPLETE
# Answer the question above

<font size=5 color=#009999> 2.7. Getting it all together </font> <br>

Now that some aspects to be considered during the model training and analysis have been presented, it remains to train and save a final model that will be used for further predictions.

In [None]:
### TO COMPLETE

# [1] Create dataset and split it.
# (optional) with data augmentation
myds = SoundDS(all_sel_class_ids, Nft=512, nmel=20, duration=750, shift_pct=0.0, data_path=data_path, allpath_mat=allpath_mat)

# X and y are the feature matrix and the labels computed earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y) # random_state=1

# [2] (optional) Data normalization
# Every feature vector (row) is divided by its own norm. The SAME transformation has to be
# applied to everything the model sees: learning, validation and test data.
X_learn_normalised = X_train/np.linalg.norm(X_train, axis=1, keepdims=True)
X_test_normalised = X_test/np.linalg.norm(X_test, axis=1, keepdims=True)

# [3] (optional) dimensionality reduction. NOT DONE

# [4] Model training and selection.
K = 10
model = KNeighborsClassifier(n_neighbors=K, weights='distance', algorithm='auto', metric='minkowski')
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits,shuffle=True)
accuracy_knn = np.zeros((n_splits,))
for k, (idx_learn, idx_val) in enumerate(kf.split(X_train, y_train)):
    model.fit(X_learn_normalised[idx_learn], y_train[idx_learn])
    # Validate on normalised vectors too: the model was trained on normalised data.
    prediction_knn = model.predict(X_learn_normalised[idx_val])
    accuracy_knn[k] = get_accuracy(prediction_knn, y_train[idx_val])

print('Mean accuracy of KNN with {}-Fold CV: {:.1f}%'.format(n_splits, 100*accuracy_knn.mean()))
print('Std deviation in accuracy of KNN with {}-Fold CV: {:.1f}%'.format(n_splits, 100*accuracy_knn.std()))

# After the loop the model has only seen the learning part of the last fold:
# refit it on the whole training set before saving and testing it.
model.fit(X_learn_normalised, y_train)

# [5] Save the trained model, possibly the PCA.
filename = 'model.pickle'
with open(filename, 'wb') as model_file: # the context manager closes the file after writing
    pickle.dump(model, model_file)

# [6] Evaluate the model
prediction = model.predict(X_test_normalised)
show_confusion_matrix(prediction, y_test, classnames)
accuracy = get_accuracy(prediction, y_test)
# average=None returns one recall value per class instead of a single number.
recall = recall_score(y_test, prediction, average=None)
print('Accuracy of KNN: {:.1f}%'.format(100*accuracy))
print(recall)

<font size=5 color=#009999> 2.8. Debriefing </font> <br>
**Questions** : 

1) From what we have done in this notebook, can you already identify some weaknesses in the feature vector computation and classification pipeline? You can make a list here below, and possibly write some short ideas for improvement. This will help you later :)
2) Do you remember what is the time duration of a feature vector? What happens if no sound is produced during the acquisition time of a feature vector?

In [None]:
### TO COMPLETE
# Answer the question above