In [None]:
# In this notebook we will optimize a softmax classifier

# We will work with the PCVC dataset

# go to this website and click on Download
# https://www.kaggle.com/sabermalek/pcvcspeech

# Then put the files in the same directory as this .ipynb file

In [None]:
# This cell loads and processes the data
# you do not have to do anything here

# The libraries needed
import os
import scipy.io
from scipy.signal.windows import hann
import numpy as np
import librosa

# list all the MATLAB files that are part of the dataset.
# Match on the ".mat" extension (a plain substring test would also pick up
# unrelated files whose name merely contains "mat") and sort the names so
# that the row order of "data" is the same on every machine and every run.
all_mats = sorted(f for f in os.listdir('.') if f.lower().endswith('.mat'))
if not all_mats:
  raise FileNotFoundError(
      "No .mat files were found in the current directory. Download the "
      "dataset and put its files in the same directory as this notebook.")

# load the time-series data in each of the data files
# and store them in a numpy array
data = [scipy.io.loadmat(mat)['x'] for mat in all_mats]
data = np.concatenate(data,axis=1)

# reshape the data so that we have a matrix where each
# row is a datapoint (i.e. a vowel-consonant utterance)
_,nreps,nvow,nsamps=data.shape
data = np.reshape(data,(nreps*nvow,nsamps),order='F')

# window the data to reduce the number of samples
# and center the window around the vowel
data = data[:,5000:15000]*hann(10000)

# finally, resample the data to have a sampling rate of 16000.
# The rates are passed by keyword: recent librosa releases no longer
# accept them as positional arguments.
orig_sr = 48000
sr = 16000
data = np.array([librosa.resample(d, orig_sr=orig_sr, target_sr=sr) for d in data])

print("The shape of the data is", data.shape)

The shape of the data is (1794, 3334)


In [None]:
# now listen to a few example datapoints
# remember, each row of "data" is a datapoint
# (change the row index below to hear a different one)
from IPython.display import Audio
Audio(data=data[0,:], rate=sr)

In [None]:
# "data" has the same number of datapoints for each vowel.
# In Farsi, there are 6 vowels. Considering the number of
# datapoints in "data", how many points do you have per vowel?

ndatapoints_per_vowel = # ?

# now, the first ndatapoints_per_vowel rows in "data" contain
# datapoints that correspond to the vowel "a". The next
# ndatapoints_per_vowel rows correspond to the vowel "i", etc.

# we need to create a "labels" matrix with the same number of rows
# as "data", and six columns. In "labels", each row has a single 1 and
# the remaining entries are zeros. The location of the number 1
# indicates which vowel the corresponding row of "data" has
# time-series data for

labels = # your code here

print("the shape of labels is",labels.shape)

the shape of labels is (1794, 6)


In [None]:
# now randomly select ~5% of rows in "data" to be the test set
# Hint: you can use the np.random.choice function (with replace=False) 
# and use the first ~5% of its output to index out the test set
# the remaining datapoints will be the "development" set

all_idx = np.random.choice(# your code here

data_ts = # your code here
labels_ts = # your code here
data_dv = # your code here
labels_dv = # your code here

print("The shape of the development data is ", data_dv.shape)
print("The shape of the development labels is ", labels_dv.shape)
print("The shape of the testing data is ", data_ts.shape)
print("The shape of the testing labels is ", labels_ts.shape)

The shape of the development data is  (1704, 3334)
The shape of the development labels is  (1704, 6)
The shape of the testing data is  (90, 3334)
The shape of the testing labels is  (90, 6)


In [None]:
# now we randomly select ~15% of the development
# data to be your validation set, and the rest to be your training
# set. In this homework we will NOT to k-fold cross-validation.

# Q: why are we using only one fold as the validation set? Why are we not doing k-folds?
# A:

all_idx = np.random.choice(# your code here

Xvl = # your code here
Yvl = # your code here
Xtr = # your code here
Ytr = # your code here

print("The shape of the taining data is ", Xtr.shape)
print("The shape of the training labels is ", Ytr.shape)
print("The shape of the validation data is ", Xvl.shape)
print("The shape of the validation labels is ", Yvl.shape)

The shape of the taining data is  (1449, 3334)
The shape of the training labels is  (1449, 6)
The shape of the validation data is  (255, 3334)
The shape of the validation labels is  (255, 6)


In [None]:
# now we have to standardize the data.

# Here each datapoint is a time-series. Additionally, we have
# a very limited number of datapoints. As a result, we must
# standardize each datapoint separately. Fortunately, audio time-series
# can be normalized to have zero mean and values that lie in the
# range between -1 and 1.

# standardize the training and validation data so that each datapoint
# has zero mean, and the largest absolute value in a datapoint is 1

# Hint: these are per-datapoint (per-row) statistics, so they have to
# broadcast against the 2-D matrices in the two expressions below.
# Because the mean is subtracted before dividing, "max_*" has to be the
# largest magnitude of the mean-removed datapoint for the result to peak at 1.
mu_tr = # your code here
max_tr = # your code here
mu_vl = # your code here
max_vl = # your code here

Xtr = (Xtr-mu_tr)/max_tr
Xvl = (Xvl-mu_vl)/max_vl

In [None]:
# we have a very limited number of training datapoints.
# as a result, we must "augment" the number of training datapoints.
# here we suggest that you augment the data by adding noise to it
# and randomly shifting its pitch. However, you should consider augmenting
# your data with even more techniques.

# Q: why should we augment our data?
# A:

# create a copy of your training data to add gaussian noise
# with a small variance
Xnoise = Xtr + # your code here

# create a copy of your training data to randomly shift
# the pitch of each datapoint by a few semitones
# NOTE: recent librosa releases require the sampling rate and the number
# of steps to be passed to pitch_shift by keyword (sr=..., n_steps=...)
pitch_factors = # your code here
Xpitch = []
for i, x in enumerate(Xtr):
  Xpitch.append(librosa.effects.pitch_shift(# your code here

# now concatenate your original data with the augmented datapoints
# (the labels are repeated once per copy, in the same order as the data)
Xtr = np.concatenate((Xtr,Xnoise,np.array(Xpitch)),axis=0)
Ytr = np.concatenate((Ytr,Ytr,Ytr),axis=0)

print("The shape of the training data is ", Xtr.shape)
print("The shape of the training labels is ", Ytr.shape)
print("The shape of the validation data is ", Xvl.shape)
print("The shape of the validation labels is ", Yvl.shape)

# you should consider applying more data augmentation in order
# to be able to train a more robust model. However, not all
# data augmentation for audio will be good given what we are trying to do here

# CAUTION: this cell overwrites "Xtr" and "Ytr" with their augmented
# versions, so running it more than once makes your number of
# training datapoints grow exponentially while being very redundant

The shape of the training data is  (4347, 3334)
The shape of the training labels is  (4347, 6)
The shape of the validation data is  (255, 3334)
The shape of the validation labels is  (255, 6)


In [None]:
# use this cell to hear the differences between your
# original training data and its "augmented" versions
from IPython.display import Audio, display

# Audio expects ONE clip: a 1-D array, or a 2-D array with one row per
# channel. Passing the whole "Xtr" matrix would therefore be treated as a
# single clip with thousands of channels. Instead, pick one datapoint and
# play each of its versions, one after the other.
# NOTE: this assumes "Xtr" is the concatenation of n_copies equally sized
# blocks (original, noisy, pitch-shifted), as built in the augmentation
# cell. Update n_copies if you added more augmentations.
n_copies = 3
n_original = Xtr.shape[0] // n_copies
example = 0  # row of the original datapoint that you want to listen to

for k in range(n_copies):
  display(Audio(data=Xtr[k*n_original + example], rate=sr))

In [None]:
# Here's the main body of this homework

# we will use for loops to explore the hyperparameter space
# and find the best training routine for our softmax classifier.

# Here is a hint: the number of epochs used to obtain the
# baseline model was 5000. To beat the baseline model you may
# have to train for more (or maybe you can do less) epochs.
epochs = 5000

# define a list with regularization values that you want to try
regs = # your code here
# define a list with learning rates that you want to try
lrs = # your code here

# for each combination of reg+lr values, we will save
# the best parameters, weights, and cost functions
results = []
for reg in regs:
  for lr in lrs:
    
    print("\n####################################\n")

    # initialize the model parameters that we want
    # to learn using the training data
    W = # your code here
    b = # your code here

    # initialize the "best validation cost function" to be "infinity"
    best_Jvl = float("inf")
    # initialize the "best" W and the "best" b
    # (np.array makes a copy, so the in-place updates of W and b at the
    # end of each epoch do not modify the saved "best" parameters)
    best_W = np.array(W)
    best_b = np.array(b)

    # initialize a list to save all the
    # training and validation cost at each epoch
    all_Jtr = []
    all_Jvl = []
        
    # now show the training data
    # to the model and optimize
    # via gradient descent
    for e in range(epochs):

      # calculate y_hat with the training data
      # Q: what do you expect the initial average y_hat_tr (training) to be? why?
      # A:
      y_hat_tr = # your code here
      
      # calculate y_hat with the validation data
      y_hat_vl = # your code here
      
      # calculate the cost function with the training data (do not forget to regularize W with L2)
      # Q: what do you expect the initial J with the training data to be? why?
      # A:
      Jtr = # your code here
      all_Jtr.append(Jtr)
      # Q: why don't we regularize "b"?
      # A:
      
      # calculate the cost function with the validation data (regularizing W with L2)
      Jvl = # your code here
      all_Jvl.append(Jvl)
      
      # save the best validation cost, together with a copy of the
      # parameters that produced it
      if Jvl < best_Jvl:
        best_Jvl = Jvl
        best_W = np.array(W)
        best_b = np.array(b)
        
      # Let's print some progress indicators
      if e%500 == 0:
        print("epoch {} with reg {} and lr {}, Jtr = {}".format(e,reg,lr,Jtr))
        print("           with reg {} and lr {}, Jvl = {}".format(reg,lr,Jvl))      
      # Q: will Jtr and Jvl be the same, roughly the same, or always different?
      # A:
      # Q: what relationship do you expect to see between Jtr and Jvl?
      # A:
      # Q: how will you know if your model is overfitting to the training data?
      # A:

      # now find the gradient of the parameters
      # that we want to optimize. Do not forget the L2 regularization applied to W
      dw = # your code here
      db = # your code here
      # Q: why do we need to apply regularization to W?
      # A:
      # Q: what effect does regularization have on the cost function?
      # A:
      # Q: why do we not apply regularization to b?
      # A:

      # update the model parameters
      # NOTE: "dw" is transposed in the update, so it is expected to have
      # the shape of W.T -- check this against your gradient expression
      W -= lr*dw.T
      b -= lr*db
    
    # after the epochs are over, save the training and validation losses,
    # as well as the best_W and best_b parameters you found.
    # Each entry is [lr, reg, best_Jvl, all_Jtr, all_Jvl, best_W, best_b]
    results.append([lr, reg, best_Jvl, all_Jtr, all_Jvl, best_W, best_b])

In [None]:
# PERFORMANCE: validation set

# now that training is over, let's see how the parameters we found
# for each combination of regularization and learning rate perform 
# on the VALIDATION set
import matplotlib.pyplot as plt
import sklearn

for p in results:

  # unpack the variables needed to compute the results
  lr, reg, best_Jvl, all_Jtr, all_Jvl, best_W, best_b = p

  print("\n\n\nWith a learning rate of {} and a regularization of {}\n".format(lr, reg))  
  
  # let's plot the loss over epochs
  plt.plot(all_Jtr,label='Jtr')
  plt.plot(all_Jvl,label='Jvl')  
  plt.xlabel('epoch')
  plt.legend()
  plt.show()

  # calculate y_hat_vl using the best parameters found
  y_hat_vl = # your code here
  
  # calculate the model accuracy
  acc = # your code here
  print("The accuracy was", acc)
  
  # compute the confusion matrix and plot it
  conf_mat = sklearn.metrics.confusion_matrix(# your code here
  disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix=conf_mat,display_labels=["a","i","u","ae","e","o"])
  disp.plot()
  plt.show()

# to beat the "baseline" model, you will have to do
# several types of data augmentation and thoroughly explore
# the hyperparameter space (i.e. combinations of regularization and learning rate)

In [None]:
# Here's something kinda cool. Since each column of W helps to separate
# a different vowel, after training W, you can "hear the vowels" in the
# columns of the best weights you got.
# results[i][5] is the best_W saved for the i-th reg+lr combination, and
# [:,0] selects its first column
Audio(data=results[0][5][:,0], rate=sr) # you will have to index results properly to access the best_W you found

In [None]:
# when you are done optimizing your absolute best model, carry out the same performance
# evaluation with the test set. Do not forget to standardize your data.
# When you are done, submit a picture of your final evaluation as a post to our subreddit
# https://www.reddit.com/r/deeplearningaudio/

# post the accuracy and the confusion matrix on the evaluation set

# NOTE: this cell uses "plt" and "sklearn.metrics", which are imported in
# the validation-performance cell, so that cell has to be run first

# standardize each test datapoint in the same way as the training data
mu_ts = # your code here
max_ts = # your code here

Xts = (data_ts-mu_ts)/max_ts

# unpack the parameters that gave you the best results
lr, reg, best_Jvl, all_Jtr, all_Jvl, best_W, best_b = # your code here

print("\n\n\nWith a learning rate of {} and a regularization of {}\n".format(lr, reg))  

# let's plot the loss over epochs
plt.plot(all_Jtr,label='Jtr')
plt.plot(all_Jvl,label='Jvl')  
plt.xlabel('epoch')
plt.legend()
plt.show()

# calculate y_hat_ts using the best parameters found
y_hat_ts = # your code here

# calculate the model accuracy
acc = # your code here
print("The test-set accuracy was", acc)

# compute the confusion matrix and plot it
conf_mat = sklearn.metrics.confusion_matrix(# your code here
disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix=conf_mat,display_labels=["a","i","u","ae","e","o"])
disp.plot()
plt.show()


In [None]:
# you are done with the homework. The next cell lets you record 2 seconds of
# audio. Then you can use your model to infer the vowel you said in the audio recording.

In [None]:
#@markdown Either record audio from microphone or upload audio from file (.mp3 or .wav) { run: "auto" }

# imports for uploading/recording
%cd ~
!pip install pydub
!git clone -q --depth 1 https://github.com/snakers4/silero-models
%cd silero-models
import numpy as np
import ipywidgets as widgets
from scipy.io import wavfile
from IPython.display import Audio, display, clear_output
from colab_utils import (record_audio,
                         audio_bytes_to_np,
                         upload_audio)

record_or_upload = "Record" #@param ["Record", "Upload (.mp3 or .wav)"]
record_seconds =   2#@param {type:"number", min:1, max:10, step:1}
sample_rate = 16000
  
def _record_audio(b):
  """Button callback: record from the microphone and save 'recorded.wav'.

  `b` is the argument handed over by the button's on_click hook; it is
  not used. The recording lasts `record_seconds` seconds and is written
  as 16-bit PCM at `sample_rate` Hz.
  """
  clear_output()
  audio = record_audio(record_seconds)
  # The samples are scaled to the 16-bit range (+/-32767), so they have to
  # be stored as int16. Writing them as int32 produces a 32-bit PCM file in
  # which the signal only uses a tiny fraction of full scale.
  # Clipping first prevents wrap-around for samples outside the int16 range.
  # NOTE(review): assumes record_audio returns floats in [-1, 1] -- confirm.
  samples = np.clip((32767*audio).numpy(), -32768, 32767).astype(np.int16)
  wavfile.write('recorded.wav', sample_rate, samples)

def _upload_audio(b):
  """Clear the cell output and return whatever `upload_audio()` gives back.

  `b` is accepted only so that the signature mirrors `_record_audio`;
  it is not used.
  """
  clear_output()
  return upload_audio()

# any mode other than "Record" asks for an upload right away; in "Record"
# mode a button is shown and the recording starts when it is clicked
if record_or_upload != "Record":
  audio = _upload_audio("")
else:
  button = widgets.Button(description="Record Speech")
  button.on_click(_record_audio)
  display(button)

/root
fatal: destination path 'silero-models' already exists and is not an empty directory.
/root/silero-models


Button(description='Record Speech', style=ButtonStyle())

In [None]:
# this cell is very "hacky" and assumes that you were constantly
# saying a vowel for the duration of a two second recording

# load the audio (note that this also reassigns "sr")
audio, sr = librosa.load('recorded.wav',sr=sr)

# window to match the audio length that we trained the model to recognize:
# take 3334 samples starting half a second into the recording. np.newaxis
# adds a leading axis, so "audio" becomes a single-row matrix
audio = audio[np.newaxis,sr//2:sr//2+3334]*hann(3334)

# normalize the audio as we did before
mu_rec = # your code here
max_rec = # your code here
x = (audio-mu_rec)/max_rec

# unpack the parameters that gave you the best results
lr, reg, best_Jvl, all_Jtr, all_Jvl, best_W, best_b = # your code here

# carry out inference with your best parameters
y_hat_rec = # your code here

# print results and hear the recording
# (the order of "vowels" matches the display labels of the confusion matrices)
vowels = ['a','i','u','ae','e','o']
print('You said the vowel ',vowels[np.argmax(y_hat_rec)])
Audio(data=x, rate=sr)

You said the vowel  e
