<a href="https://colab.research.google.com/github/elhamod/BA865-2024/blob/main/hands-on/Audio_with_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, you will learn how to deal with audio data.

In [None]:
import os
import librosa   #for audio processing
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import torch


## Data Downloading

Next, we'll download and unzip the dataset of speech commands from tensorflow: http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz

Below we load the data and print the number of examples.

In [None]:
import torch
from pathlib import Path

def getData(sampling=16000, use_MFCC=True, n_mfcc=12, number_of_samples={'yes':2000, 'no': 2000}, noise=.0, normalize=True):
  #download the data
  my_file = Path("/content/speech_commands/")
  if not my_file.exists():
    !wget http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
    !mkdir speech_commands
    !tar -C ./speech_commands -xf speech_commands_v0.01.tar.gz

  directory = 'speech_commands/'
  all_wavs = [] # The inputs
  all_labs = [] # The outputs

  # create the dataset as arrays
  for label in number_of_samples.keys():
      wavs = [f for f in os.listdir(directory + label) if f.endswith('.wav')]
      for indx, wav in enumerate(wavs):
          samples, sample_rate = librosa.load(directory + label + '/' + wav, sr = 16000)
          samples = librosa.resample(samples, orig_sr=16000, target_sr=sampling) # Resamples the audio to possibly lower sampling rate: https://librosa.org/doc/main/generated/librosa.resample.html

          if noise != 0:
            samples = samples+noise*np.random.randn(*samples.shape)
            samples = samples/max(samples)

          if number_of_samples[label] > indx:
            if(len(samples)== sampling): # makes sure all samples have the same length
                all_wavs.append(samples)
                all_labs.append(label)

  # applying MFCC
  if use_MFCC:
    all_wavs = librosa.feature.mfcc(y=np.array(all_wavs), sr=sampling, n_mfcc=n_mfcc)
  else:
    all_wavs = np.array(all_wavs)

  all_wavs = torch.tensor(all_wavs).float()
  if not use_MFCC:
    all_wavs = all_wavs.unsqueeze(-1)

  # Output encoding
  from sklearn.preprocessing import LabelEncoder
  le = LabelEncoder()
  all_labs_encoded = le.fit_transform(all_labs)
  all_labs_encoded = torch.LongTensor(all_labs_encoded)

  print("Total number of samples:", len(all_wavs))

  return all_wavs, all_labs_encoded

In [None]:
### Some data parameters (passed to getData)

# Sampling rate the clips are resampled to (lower means less crisp audio)
sampling = 400 # Sampling rate (e.g., 16000). Smaller values (e.g., 400) seem to work better

# MFCC params: convert the raw waveform to frequency-domain features
use_MFCC = False
n_mfcc = 12 # number of MFCC coefficients per frame (higher -> more detail) (e.g. 12)

# Other data params
number_of_samples = {'yes':2000, 'no': 2000} # We will try to classify between two classes. This is the maximum number of files considered per class.
noise = 0. # If not zero, it makes the audio proportionally noisier (range: 0-1)

In [None]:
# Build the input tensor and the encoded labels; getData prints how many clips were kept.
audio_data, audio_label = getData(sampling=sampling, use_MFCC=use_MFCC, n_mfcc=n_mfcc, number_of_samples=number_of_samples, noise=noise)

Total number of samples: 3589


In [None]:
# With use_MFCC=False the shape is (examples, sampling, 1): one raw value per time step.
audio_data.shape

torch.Size([3589, 400, 1])

Playing an example

In [None]:
indx_to_play = 0  # index of the clip to listen to

# Only works if MFCC is not used (the tensor must still hold the raw waveform)
print(audio_label[indx_to_play].item())  # encoded class id of the selected clip
ipd.Audio(audio_data[indx_to_play].squeeze(),rate=sampling,autoplay=True)

1


## Build the model

In [None]:
def get_accuracy(dataloader, model):
  """Return the fraction of examples in `dataloader` that `model` classifies correctly.

  Args:
    dataloader: yields `(inputs, labels)` batches; `dataloader.dataset` must
      support `len()`.
    model: module whose output has one score per class along dim 1.

  Returns:
    A 0-dim tensor holding `correct / len(dataloader.dataset)`, so callers can
    always use `.item()`.

  The model is switched to eval mode for the measurement and its previous
  train/eval mode is restored afterwards.
  """
  # Follow the device the model lives on instead of a notebook-level flag, so
  # the function works no matter which cell defined `use_cuda`.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else None

  was_training = model.training
  model.eval()  # e.g. turns dropout off so the metric is deterministic

  correct = 0
  try:
    with torch.no_grad():
      for audios, labels in dataloader:
        if device is not None:
          audios = audios.to(device)
          labels = labels.to(device)

        outputs = model(audios)

        # Flatten the labels (as get_loss does) so a (batch, 1) label tensor
        # cannot silently broadcast against the (batch,) predictions.
        predictions = torch.argmax(outputs, dim=1)
        correct = correct + torch.sum(predictions == labels.view(-1))
  finally:
    model.train(was_training)

  # Normalize by the dataset size; as_tensor keeps the return type a tensor
  # even when the loader produced no batches.
  return torch.as_tensor(correct) / len(dataloader.dataset)


In [None]:

def get_loss(loader, net=None, loss_fn=None):
  """Return the average per-example loss over every batch in `loader`.

  Args:
    loader: yields `(inputs, labels)` batches; `loader.dataset` must support
      `len()`.
    net: model to evaluate. Defaults to the notebook-level `model`.
    loss_fn: loss returning the batch mean. Defaults to the notebook-level
      `criterion`.

  Returns:
    The sum of `batch_size * batch_loss` divided by `len(loader.dataset)`.

  Gradients are disabled, so this can only be used for evaluation. The model
  is put in eval mode while the loss is computed and its previous mode is
  restored afterwards.
  """
  net = model if net is None else net
  loss_fn = criterion if loss_fn is None else loss_fn

  # Move the data to wherever the model lives rather than relying on a global flag.
  first_param = next(net.parameters(), None)
  device = first_param.device if first_param is not None else None

  was_training = net.training
  net.eval()

  total = 0
  try:
    with torch.no_grad(): # Anything under torch.no_grad will be calculated with no gradients. Can only be used for testing, not training!
      for audios, labels in loader: # The batches.
        # step 1: move the data to the model's device
        if device is not None:
          audios = audios.to(device)
          labels = labels.to(device)

        # step 2: forward pass
        outputs = net(audios)

        # step 3: accumulate the loss, weighted by the batch size so a smaller
        # last batch does not skew the average
        total = total + audios.shape[0] * loss_fn(outputs, labels.view(-1))
  finally:
    net.train(was_training)

  return total / len(loader.dataset)

In [None]:
import torch.nn as nn

# Define the RNN classification model
class RNNClassifier(nn.Module):
    """Recurrent classifier for the audio sequences (exercise skeleton).

    The method bodies are intentionally left as placeholders to be filled in.

    NOTE(review): judging by how the notebook uses the model, inputs look like
    (batch, time steps, input_size) and outputs like (batch, num_classes)
    scores fed to CrossEntropyLoss — confirm when implementing.
    """
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
      # TODO: initialize the parent nn.Module and create the layers here.
      ######


    def forward(self, x):
      # TODO: run x through the layers and return one score per class.
      ######


## Training

In this section, you will define your experiment's parameters and your model's hyperparameters as flags. Use these flags in your code so you can switch between experiments easily.

In [None]:
use_cuda = torch.cuda.is_available()  # True when a CUDA GPU can be used

# Data
batch_size = 32 #(e.g., 32)

# optimizer
lr= 10e-4  # learning rate; note that 10e-4 equals 1e-3
epochs=2000  # number of passes over the training set

# model
model_name = "RNN"  # NOTE(review): not read by any code visible in this notebook — confirm it is still needed
hidden_size = 50 # hn for the RNN (e.g., 10)
num_layers = 3 # Number of layers for RNN (e.g. 3)

In [None]:
# Reminder of the input layout before building the loaders.
audio_data.shape

torch.Size([3589, 400, 1])

### Data Loading

In [None]:
from torch.utils.data import DataLoader
import torch
from torchvision import transforms

def getDataLoaders(audio_data, audio_label, batch_size):
  """Normalize the inputs, plot their histogram and build train/validation loaders.

  Args:
    audio_data: input tensor; the statistics are computed along dim 1 of each
      example.
    audio_label: label tensor with one entry per example.
    batch_size: batch size used by both loaders.

  Returns:
    (train_loader, train_set, val_loader, val_set) from a random 80% / 20%
    split. Only the training loader shuffles.
  """
  # Per-example normalization: subtract the mean and divide by the standard
  # deviation taken along dim 1.
  audio_data_mean = audio_data.mean(1).unsqueeze(1)
  audio_data_std = audio_data.std(1).unsqueeze(1)
  # A constant (e.g. silent) example has zero std; clamp it so that example
  # becomes all zeros instead of NaN/inf, which would poison training.
  audio_data_std = torch.clamp(audio_data_std, min=1e-8)
  audio_data_transformed = (audio_data - audio_data_mean) / audio_data_std

  # Quick visual check of the distribution of the normalized values
  hist = torch.histogram(audio_data_transformed)
  plt.plot(hist.bin_edges[:-1], hist.hist, color="r")
  plt.xlabel("normalized value")
  plt.ylabel("count")

  # Load the data loaders
  my_dataset = torch.utils.data.TensorDataset(audio_data_transformed, audio_label)
  train_set, val_set = torch.utils.data.random_split(my_dataset, [0.8, 0.2])

  train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(dataset=val_set, batch_size=batch_size, shuffle=False)

  return train_loader, train_set, val_loader, val_set

In [None]:
# Normalize the data and split it into training and validation loaders.
train_loader, train_set, val_loader, val_set = getDataLoaders(audio_data, audio_label, batch_size)

In [None]:
# Shape of a single training input (no batch dimension); its last dim is used as the model's input_size.
train_set[0][0].shape

## Classification

In [None]:
# !pip install torchinfo

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchinfo import summary

# Create an instance of the RNN classification model
input_size = train_set[0][0].shape[-1]  # size of the last dim of one example
num_classes = 2  # two classes are requested in number_of_samples ('yes' and 'no')
model = ##### TODO: instantiate the classifier here
print(model)
# The summary is given the shape of one example with a batch dimension of 1 added in front
print(summary(model, train_set[0][0].unsqueeze(0).shape))


# Keep the model on the same device the batches are moved to
if use_cuda:
  model = model.cuda()

import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
# scheduler.step() is given the validation loss once per epoch in the training loop
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=20, factor=0.9) #(e.g., factor=0.5)

for epoch in range(epochs): # The epochs.
    for i, (audios, labels) in enumerate(train_loader): # The batches.
        # step 1: Zero out the gradients.
        optimizer.zero_grad()

        # step 1.1 move data to cuda. Make sure the model is on cuda too!
        if use_cuda:
          audios = audios.cuda()
          labels = labels.cuda()

        # print('labels', labels)


        # step2: Forward pass
        outputs = model(audios)

        # print('outputs', outputs)

        # step 3: calculate the loss.
        loss = criterion(outputs, labels.view(-1))

        # step 4: Backward pass, then update the weights
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Print the training loss of every 300th batch (`epoch % 1` is always 0, so every epoch qualifies)
        if epoch %1 == 0 and i %300 == 0:
          print("Epoch", epoch+ 1, " batch", i+1, ". Training Loss: ", loss.item())

    # End of epoch: feed the validation loss to the scheduler, then report accuracies
    scheduler.step(get_loss(val_loader))
    if epoch %1 == 0:
      print("Epoch", epoch+ 1,  'acc: ', get_accuracy(train_loader,model).item(), 'val_acc: ', get_accuracy(val_loader,model).item())





In [None]:
import torch
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Evaluate the trained model on the validation set and plot its confusion matrix
model.eval()  # Set the model to evaluation mode

# Initialize empty lists for predictions and ground truth labels
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in val_loader:
        # Only move the batch to the GPU when one is in use, matching the
        # training loop; an unconditional .cuda() fails on CPU-only machines.
        if use_cuda:
            inputs = inputs.cuda()
            labels = labels.cuda()
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)  # Get predicted class labels
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute the confusion matrix (rows: true labels, columns: predictions)
cm = confusion_matrix(all_labels, all_preds)

# Plot the confusion matrix; the axes show the encoded class ids
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues, values_format=".0f")
plt.title("Confusion Matrix")
plt.show()
