# Program 3: Audio Classification

In [1]:
!pip3 install torch torchvision torchaudio torchsummary --index-url https://download.pytorch.org/whl/cu126

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu126


## Importing Libraries

In [2]:
import os
import pandas as pd

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torchaudio
from torchsummary import summary

## Creating Custom Dataset Class

In [3]:
class AudioDataset(Dataset):
    def __init__(self, wav_directory, transformation, target_sample_rate, num_samples, device, label_file_path=None):
        
        if label_file_path is None:
            self.labels = None
        else:
            self.labels = pd.read_csv(label_file_path, header=None)   
            
        self.wav_directory = wav_directory
        self.device = device
        #register transformation onto the device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        if self.labels is None:
            return 0

        return len(self.labels)

    def __getitem__(self, index):
        wav_sample_path = self._get_wav_sample_path(index)
        
        signal, sample_rate = torchaudio.load(wav_sample_path)

        #register signal onto the device
        signal = signal.to(self.device)

        #alter sample_rate to target_sample_rate
        signal = self._resample(signal, sample_rate)
        
        #make signal into mono channel
        signal = self._mix_down(signal)

        ###ALTHOUGH IT IS GIVEN THAT EACH SIGNAL IS 5 SEC, CUT AND RIGHT PAD ENSURES THAT ALL AUDIO HAS THE SAME # of SAMPLES
        #print(signal.shape)
        
        #cut if signal has more than expected num_samples
        signal = self._cut(signal)

        #right pad if signal has less than expected num_samples
        signal = self._right_pad(signal)
        
        signal = self.transformation(signal)

        if self.labels is None:
            return signal

        label = self._get_audio_sample_label(index)
        
        return signal, label

    def _get_wav_sample_path(self, index):
        wav_file_name = f"{int(index+1):05d}.wav"
        path = os.path.join(self.wav_directory, wav_file_name)
        return path

    def _get_audio_sample_label(self, index):
        if self.labels is None:
            return signal
        return self.labels.iloc[index, 0]

    def _resample(self, signal, sample_rate):
        #only apply if sample rate is not equal to target sample rate
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate).to(self.device)
            signal = resampler(signal)
        return signal

    def _mix_down(self, signal):
        #only apply if signal has more than 1 channel
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _cut(self, signal):
        #only apply if signal has more samples than num_samples
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal
        
    def _right_pad(self, signal):
        #only apply if signal has less samples than num_samples
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_zeros = self.num_samples - length_signal
            #appends number of zeros that need to be filled on second axis of signal
            last_dim_padding = (0, num_zeros)
            signal = nn.functional.pad(signal, last_dim_padding)
        return signal

## Creating Custom CNN Network

In [6]:
class AudioCNN(nn.Module):
    def __init__(self):
        super().__init__()
        #4 conv blocks / flatten / linear / softmax
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
            #nn.BatchNorm2d()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.flatten = nn.Flatten()
        #self.linear = nn.Linear(128*5*11, 25)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(128*5*11, 26)
        #self.softmax = nn.Softmax(dim=1)

    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        x = self.dropout(x)
        logits = self.linear(x)
        #predictions = self.softmax(logits)
        predictions = logits
        return predictions
        #return x

In [7]:
cnn = AudioCNN()
summary(cnn.cuda(), (1, 64, 157))
#print(cnn.cuda())

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 16, 66, 159]             160
              ReLU-2          [-1, 16, 66, 159]               0
         MaxPool2d-3           [-1, 16, 33, 79]               0
            Conv2d-4           [-1, 32, 35, 81]           4,640
              ReLU-5           [-1, 32, 35, 81]               0
         MaxPool2d-6           [-1, 32, 17, 40]               0
            Conv2d-7           [-1, 64, 19, 42]          18,496
              ReLU-8           [-1, 64, 19, 42]               0
         MaxPool2d-9            [-1, 64, 9, 21]               0
           Conv2d-10          [-1, 128, 11, 23]          73,856
             ReLU-11          [-1, 128, 11, 23]               0
        MaxPool2d-12           [-1, 128, 5, 11]               0
          Flatten-13                 [-1, 7040]               0
          Dropout-14                 [-

## Define Training Methods

In [8]:
def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    return train_dataloader

def train_single_epoch(model, data_loader, loss_fn, optimizer, device):
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)

        prediction=model(input)
        loss = loss_fn(prediction, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"loss: {loss.item()}")

def train(model, data_loader, loss_fn, optimizer, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimizer, device)        
        print("----------------------")
    print("Finished Training")

## Training the Model

In [10]:
TRAIN_LABELS_FILE="/WAVE/archive/users/emitchell/CSEN342W25/CSEN342P3/train/labels.txt"
TRAIN_WAV_DIR="/WAVE/archive/users/emitchell/CSEN342W25/CSEN342P3/train"

BATCH_SIZE=128
EPOCHS=50
LEARNING_RATE=0.001

#5 second audio signals
SAMPLE_RATE=16000
NUM_SAMPLES= 80000

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")

#instantiate transformer
mel_spectogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)

#instantiate dataset object
train_dataset = AudioDataset(TRAIN_WAV_DIR, mel_spectogram, SAMPLE_RATE, NUM_SAMPLES, device, TRAIN_LABELS_FILE)

print(f"There are {len(train_dataset)} samples in the dataset.")
signal, label = train_dataset[2]
print(label)
#print(signal.shape)

#instatnitate dataloader
train_dataloader = create_data_loader(train_dataset, BATCH_SIZE)

cnn = AudioCNN().to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)

train(cnn, train_dataloader, loss_fn, optimizer, device, EPOCHS)

torch.save(cnn.state_dict(), "/WAVE/archive/users/emitchell/CSEN342W25/CSEN342P3/saved_models/program3_2.pth")
print("Trained cnn saved")


Using device: cuda
There are 350 samples in the dataset.
1
Epoch 1
loss: 30.611759185791016
----------------------
Epoch 2
loss: 8.380337715148926
----------------------
Epoch 3
loss: 2.9110939502716064
----------------------
Epoch 4
loss: 2.344248056411743
----------------------
Epoch 5
loss: 2.192189931869507
----------------------
Epoch 6
loss: 2.0632848739624023
----------------------
Epoch 7
loss: 2.037885904312134
----------------------
Epoch 8
loss: 1.9379204511642456
----------------------
Epoch 9
loss: 1.8714059591293335
----------------------
Epoch 10
loss: 1.749122142791748
----------------------
Epoch 11
loss: 1.6265904903411865
----------------------
Epoch 12
loss: 1.5361310243606567
----------------------
Epoch 13
loss: 1.444130539894104
----------------------
Epoch 14
loss: 1.3229265213012695
----------------------
Epoch 15
loss: 1.2557774782180786
----------------------
Epoch 16
loss: 1.1826640367507935
----------------------
Epoch 17
loss: 1.1635364294052124
----------

## Evaluating the Model

In [13]:

def predict(model, input, target):
    model.eval()
    with torch.no_grad():
        predictions = model(input)
        predicted = int(predictions[0].argmax(0))
        expected = target
    return predicted, expected

In [14]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

#load the model
cnn = AudioCNN()
state_dict = torch.load("/WAVE/archive/users/emitchell/CSEN342W25/CSEN342P3/saved_models/program3_2.pth")
cnn.load_state_dict(state_dict)
cnn = cnn.to(device)

#load the validation dataset
VAL_LABELS_FILE="/WAVE/archive/users/emitchell/CSEN342W25/CSEN342P3/val/labels.txt"
VAL_WAV_DIR="/WAVE/archive/users/emitchell/CSEN342W25/CSEN342P3/val"

#5 second audio signals
SAMPLE_RATE=16000
NUM_SAMPLES= 80000

#instantiate transformer
mel_spectogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)

#instantiate dataset object
val_dataset = AudioDataset(VAL_WAV_DIR, mel_spectogram, SAMPLE_RATE, NUM_SAMPLES, device, VAL_LABELS_FILE)

#make inferences on labels
true = 0
for i in range(len(val_dataset)):
    input, target = val_dataset[i][0], val_dataset[i][1]
    #print(input)
    #print(target)
    input.unsqueeze_(0)
    predicted, expected = predict(cnn, input, target)
    print(f"Predicted: '{predicted}', Expected: '{expected}'")
    if predicted == expected:
        true+=1


print(f"True samples: {true}, All samples: {len(val_dataset)}, Percentage: {true/len(val_dataset)}")
    

Predicted: '25', Expected: '23'
Predicted: '15', Expected: '15'
Predicted: '3', Expected: '6'
Predicted: '22', Expected: '16'
Predicted: '20', Expected: '20'
Predicted: '4', Expected: '5'
Predicted: '7', Expected: '14'
Predicted: '2', Expected: '4'
Predicted: '1', Expected: '1'
Predicted: '25', Expected: '25'
Predicted: '19', Expected: '22'
Predicted: '4', Expected: '21'
Predicted: '8', Expected: '9'
Predicted: '14', Expected: '7'
Predicted: '5', Expected: '10'
Predicted: '1', Expected: '8'
Predicted: '10', Expected: '10'
Predicted: '25', Expected: '18'
Predicted: '8', Expected: '8'
Predicted: '24', Expected: '7'
Predicted: '19', Expected: '19'
Predicted: '12', Expected: '12'
Predicted: '7', Expected: '2'
Predicted: '7', Expected: '3'
Predicted: '6', Expected: '6'
Predicted: '1', Expected: '8'
Predicted: '5', Expected: '19'
Predicted: '21', Expected: '21'
Predicted: '16', Expected: '2'
Predicted: '4', Expected: '16'
Predicted: '1', Expected: '1'
Predicted: '1', Expected: '23'
Predicted

## Inference

In [15]:
def predict_test(model, input):
    model.eval()
    with torch.no_grad():
        predictions = model(input)
        predicted = int(predictions[0].argmax(0))
    return predicted

In [16]:
PREDICTIONS_PATH = "/WAVE/archive/users/emitchell/CSEN342W25/CSEN342P3/predictions.txt"

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

#load the model
cnn = AudioCNN()
state_dict = torch.load("/WAVE/archive/users/emitchell/CSEN342W25/CSEN342P3/saved_models/program3_2.pth")
cnn.load_state_dict(state_dict)
cnn = cnn.to(device)

#5 second audio signals
SAMPLE_RATE=16000
NUM_SAMPLES= 80000

#instantiate transformer
mel_spectogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)

#load the test dataset
TEST_WAV_DIR="/WAVE/archive/users/emitchell/CSEN342W25/CSEN342P3/test"
test_dataset = AudioDataset(TEST_WAV_DIR, mel_spectogram, SAMPLE_RATE, NUM_SAMPLES, device)

format_lines = []

#print(test_dataset[i])

for i in range(75):
    input = test_dataset[i]
    input.unsqueeze_(0)
    predicted = predict_test(cnn, input)
    format_lines.append(str(predicted))

with open(PREDICTIONS_PATH, "w") as output_file:
        output_file.write("\n".join(format_lines))