# Assignment 1
## Training neural network
### classify audio as 'apple', 'orange', 'cherry', 'unknown'

In [22]:
# libraries
import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio.transforms as T
import librosa
import numpy as np
import soundfile as sf
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd

### UTILS

In [23]:
class AudioDataset(Dataset):
    def __init__(self, data_dir, label_file, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        
        # Define label mapping
        self.label_map = {'apple': 0, 'orange': 1, 'cherry': 2, 'unknown': 3}
        
        with open(label_file, 'r') as f:
            data = json.load(f)
        
        self.file_names = [item['path'] for item in data]
        self.labels = [item['label'] for item in data]
    
    def __len__(self):
        return len(self.file_names)
    
    def __getitem__(self, idx):
        file_path = os.path.join(self.data_dir, self.file_names[idx])
        label = self.labels[idx]
        
        # Convert string label to integer label
        label = self.label_map[label]
        
        waveform, sample_rate = sf.read(file_path, dtype='float32')
        waveform = torch.tensor(waveform).unsqueeze(0)  # Add channel dimension
        
        if self.transform:
            waveform = self.transform(waveform)
        
        return waveform, label


In [24]:
class M5(nn.Module):
    def __init__(self, num_classes):
        super(M5, self).__init__()
        self.conv1 = nn.Conv1d(1, 32, kernel_size=80, stride=16)
        self.bn1 = nn.BatchNorm1d(32)
        self.pool1 = nn.MaxPool1d(4)
        
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(64)
        self.pool2 = nn.MaxPool1d(4)
        
        self.conv3 = nn.Conv1d(64, 128, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(128)
        self.pool3 = nn.MaxPool1d(4)
        
        self.fc1 = None  # We will dynamically calculate the size of the flattened layer

    def forward(self, x):
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        
        x = x.view(x.size(0), -1)  # Flatten the tensor
        
        # Dynamically calculate the input size for the fully connected layer
        if self.fc1 is None:
            flattened_size = x.size(1)  # Get the flattened size
            self.fc1 = nn.Linear(flattened_size, 4)  # 4 output classes (apple, orange, cherry, unknown)
            self.add_module("fc1", self.fc1)  # Add the fc1 layer to the module

        x = self.fc1(x)  # Pass through the fully connected layer
        return x


### DATASET

In [25]:
# structure training label files
train_label_path = 'data/training/info.labels'
train_label_data = pd.read_json(train_label_path)

df_train = train_label_data.copy()
df_train = pd.json_normalize(df_train["files"])

df_train = df_train.drop(columns=["label.type"])
df_train = df_train.rename(columns={"label.label":"label"})

# save json file
df_train = df_train.to_json("data/train_labels.json", orient='records', indent=4) 


# structure testing label files
test_label_path = 'data/testing/info.labels'
test_label_data = pd.read_json(test_label_path)

df_test = test_label_data.copy()
df_test = pd.json_normalize(df_test["files"])

df_test = df_test.drop(columns=["label.type"])
df_test = df_test.rename(columns={"label.label":"label"})

# save json file
df_test = df_test.to_json("data/test_labels.json", orient='records', indent=4) 

In [26]:
# Load dataset
train_dataset = AudioDataset('data/training', 'data/train_labels.json')
test_dataset = AudioDataset('data/testing', 'data/test_labels.json')

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

### Model Training

In [27]:
# Initialize model
num_classes = 4  # apple, orange, cherry, unknown
model = M5(num_classes)
# Training setup
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [28]:
def train(model, loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_samples = 0
        
        for inputs, labels in loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            
            # Calculate loss
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            
            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)  # Get the index of the max log-probability
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)
        
        accuracy = 100 * correct_predictions / total_samples
        print(f"Epoch {epoch+1}, Loss: {running_loss/len(loader)}, Accuracy: {accuracy:.2f}%")
  

In [29]:
def evaluate(model, loader):
    model.eval()
    correct_predictions = 0
    total_samples = 0
    
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)
    
    accuracy = 100 * correct_predictions / total_samples
    print(f"Test Accuracy: {accuracy:.2f}%")

In [None]:
# Train the model
train(model, train_loader, criterion, optimizer, epochs=50)

Epoch 1, Loss: 1.331907373208266, Accuracy: 36.50%
Epoch 2, Loss: 1.091023550583766, Accuracy: 58.50%
Epoch 3, Loss: 0.9489156099466177, Accuracy: 69.00%
Epoch 4, Loss: 0.8679219576028677, Accuracy: 77.00%
Epoch 5, Loss: 0.8187251274402325, Accuracy: 84.00%
Epoch 6, Loss: 0.7409789837323703, Accuracy: 89.50%
Epoch 7, Loss: 0.6464042801123399, Accuracy: 90.00%
Epoch 8, Loss: 0.6097729939680833, Accuracy: 94.50%
Epoch 9, Loss: 0.5601476385043218, Accuracy: 96.00%
Epoch 10, Loss: 0.5188101621774527, Accuracy: 95.50%
Epoch 11, Loss: 0.4841405978569618, Accuracy: 95.00%
Epoch 12, Loss: 0.44619798545654005, Accuracy: 97.00%
Epoch 13, Loss: 0.46614824579312253, Accuracy: 97.50%
Epoch 14, Loss: 0.42082299292087555, Accuracy: 97.00%
Epoch 15, Loss: 0.36672885028215557, Accuracy: 98.00%
Epoch 16, Loss: 0.3709248992112967, Accuracy: 98.50%
Epoch 17, Loss: 0.3063592945153897, Accuracy: 98.50%
Epoch 18, Loss: 0.310408639220091, Accuracy: 98.50%
Epoch 19, Loss: 0.27286877769690293, Accuracy: 99.00%


### Evaluation

In [31]:
# Evaluate the model on the test set
evaluate(model, test_loader)

Test Accuracy: 80.00%
