In [None]:
from google.colab import drive
import os
import torch
from torchsummary import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import librosa.display
import matplotlib.pyplot as plt
import torch.nn.functional as F
import numpy as np
import random
import librosa
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score


# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change working directory to folder location
os.chdir('/content/drive/MyDrive/Thesis_Material')

In [None]:
#Teacher Architecture: Code from Cretois et al. (2022)
class VGG11(nn.Module):
    def __init__(self, T=5.0):
        super().__init__()
        self.T = T

        # First set of conv layers -> depth of 64
        self.conv11 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.bn11  = nn.BatchNorm2d(64)
        
        # Second set of conv layers -> from depth 64 to depth 128
        self.conv21 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn21  = nn.BatchNorm2d(128)
        
        # Third set of conv layers -> from depth 128 to depth 256
        self.conv31 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.bn31  = nn.BatchNorm2d(256)
        self.conv32 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.bn32  = nn.BatchNorm2d(256)
                      
        # Fourth set of conv layers -> from depth 128 to depth 256
        self.conv41 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.bn41  = nn.BatchNorm2d(512)
        self.conv42 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.bn42  = nn.BatchNorm2d(512)
        
        # Fifth set of conv layers -> from depth 128 to depth 256
        self.conv51 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.bn51  = nn.BatchNorm2d(512)
        self.conv52 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.bn52  = nn.BatchNorm2d(512)
              
        # First FC layer
        self.fc1 = nn.Linear(4 * 4 * 512,  4096)
        # Second FC layer
        self.fc2 = nn.Linear( 4096,  4096)
        
        # Add a dropout layer
        self.dropout = nn.Dropout(p=0.5)
        
        # Output
        self.fc3 = nn.Linear(4096, 1)
        self.sigmoid = nn.Sigmoid()
        
     

    def forward(self, x):

        # MaxPool for the first block --> img from 128x128 to 64x64
        out = F.max_pool2d(torch.relu(self.bn11(self.conv11(x))), 2)

        # MaxPool for the first block --> img from 64x64 to 32x32
        out = F.max_pool2d(torch.relu(self.bn21(self.conv21(out))), 2)

        # MaxPool for the first block --> img from 32x32 to 16x16
        out = torch.relu(self.bn31(self.conv31(out)))
        out = F.max_pool2d(torch.relu(self.bn32(self.conv32(out))), 2)
        
        # MaxPool for the first block --> img from 16x16 to 8x8
        out = torch.relu(self.bn41(self.conv41(out)))
        out = F.max_pool2d(torch.relu(self.bn42(self.conv42(out))), 2)
        
        # MaxPool for the first block --> img from 8x8 to 4x4
        out = torch.relu(self.bn51(self.conv51(out)))
        out = F.max_pool2d(torch.relu(self.bn52(self.conv52(out))), 2)
        
        # Flatten the whole thing: image of 4 x 4 * 512 
        out = out.view(-1, 4 * 4 * 512)
        out = self.dropout(torch.relu(self.fc1(out)))
        out = self.dropout(torch.relu(self.fc2(out)))

        out = self.sigmoid(self.fc3(out))
       
        return(out)

In [None]:
#Generate Mel-Spec
def generate_mel_spectrogram(x, sr, show=False, resize=True):
    sgram = librosa.stft(x, n_fft=1024, hop_length=376)
    sgram_mag, _ = librosa.magphase(sgram)
    mel_scale_sgram = librosa.feature.melspectrogram(S=sgram_mag, sr=sr, n_mels=128)
    mel_sgram = librosa.amplitude_to_db(mel_scale_sgram)
    if resize:
        # Crop the mel spectrogram to 128x128
        mel_sgram = mel_sgram[:, :128]
    if show:
        librosa.display.specshow(mel_sgram, sr=sr, x_axis='time', y_axis='mel')
        plt.colorbar(format='%+2.0f dB')
    return mel_sgram

#Dataset
class SpeechDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]
        y = self.labels[idx]
        
        if self.transform:
            x = self.transform(x)

        x = torch.tensor(x)
        return x, torch.tensor(y).unsqueeze(-1)
        
    
  
# Load data
def load_data(data_path):
    speech_dir = os.path.join(data_path, 'speech')
    no_speech_dir = os.path.join(data_path, 'no_speech')

    speech_files = [os.path.join(speech_dir, f) for f in os.listdir(speech_dir) if f.endswith('.wav')]
    no_speech_files = [os.path.join(no_speech_dir, f) for f in os.listdir(no_speech_dir) if f.endswith('.wav')]

    data = []

    for file in speech_files + no_speech_files:
        x, sr = librosa.load(file)
        mel_sgram = generate_mel_spectrogram(x, sr)
        data.append(mel_sgram)
    
    labels = [1] * len(speech_files) + [0] * len(no_speech_files)

    return data, labels

In [None]:

# Load teacher model and weights
teacher = VGG11()

Train_Data ='/content/drive/MyDrive/Thesis_Material/Synthetic_Dataset'
# Load  data
data, labels = load_data(Train_Data) 

# Split data into train, val, and test sets
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2, random_state=42)
train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.25, random_state=42)

# SpeechDataset instances of training, validation, testing
train_dataset = SpeechDataset(train_data, train_labels, transform=ToTensor())
val_dataset = SpeechDataset(val_data, val_labels, transform=ToTensor())
test_dataset = SpeechDataset(test_data, test_labels, transform=ToTensor()) 

# DataLoaders for training, validation, testing
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, drop_last=True)

In [None]:
#Training instantiation:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set hyperparameters
num_epochs = 50
T = 5.0

torch.manual_seed(18)
model = VGG11().to(device)

# loss function & optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.001)

def train_one_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
  
        optimizer.zero_grad()

        outputs = model(inputs)
       
        loss = criterion(outputs, targets.float())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    all_outputs = []
    all_targets = []

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
        

            outputs = model(inputs)
            loss = criterion(outputs, targets.float())

            running_loss += loss.item()
            all_outputs.append(outputs.detach().cpu().numpy())
            all_targets.append(targets.detach().cpu().numpy())

    all_outputs = np.concatenate(all_outputs)
    all_targets = np.concatenate(all_targets)

    auc = roc_auc_score(all_targets, all_outputs)
    f1 = f1_score(all_targets, np.round(all_outputs))

    return running_loss / len(dataloader), auc, f1

# Train and evaluate model
patience = 3
no_improvement_count = 0
best_val_loss = float('inf')

for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_auc, val_f1 = evaluate_model(model, val_loader, criterion, device)

    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val AUC: {val_auc:.4f}, Val F1: {val_f1:.4f}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improvement_count = 0
    else:
        no_improvement_count += 1
        if no_improvement_count >= patience:
            print(f'Early stopping after {epoch + 1} epochs')
            break

# Test model
test_loss, test_auc, test_f1 = evaluate_model(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}, Test AUC: {test_auc:.4f}, Test F1: {test_f1:.4f}')

  x = torch.tensor(x)


Epoch 1, Train Loss: 11.1267, Val Loss: 24.7683, Val AUC: 0.5553, Val F1: 0.6624


  x = torch.tensor(x)


Epoch 2, Train Loss: 5.5901, Val Loss: 10.4496, Val AUC: 0.6381, Val F1: 0.5577


  x = torch.tensor(x)


Epoch 3, Train Loss: 3.0244, Val Loss: 18.1219, Val AUC: 0.8130, Val F1: 0.7984


  x = torch.tensor(x)


Epoch 4, Train Loss: 0.7759, Val Loss: 1.1855, Val AUC: 0.9797, Val F1: 0.9340


  x = torch.tensor(x)


Epoch 5, Train Loss: 0.5406, Val Loss: 0.6426, Val AUC: 0.9619, Val F1: 0.8938


  x = torch.tensor(x)


Epoch 6, Train Loss: 0.8545, Val Loss: 0.3618, Val AUC: 0.9860, Val F1: 0.9653


  x = torch.tensor(x)


Epoch 7, Train Loss: 0.4065, Val Loss: 0.3690, Val AUC: 0.9861, Val F1: 0.9703


  x = torch.tensor(x)


Epoch 8, Train Loss: 0.3661, Val Loss: 0.6661, Val AUC: 0.9795, Val F1: 0.9444


  x = torch.tensor(x)


Epoch 9, Train Loss: 0.5866, Val Loss: 0.8128, Val AUC: 0.9504, Val F1: 0.8741
Early stopping after 9 epochs


  x = torch.tensor(x)


Test Loss: 0.6505, Test AUC: 0.9453, Test F1: 0.8739


In [None]:
#model evaulation 
# Test model
test_loss, test_auc, test_f1 = evaluate_model(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}, Test AUC: {test_auc:.4f}, Test F1: {test_f1:.4f}')

  x = torch.tensor(x)


Test Loss: 0.6505, Test AUC: 0.9453, Test F1: 0.8739
