In [1]:
import numpy as np
# import pandas as pd
import librosa
# import tensorflow as tf
# from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
import torch
import torchaudio
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Load the pre-computed feature array (x) and its matching label array (y)
# from disk; both files must exist relative to the notebook's working directory.
x, y = (
    np.load("data/features/x_1_3.npy"),
    np.load("data/features/y_1_3.npy"),
)

In [4]:
# Hold out 20% of the samples for evaluation. The fixed random_state keeps the
# partition reproducible, so the same samples land in the test split on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the resulting split sizes.
print("Training Data Shape:", x_train.shape)
print("Test Data Shape:", x_test.shape)

Training Data Shape: (117052, 43, 39)
Test Data Shape: (29264, 43, 39)


In [6]:
# Per-sample feature-map size passed to the model: (time_frames, mfcc_features).
input_shape = (43, 39)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class SpeakerCNN(nn.Module):
    """CNN speaker classifier over (time_frames x mfcc_features) feature maps.

    Three Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> MaxPool2d(2) stages
    (32, 64 and 128 channels) are followed by dropout on the flattened
    features and a two-layer fully connected head producing one logit per
    speaker.

    Args:
        input_shape: ``(time_frames, mfcc_features)`` of one sample,
            e.g. ``(43, 39)``.
        no_speakers: Number of output classes (width of the final layer).
        dropout_rate: Probability handed to ``nn.Dropout``.
    """

    def __init__(self, input_shape, no_speakers, dropout_rate=0.7):
        super().__init__()

        # Remember the per-sample dimensions the network was sized for.
        self.time_frames, self.mfcc_features = input_shape

        # Stage 1: 1 -> 32 channels, spatial size halved by the pool.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(2)

        # Stage 2: 32 -> 64 channels.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2)

        # Stage 3: 64 -> 128 channels.
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(2)

        # Regularisation applied to the flattened convolutional features.
        self.dropout = nn.Dropout(dropout_rate)

        # The padded 3x3 convolutions keep the spatial size, so only the three
        # pooling layers shrink each axis; the classifier input is
        # channels * pooled_time * pooled_features.
        pooled_time = self._calculate_output_dim(self.time_frames, 3)
        pooled_feats = self._calculate_output_dim(self.mfcc_features, 3)

        # Classifier head.
        self.fc1 = nn.Linear(128 * pooled_time * pooled_feats, 256)
        self.fc2 = nn.Linear(256, no_speakers)

    def _calculate_output_dim(self, size, num_pools):
        """Return ``size`` after ``num_pools`` successive floor-halvings.

        Each MaxPool2d(2) layer (default stride, no padding, floor mode)
        maps a length ``n`` to ``n // 2``.
        """
        remaining = num_pools
        while remaining > 0:
            size //= 2
            remaining -= 1
        return size

    def forward(self, x):
        """Compute class logits for a batch.

        Args:
            x: Tensor of shape ``[batch_size, time_frames, mfcc_features]``.

        Returns:
            Tensor of shape ``[batch_size, no_speakers]`` with raw
            (un-normalised) scores.
        """
        # Insert the single input channel expected by Conv2d.
        x = x.unsqueeze(1)

        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
        )
        for conv, bn, pool in stages:
            x = pool(F.relu(bn(conv(x))))

        # Flatten everything except the batch axis, then regularise.
        x = self.dropout(x.view(x.size(0), -1))

        return self.fc2(F.relu(self.fc1(x)))


In [15]:
# Build the classifier for the configured input size with one output per
# speaker class (51 classes).
model = SpeakerCNN(input_shape=input_shape, no_speakers=51)

In [16]:
# Show the layer-by-layer module summary to verify the architecture and the
# computed fc1 input size.
print(model)

SpeakerCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.7, inplace=False)
  (fc1): Linear(in_features=2560, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=51, bias=True)
)


In [17]:
# Convert the NumPy splits to tensors: float32 features for the network,
# int64 (long) class indices for the labels as required by CrossEntropyLoss.
x_train_tensor, x_test_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (x_train, x_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.long) for targets in (y_train, y_test)
)

In [18]:
batch_size = 128

# Pair every feature tensor with its label so they are batched together.
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)

# Only the training loader shuffles; the held-out loader keeps a fixed order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
valid_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [20]:
# Training-length and early-stopping bookkeeping.
# NOTE(review): no training loop is visible in this part of the notebook, so
# these values are presumably consumed elsewhere — confirm before removing.
num_epochs = 200
patience = 20
epochs_no_improve = 0
best_val_loss = float('inf')

# Multi-class classification loss over raw logits, and the Adam optimizer
# over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [12]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(device)

# Move the model's parameters and buffers to the selected device. As the
# cell's last expression, the returned module is displayed.
model.to(device)

cuda


SpeakerCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=2560, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=51, bias=True)
)

In [22]:
# Restore the trained weights into the model.
# `map_location=device` remaps the stored tensors onto the device selected
# earlier, so a checkpoint written on a CUDA machine still loads on a
# CPU- or MPS-only host instead of raising a deserialization error.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code — only load checkpoints from a trusted source (and consider
# `weights_only=True` if the installed torch version supports it).
model.load_state_dict(
    torch.load("best_model_state_3_sec.pth", map_location=device)
)

<All keys matched successfully>

In [28]:
# Peek at one held-out sample: its feature matrix and its class label.
(x_test_tensor[3], y_test_tensor[3])

(tensor([[ 0.1953, -1.0747, -1.3090,  ...,  1.5689,  0.4800,  1.6789],
         [ 0.4530, -0.9632, -0.5165,  ...,  1.5689,  0.4800,  1.6789],
         [ 0.5668, -0.5311,  0.2347,  ...,  1.5689,  0.4800,  1.6789],
         ...,
         [-1.5110, -1.3076,  0.0838,  ...,  0.2059, -0.9821, -1.8322],
         [-1.6919, -1.1217,  0.0286,  ..., -0.8351, -0.4039, -1.0808],
         [-1.7599, -1.0059,  0.2803,  ..., -1.2967,  0.1420, -0.0318]]),
 tensor(14))

In [31]:
# Make sure the model lives on the selected device and switch it to
# evaluation mode (dropout disabled, BatchNorm uses its running statistics).
# Both calls return the module itself, so the chain displays the model.
model.to(device).eval()

SpeakerCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.7, inplace=False)
  (fc1): Linear(in_features=2560, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=51, bias=True)
)

In [33]:
# Running totals for the evaluation pass: summed loss and number of correct
# predictions.
val_loss, correct = 0, 0

In [None]:
# Evaluate the model on the held-out split and report mean loss and accuracy.

# Guarantee inference behaviour (dropout off, BatchNorm running statistics)
# even if this cell is run without the earlier eval() cell.
model.eval()

# Reset the accumulators here so that re-running this cell never adds to
# totals left over from a previous run.
val_loss = 0.0
correct = 0

# Gradients are not needed for evaluation; no_grad() avoids building the
# autograd graph, which saves memory and time.
with torch.no_grad():
    for inputs, labels in valid_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        # criterion returns the batch mean; scale by the batch size so the
        # final division yields a per-sample average even when the last
        # batch is smaller.
        val_loss += loss.item() * inputs.size(0)
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()

epoch_val_loss = val_loss / len(test_dataset)
epoch_val_acc = correct / len(test_dataset)

# Print epoch info
print(f"Val Loss: {epoch_val_loss:.4f} | Val Acc: {epoch_val_acc:.4f}")

Val Loss: 0.5142 | Val Acc: 0.8246


: 