In [None]:
# 3. Add required transform for training files
# 4. Use transfer learn for model
# 5. Inference on test files

# 1. Download dataset

In [None]:
# TODO: Get dataset
import gdown

url = 'https://drive.google.com/uc?id=1S9lZmbaPyGohZ6INfTHhcmOxD9r4DQrR'
output = '/content/denoised_dataset.zip'

gdown.download(url, output, quiet=False)


Downloading...
From (original): https://drive.google.com/uc?id=1S9lZmbaPyGohZ6INfTHhcmOxD9r4DQrR
From (redirected): https://drive.google.com/uc?id=1S9lZmbaPyGohZ6INfTHhcmOxD9r4DQrR&confirm=t&uuid=199f3d31-04c0-4c6b-b488-271c70513c54
To: /content/denoised_dataset.zip
100%|██████████| 675M/675M [00:08<00:00, 81.1MB/s]


'/content/denoised_dataset.zip'

In [None]:
# TODO : Extract denoised dataset

!unzip -q "/content/denoised_dataset.zip" -d "/content/"

In [None]:
!ls

denoised_dataset.zip  denoised_enrollments  denoised_test  sample_data


In [None]:
train_path = "/content/denoised_enrollments"
test_path = "/content/denoised_test"
trials_file_path = "/content/drive/MyDrive/signle-channel-trials.txt"

# Pretrained model weights
model_path = "/content/drive/MyDrive/Sasika/xvector_weights.pth"

In [None]:
# prompt: mount drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Model Architecture

In [None]:
import torch.nn as nn
import torch.nn.functional as F


class TDNN(nn.Module):
    def __init__(self, context_size, dilation, in_dim, out_dim):
        super(TDNN, self).__init__()

        self.context_size = context_size
        self.dilation = dilation
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.layer_context_size = context_size * dilation - dilation + 1
        self._input_x_output = f"{in_dim * context_size} x {out_dim}"

        self.kernel = nn.Linear(in_dim * context_size, out_dim)
        self.nonlinearity = nn.ReLU()

    def forward(self, x):
        assert len(x.shape) == 3
        B, T, D = x.shape
        assert (
            D == self.in_dim
        ), f"[error] Expected input dimension is {self.in_dim}, not {D}."

        x = x.unsqueeze(1)

        # Unfold input into smaller temporal contexts
        x = F.unfold(
            x,
            (self.context_size, self.in_dim),
            stride=(1, self.in_dim),
            dilation=(self.dilation, 1),
        )

        # x.shape: (N, output_dim*context_size, new_t)
        x = x.transpose(1, 2)
        x = self.kernel(x.float())
        x = self.nonlinearity(x)

        return x


In [None]:
################################################### Original Code
from pathlib import Path
import torch
import torch.nn as nn
# from .tdnn import TDNN
# from xvector_jtubespeech.network.tdnn import TDNN


# Don't use the following name (Xvector)
def XVector(model_path="xvectors_weights.pth"):
    model_not_exist_msg = (
        f"[error] dumped file of model's state dict does not exist at {model_path}"
    )
    assert Path(model_path).exists(), model_not_exist_msg

    model = _XVector(24, 1233)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    for param in model.parameters():
        param.requires_grad = False

    return model


class _XVector(nn.Module):
    def __init__(self, in_dim, classes, stat_dim=1500, hidden_dim=512):
        super(_XVector, self).__init__()

        self.stat_dim = stat_dim
        self.hidden_dim = hidden_dim

        # from Table 1. of the X-Vectors paper:
        # https://www.danielpovey.com/files/2018_icassp_xvectors.pdf
        self.frames = nn.Sequential(
            TDNN(5, 1, in_dim, hidden_dim),
            TDNN(3, 2, hidden_dim, hidden_dim),
            TDNN(3, 3, hidden_dim, hidden_dim),
            TDNN(1, 1, hidden_dim, hidden_dim),
            TDNN(1, 1, hidden_dim, stat_dim),
        )
        self.segment_6 = nn.Linear(stat_dim * 2, hidden_dim)
        self.segment_7 = nn.Linear(hidden_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, classes)

    def vectorize(self, x):
        x = self.frames(x)

        # stats-pooling
        mean = torch.mean(x, 1)
        std = torch.std(x, 1)
        x = torch.cat((mean, std), 1)

        vec = self.segment_6(x)

        return vec


In [None]:
# The architecture for my implementation
class _Xvector(nn.Module):
    def __init__(self, in_dim, classes, stat_dim=1500, hidden_dim=512):
        super(_XVector, self).__init__()

        self.stat_dim = stat_dim
        self.hidden_dim = hidden_dim

        # from Table 1. of the X-Vectors paper:
        # https://www.danielpovey.com/files/2018_icassp_xvectors.pdf
        self.frames = nn.Sequential(
            TDNN(5, 1, in_dim, hidden_dim),
            TDNN(3, 2, hidden_dim, hidden_dim),
            TDNN(3, 3, hidden_dim, hidden_dim),
            TDNN(1, 1, hidden_dim, hidden_dim),
            TDNN(1, 1, hidden_dim, stat_dim),
        )
        self.segment_6 = nn.Linear(stat_dim * 2, hidden_dim)
        self.segment_7 = nn.Linear(hidden_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, classes)

    def vectorize(self, x):
        x = self.frames(x)

        # stats-pooling
        mean = torch.mean(x, 1)
        std = torch.std(x, 1)
        x = torch.cat((mean, std), 1)

        vec = self.segment_6(x)

        return vec

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using",device)

Using cpu


In [None]:
# Load the pretrained model
model = _XVector(24, 1233)
model.load_state_dict(torch.load(model_path,map_location=device))

<All keys matched successfully>

In [None]:
import os
os.listdir()

['.config',
 'drive',
 'denoised_test',
 'denoised_dataset.zip',
 'denoised_enrollments',
 'sample_data']

In [None]:
model.state_dict();

### Transfer Learning

In [None]:
model;

In [None]:
class TransferLearningModel(nn.Module):
    def __init__(self, pre_trained_model, classes, stat_dim=1500, hidden_dim=512):
        super(TransferLearningModel, self).__init__()

        # Use the pre-trained model until the last layer
        self.pre_trained_model = pre_trained_model

        # Modify the output layer to match the number of classes in the French dataset
        self.pre_trained_model.output = nn.Linear(pre_trained_model.hidden_dim, classes)

    def forward(self, x):
        # Forward pass through the modified model
        x = self.pre_trained_model(x)
        return x


In [None]:
num_speakers = 75
num_epochs = 40
batch_size = 32

In [81]:
import os
import torch
from torch.utils.data import Dataset
from scipy.io import wavfile
import torch.nn.functional as F
import numpy as np
from torchaudio.compliance import kaldi

max_len = 2000

# Define your custom dataset class
class AudioDataset(Dataset):
    def __init__(self, root_dir):
        self.root_dir = root_dir
        self.file_list = os.listdir(root_dir)
        self.transform = None

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_name = self.file_list[idx]
        # Construct the full path to the file
        file_path = os.path.join(self.root_dir, file_name)

        speaker_label = file_name.split("-")[0]

        sample_rate, waveform = wavfile.read(file_path)

        waveform = torch.from_numpy(waveform.astype(np.float32)).unsqueeze(0)

        # Check if CUDA is available and move data to GPU
        waveform = waveform.to(device)

        # Apply preprocessing (MFCC extraction)
        mfcc = kaldi.mfcc(waveform, num_ceps=24, num_mel_bins=24)  # [1, T, 24]
        mfcc = mfcc.unsqueeze(0)

        # Apply any additional transformations if needed
        if self.transform:
            mfcc = self.transform(mfcc)

        # Get the dynamic time dimension T
        T = mfcc.size(1)

        # Pad or truncate the time dimension to a fixed length (max_len)
        if T < max_len:
            # If T is less than max_len, pad with zeros
            padding = (0, max_len - T)
            mfcc_padded = F.pad(mfcc, padding, 'constant', 0)

        elif T > max_len:
            # If T is greater than max_len, truncate
            mfcc_padded = mfcc[:, :max_len, :]

        print(mfcc_padded.shape)

        return mfcc_padded, speaker_label


In [82]:
# Instantiate your _XVector model -----------------------------------------------------------
in_dim = 1  # Assuming mono audio
classes = 75  # Set the number of classes accordingly
model = _XVector(in_dim, classes)

# Instantiate your custom dataset
dataset = AudioDataset(root_dir='/content/denoised_enrollments/')

# Set your batch size and other training parameters
batch_size = 32
num_epochs = 10
learning_rate = 0.001

# Create a DataLoader -----------------------------------------------------------------------
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


# Training -----------------------------------------------------------------------------------
for epoch in range(num_epochs):
    for batch_idx, (mfcc, speaker_label) in enumerate(data_loader):
        mfcc, speaker_label = mfcc.to(device), speaker_label.to(device)

        # Forward pass
        outputs = model(mfcc)

        # Compute the loss
        loss = criterion(outputs, speaker_label)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(data_loader)}], Loss: {loss.item()}')


torch.Size([1, 1188, 836])
torch.Size([1, 318, 1706])
torch.Size([1, 327, 1697])
torch.Size([1, 591, 1433])
torch.Size([1, 1304, 720])
torch.Size([1, 324, 1700])
torch.Size([1, 362, 1662])
torch.Size([1, 1688, 336])
torch.Size([1, 383, 1641])
torch.Size([1, 326, 1698])
torch.Size([1, 1082, 942])
torch.Size([1, 1198, 826])
torch.Size([1, 319, 1705])
torch.Size([1, 345, 1679])
torch.Size([1, 330, 1694])
torch.Size([1, 315, 1709])
torch.Size([1, 1044, 980])
torch.Size([1, 413, 1611])
torch.Size([1, 368, 1656])
torch.Size([1, 341, 1683])
torch.Size([1, 1590, 434])
torch.Size([1, 373, 1651])
torch.Size([1, 367, 1657])
torch.Size([1, 1241, 783])
torch.Size([1, 380, 1644])
torch.Size([1, 401, 1623])
torch.Size([1, 1283, 741])
torch.Size([1, 1417, 607])
torch.Size([1, 371, 1653])
torch.Size([1, 575, 1449])
torch.Size([1, 451, 1573])
torch.Size([1, 1240, 784])


RuntimeError: stack expects each tensor to be equal size, but got [1, 1188, 836] at entry 0 and [1, 318, 1706] at entry 1