In [4]:
import torchaudio
import torchaudio.transforms as T
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

# Settings for the mel-spectrogram front end used to featurise audio clips.
_MEL_SETTINGS = {
    "sample_rate": 16000,  # rate the transform assumes for incoming waveforms
    "n_fft": 1024,         # FFT size
    "win_length": 1024,    # analysis window, equal to n_fft
    "hop_length": 256,     # stride between frames (256 / 16000 s = 16 ms)
    "n_mels": 80,          # number of mel bands -> feature dimension
    "f_min": 0,
    "f_max": 8000,         # upper band edge; equals sample_rate / 2
    "power": 1.0,          # exponent applied to the magnitude spectrogram
    "normalized": True,
}

# Waveform -> mel-spectrogram transform shared by the rest of the notebook.
mel_transform = T.MelSpectrogram(**_MEL_SETTINGS)

In [6]:
class VoiceDataset(Dataset):
    """In-memory dataset of mel-spectrogram features for a list of audio files.

    Every file in ``data`` is decoded and passed through the module-level
    ``mel_transform`` once, during construction; the resulting features are
    kept in ``self.samples`` and served by index afterwards.

    Args:
        data: Sized sequence of audio file paths readable by ``torchaudio.load``.
        labels: Sized sequence of labels, one per entry of ``data``. Labels are
            stored and returned unchanged.

    Raises:
        ValueError: If ``data`` and ``labels`` have different lengths.
    """

    def __init__(self, data, labels):
        # Fail fast: a mismatch would otherwise surface as a late IndexError
        # (labels too short) or silently ignore labels (labels too long).
        if len(data) != len(labels):
            raise ValueError(
                f"data and labels must have the same length, "
                f"got {len(data)} and {len(labels)}"
            )
        self.data = data
        self.labels = labels
        self.samples = self.load_data()

    def load_data(self):
        """Decode every file and return a list of ``(mel, label)`` pairs.

        Each ``mel`` is a 2-D tensor laid out as (frames, n_mels).
        """
        # The transform's filterbank is built for one fixed sample rate, so
        # every clip must be brought to that rate before featurisation.
        target_rate = mel_transform.sample_rate
        samples = []
        for path, label in zip(self.data, self.labels):
            waveform, sample_rate = torchaudio.load(path)
            # Downmix multi-channel audio to mono; squeeze(0) below can only
            # remove a channel dimension of size 1.
            if waveform.size(0) > 1:
                waveform = waveform.mean(dim=0, keepdim=True)
            if sample_rate != target_rate:
                waveform = torchaudio.functional.resample(
                    waveform, sample_rate, target_rate
                )
            mel = mel_transform(waveform)
            # (1, n_mels, frames) -> (frames, n_mels)
            mel = mel.squeeze(0).transpose(0, 1)
            samples.append((mel, label))
        return samples

    def __len__(self):
        """Return the number of pre-computed samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(mel, label)`` for ``idx``; ``mel`` gains a leading channel dim."""
        mel, label = self.samples[idx]
        # Add a singleton leading dimension so the features can feed a
        # single-channel Conv2d.
        mel = mel.unsqueeze(0)
        return mel, label

In [8]:
class VoiceClassifier(nn.Module):
    """Small two-stage CNN mapping a single-channel spectrogram to class logits.

    Layers: Conv2d(1->32, 3x3, pad 1) + ReLU + 2x2 max-pool,
    Conv2d(32->64, 3x3, pad 1) + ReLU + 2x2 max-pool, then
    Linear(64*20*40 -> 128) + ReLU + Dropout(0.5) + Linear(128 -> num_classes).

    The first linear layer has a fixed input width, so each sample must
    flatten to exactly 64*20*40 values after the two pooling stages. Inputs of
    shape (batch, 1, 80, 160) or (batch, 1, 160, 80) satisfy this.

    Args:
        num_classes: Number of output logits.
    """

    # Per-sample feature count expected by ``fc1``.
    _FLAT_FEATURES = 64 * 20 * 40

    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.fc1 = nn.Linear(self._FLAT_FEATURES, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return unnormalised class logits for a batch of spectrograms.

        Raises:
            RuntimeError: If the per-sample feature count after the conv/pool
                stack differs from what ``fc1`` expects.
        """
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)

        # Verify the per-sample size before flattening. A bare
        # view(-1, N) would silently fold surplus features into the batch
        # dimension whenever the total element count is divisible by N,
        # yielding more (or fewer) output rows than input samples.
        features = x.shape[-3:].numel()
        if features != self._FLAT_FEATURES:
            raise RuntimeError(
                f"VoiceClassifier expects {self._FLAT_FEATURES} features per sample "
                f"after pooling, got {features} (tensor shape {tuple(x.shape)})"
            )
        x = x.view(-1, self._FLAT_FEATURES)

        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [14]:
import pandas as pd

# Load the tab-separated clip metadata table.
df = pd.read_csv('dataset/en/validated.tsv', sep='\t')

# Drop rows with a missing path or sentence before casting to str:
# astype(str) would otherwise turn a missing value into the literal string
# "nan", which then masquerades as a real file path / label downstream.
df = df.dropna(subset=["path", "sentence"]).reset_index(drop=True)
df["sentence"] = df["sentence"].astype(str)
df["path"] = df["path"].astype(str)

# Parallel lists: x[i] is the audio path and y[i] the sentence for clip i.
# NOTE(review): x holds the raw "path" column values; if these are bare file
# names they must be joined with the clips directory before being handed to
# torchaudio.load — confirm against the dataset layout.
x = df["path"].tolist()
y = df["sentence"].tolist()