In [42]:
import torchaudio
import torchaudio.transforms as T
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

mel_transform = T.MelSpectrogram(
    sample_rate=16000,
    n_fft=1024,
    win_length=1024,
    hop_length=256,
    n_mels=80,
    f_min=0,
    f_max=8000,
    power=1.0,
    normalized=True,
)

In [43]:
AtoZ={
    'a': 0,
    'b': 1,
    'c': 2,
    'd': 3,
    'e': 4,
    'f': 5,
    'g': 6,
    'h': 7,
    'i': 8,
    'j': 9,
    'k': 10,
    'l': 11,
    'm': 12,
    'n': 13,
    'o': 14,
    'p': 15,
    'q': 16,
    'r': 17,
    's': 18,
    't': 19,
    'u': 20,
    'v': 21,
    'w': 22,
    'x': 23,
    'y': 24,
    'z': 25,
    " ": 26,
}
def get_label_from_text(text:str)->list:
    """
    Convert text to a list of integers.
    """
    text = text.lower()
    return [AtoZ[char] for char in text if char in AtoZ]


In [44]:
get_label_from_text("hello world")

[7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]

In [45]:

class AudioDataset(Dataset):
    def __init__(self, audio_paths, labels):
        self.audio_paths = audio_paths
        self.labels = labels

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        def load_audio(file_path):
            waveform, sample_rate = torchaudio.load(file_path)
            return waveform, sample_rate
        waveform, _= load_audio(self.audio_paths[idx])
        mel = mel_transform(waveform)
        label = torch.tensor(get_label_from_text(self.labels[idx]), dtype=torch.long)
        return mel.squeeze(0), label

    

In [46]:
import pandas as pd
df= pd.read_csv("dataset/en/validated.tsv", sep="\t")
df["path"]= "dataset/en/clips/"+df["path"]
df["sentence"]= df["sentence"].astype(str)
X=df["path"].tolist()
Y=df["sentence"].tolist()

In [47]:
train_data = AudioDataset(audio_paths=X, labels=Y)
test_data = AudioDataset(audio_paths=X[:10], labels=Y[:10])

In [73]:

arr=[]
for i,j in train_data:
    arr.append(j.shape)
max([i for i in arr])

torch.Size([99])

In [48]:
class AudioClassifier(nn.Module):
    def __init__(self, num_classes=27):
        super(AudioClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.fc1 = nn.Linear(64 * 20 * 40, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, kernel_size=(2, 2))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, kernel_size=(2, 2))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
    
    

In [52]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=AudioClassifier().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
epochs=10
for epoch in range(epochs):
    for i,(mel,label) in enumerate(train_data):
        mel=mel.to(device)
        label=label.to(device)
        optimizer.zero_grad()
        output=model(mel)
        
        loss=criterion(output,label)
        loss.backward()
        optimizer.step()
    print(f"epoch{epoch}")

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [80, 599]