In [1]:
import torchaudio
import torchaudio.transforms as T
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

mel_transform = T.MelSpectrogram(
    sample_rate=16000,
    n_fft=1024,
    win_length=1024,
    hop_length=256,
    n_mels=80,
    f_min=0,
    f_max=8000,
    power=1.0,
    normalized=True,
)

In [19]:
AtoZ={
    'a': 0,
    'b': 1,
    'c': 2,
    'd': 3,
    'e': 4,
    'f': 5,
    'g': 6,
    'h': 7,
    'i': 8,
    'j': 9,
    'k': 10,
    'l': 11,
    'm': 12,
    'n': 13,
    'o': 14,
    'p': 15,
    'q': 16,
    'r': 17,
    's': 18,
    't': 19,
    'u': 20,
    'v': 21,
    'w': 22,
    'x': 23,
    'y': 24,
    'z': 25,
    " ": 26,
}
def get_label_from_text(text:str)->list:
    """
    Convert text to a list of integers.
    """
    text = text.lower()
    return [AtoZ[char] for char in text if char in AtoZ]


In [20]:
get_label_from_text("hello world")

[7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]

In [22]:

class AudioDataset(Dataset):
    def __init__(self, audio_paths, labels):
        self.audio_paths = audio_paths
        self.labels = labels

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        def load_audio(file_path):
            waveform, sample_rate = torchaudio.load(file_path)
            return waveform, sample_rate
        waveform, _= load_audio(self.audio_paths[idx])
        mel = mel_transform(waveform)
        label = torch.tensor(get_label_from_text(self.labels[idx]), dtype=torch.long)
        return mel.squeeze(0), label

    

In [14]:
import pandas as pd
df= pd.read_csv("dataset/en/validated.tsv", sep="\t")
df["path"]= "dataset/en/clips/"+df["path"]
df["sentence"]= df["sentence"].astype(str)
X=df["path"].tolist()
Y=df["sentence"].tolist()

In [23]:
train_data = AudioDataset(audio_paths=X, labels=Y)
train_data[0]

(tensor([[1.2611e-05, 5.1856e-05, 1.5952e-04,  ..., 5.2414e-03, 2.6059e-03,
          1.1178e-02],
         [1.4391e-05, 3.4026e-05, 3.6697e-05,  ..., 1.1884e-02, 9.2271e-03,
          1.0617e-02],
         [4.8470e-06, 6.8994e-06, 6.1866e-06,  ..., 1.8028e-02, 1.4053e-02,
          1.9148e-02],
         ...,
         [1.8270e-07, 1.9154e-07, 2.7454e-07,  ..., 5.7450e-07, 1.3754e-04,
          7.9204e-04],
         [2.1495e-07, 2.6207e-07, 2.8719e-07,  ..., 4.7664e-07, 1.3295e-04,
          7.6554e-04],
         [1.5080e-07, 2.1374e-07, 2.3231e-07,  ..., 5.4438e-07, 1.3185e-04,
          7.5922e-04]]),
 tensor([19,  7,  4, 26, 14, 20, 19,  4, 17, 26, 17,  8, 12, 26,  7,  0, 18, 26,
         20, 13,  3,  4, 17,  6, 14, 13,  4, 26, 18, 14, 12,  4, 26,  4, 17, 14,
         18,  8, 14, 13, 26,  3, 20,  4, 26, 19, 14, 26, 18, 20,  1, 18,  4, 16,
         20,  4, 13, 19, 26,  8, 12, 15,  0,  2, 19, 18]))