In [1]:
import os
import librosa
import pandas as pd
import numpy as np
import torch
from torch import nn
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)
print(torch.cuda.get_device_name(0))

audio_dir = "./AUDIO"
csv_file = "./TEXT/AUDIO.csv"

audio_files = os.listdir(audio_dir)

x_train = []
y_train = []

df = pd.read_csv(csv_file)

Device:  cuda
NVIDIA GeForce RTX 3070 Ti


In [3]:
for file in audio_files:
    if not file.endswith(".mp3"):
        continue

    file_path = os.path.join(audio_dir, file)

    y, sr = librosa.load(file_path, sr=None, mono=True)

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc = np.transpose(mfcc, (1, 0))

    x_train.append(torch.tensor(mfcc))

    matched_text = df.loc[df['Video Matching'] == file, 'Text'].values[0]

    y_train.append(matched_text)

    print(f"Processed {file}")

Processed Audio1.mp3
Processed Audio10.mp3
Processed Audio11.mp3
Processed Audio12.mp3
Processed Audio13.mp3
Processed Audio14.mp3
Processed Audio15.mp3
Processed Audio16.mp3
Processed Audio17.mp3
Processed Audio18.mp3
Processed Audio19.mp3
Processed Audio2.mp3
Processed Audio20.mp3
Processed Audio21.mp3
Processed Audio22.mp3
Processed Audio23.mp3
Processed Audio24.mp3
Processed Audio25.mp3
Processed Audio26.mp3
Processed Audio27.mp3
Processed Audio28.mp3
Processed Audio29.mp3
Processed Audio3.mp3
Processed Audio30.mp3
Processed Audio31.mp3
Processed Audio32.mp3
Processed Audio33.mp3
Processed Audio34.mp3
Processed Audio35.mp3
Processed Audio36.mp3
Processed Audio37.mp3
Processed Audio38.mp3
Processed Audio39.mp3
Processed Audio4.mp3
Processed Audio40.mp3
Processed Audio41.mp3
Processed Audio42.mp3
Processed Audio43.mp3
Processed Audio44.mp3
Processed Audio45.mp3
Processed Audio46.mp3
Processed Audio47.mp3
Processed Audio48.mp3
Processed Audio49.mp3
Processed Audio5.mp3
Processed Audio

In [7]:
characters = list(set(char for label in y_train for char in label))
characters.append('<PAD>')

print("Number of characters: ", len(characters))
print("Characters: ", characters)

char_to_id = {char: id for id, char in enumerate(characters)}
id_to_char = {id: char for char, id in char_to_id.items()}

y_train_ids = [[char_to_id[char] for char in label] for label in y_train]
max_len = max(max(len(mfcc) for mfcc in x_train), max(len(label) for label in y_train_ids))

y_train_padded_ids = pad_sequences(y_train_ids, maxlen=max_len, padding='post', value=char_to_id['<PAD>'])
y_train_padded_ids = y_train_padded_ids.reshape(-1, 1)

onehot_encoder = OneHotEncoder()
onehot_encoder.fit(np.array(list(id_to_char.keys())).reshape(-1, 1))

y_train_onehot = onehot_encoder.transform(y_train_padded_ids).toarray()
y_train_onehot_padded = pad_sequences(y_train_onehot, maxlen=max_len, padding='post')

x_train_padded = pad_sequences(x_train, maxlen=max_len, padding='post')

print("x_train_padded.shape: ", x_train_padded.shape)
print("y_train_onehot_padded.shape: ", y_train_onehot_padded.shape)

Number of characters:  55
Characters:  ['u', 'S', 'n', '3', 'W', ':', 'I', 'M', 'y', 'b', '0', 'l', 'f', 'x', 'p', '-', 'o', '"', 'h', 'H', '1', 'g', 'Y', 'A', 'J', 'q', 'N', 'v', '?', 'O', 'r', ' ', 'B', 'i', 'j', 'R', 'G', 'a', 't', 'e', '_', ',', 'P', 'm', 'E', 'k', 'c', 'd', "'", '.', 'T', 'w', '4', 's', '<PAD>']
x_train_padded.shape:  (50, 2147, 13)
y_train_onehot_padded.shape:  (107350, 2147)


In [9]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

In [10]:
model = Model(13, 64, len(characters), num_layers=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [15]:
x_train_tensor = torch.tensor(x_train_padded).to(device)
y_train_tensor = torch.tensor(y_train_onehot_padded).to(device)

In [16]:
encoder = OneHotEncoder()
encoder.fit(y_train_padded_ids)

y_train_onehot = [encoder.transform(label.reshape(-1, 1)).toarray() for label in y_train_padded_ids]
y_train_onehot_padded = np.stack(y_train_onehot)
new_batch_size = 10000
y_train_onehot_padded_subset = y_train_onehot_padded[:new_batch_size]

In [22]:
epoch_amount = 100

for epoch in range(epoch_amount):
    # Load the model
    if os.path.exists("model.pth"):
        model.load_state_dict(torch.load("model.pth"))

    outputs = model(x_train_tensor.float())
    outputs = outputs.float()

    y_train_tensor = y_train_tensor.view(-1).long()

    if outputs.shape[0] != y_train_tensor.shape[0]:
        y_train_tensor = y_train_tensor[:outputs.shape[0]]

    print("outputs.shape: ", outputs.shape)

    loss = criterion(outputs, y_train_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    torch.save(model.state_dict(), "model.pth")

    outputs = outputs.argmax(1)

    # Get 20 of the audio files to test and check the accuracy
    test_files = audio_files[:40]

    x_test = []
    y_test = []

    for file in test_files:
        if not file.endswith(".mp3"):
            continue

        file_path = os.path.join(audio_dir, file)

        y, sr = librosa.load(file_path, sr=None, mono=True)

        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfcc = np.transpose(mfcc, (1, 0))

        x_test.append(torch.tensor(mfcc))

        matched_text = df.loc[df['Video Matching'] == file, 'Text'].values[0]

        y_test.append(matched_text)

    y_test_ids = [[char_to_id[char] for char in label] for label in y_test]
    y_test_padded_ids = pad_sequences(y_test_ids, maxlen=max_len, padding='post', value=char_to_id['<PAD>'])
    y_test_padded_ids = y_test_padded_ids.reshape(-1, 1)

    y_test_onehot = onehot_encoder.transform(y_test_padded_ids).toarray()
    y_test_onehot_padded = pad_sequences(y_test_onehot, maxlen=max_len, padding='post')

    x_test_padded = pad_sequences(x_test, maxlen=max_len, padding='post')

    x_test_tensor = torch.tensor(x_test_padded).to(device)
    y_test_tensor = torch.tensor(y_test_onehot_padded).to(device)

    # Accuracy calculation for the test set
    
    # Get Audio and Text
    outputs = model(x_test_tensor.float())
    outputs = outputs.float()

    y_test_tensor = y_test_tensor.view(-1).long()

    if outputs.shape[0] != y_test_tensor.shape[0]:
        y_test_tensor = y_test_tensor[:outputs.shape[0]]

    outputs = outputs.argmax(1)

    correct = 0
    total = 0

    for i in range(len(outputs)):
        if outputs[i] == y_test_tensor[i]:
            correct += 1
        total += 1

    print("correct: ", correct)
    print("total: ", total)

    print("Accuracy: ", correct / total)

    print(f"Epoch: {epoch + 1}/{epoch_amount}, Loss: {loss.item()}")

outputs.shape:  torch.Size([50, 55])
correct:  39
total:  40
Accuracy:  0.975
Epoch: 1/100, Loss: 0.09763100743293762
outputs.shape:  torch.Size([50, 55])
correct:  39
total:  40
Accuracy:  0.975
Epoch: 2/100, Loss: 0.09763094037771225
outputs.shape:  torch.Size([50, 55])
correct:  39
total:  40
Accuracy:  0.975
Epoch: 3/100, Loss: 0.09763094037771225
outputs.shape:  torch.Size([50, 55])
correct:  39
total:  40
Accuracy:  0.975
Epoch: 4/100, Loss: 0.09763102233409882
outputs.shape:  torch.Size([50, 55])
correct:  39
total:  40
Accuracy:  0.975
Epoch: 5/100, Loss: 0.09763098508119583
outputs.shape:  torch.Size([50, 55])
correct:  39
total:  40
Accuracy:  0.975
Epoch: 6/100, Loss: 0.09763099998235703
outputs.shape:  torch.Size([50, 55])
correct:  39
total:  40
Accuracy:  0.975
Epoch: 7/100, Loss: 0.09763094782829285
outputs.shape:  torch.Size([50, 55])
correct:  39
total:  40
Accuracy:  0.975
Epoch: 8/100, Loss: 0.09763097018003464
outputs.shape:  torch.Size([50, 55])
correct:  39
total:

KeyboardInterrupt: 

In [12]:
torch.save(model.state_dict(), "model.pth")