In [142]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import torchaudio
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint

In [143]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "0"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # This can help diagnose memory issues

In [160]:
# Define a vocabulary for character-level tokenization
vocab = {'<pad>': 0, '<start>': 1, '<end>': 2}
for char in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ":
    vocab[char] = len(vocab)

# Hyperparameters
num_classes = len(vocab)  # Vocabulary size
batch_size = 2  # Reduced from 16 to 2
learning_rate = 3e-4
num_epochs = 100
max_seq_length = 300

In [193]:
class SpeechToTextModel(nn.Module):
    def __init__(self, num_classes):
        super(SpeechToTextModel, self).__init__()
        self.conv1 = nn.Conv2d(2, 32, kernel_size=(3, 3), padding=(1, 1))
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(1, 1))
        self.fc1 = nn.Linear(64 * (max_seq_length // 16) * (1 * (max_seq_length // 16)), 256)  # Adjusted input size for fc1
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.relu1(self.conv1(x))
        x = self.maxpool(x)
        x = self.relu2(self.conv2(x))
        x = self.maxpool(x)
        x = x.view(x.size(0), -1)
        print(x.size())
        x = self.fc1(x)
        x = self.fc2(x)
        return x

In [195]:
class CustomSTTDataset(Dataset):
    def __init__(self, csv_path, audio_dir, max_seq_length):
        self.data = pd.read_csv(csv_path)
        self.audio_dir = audio_dir
        self.max_seq_length = max_seq_length

    def __getitem__(self, idx):
        audio_path = os.path.join(self.audio_dir, self.data.iloc[idx]['Video Matching'])
        waveform, sample_rate = torchaudio.load(audio_path)
        waveform = waveform.mean(dim=0, keepdim=True)  # Convert to mono
        transcription = self.data.iloc[idx]['Text']

        # Tokenize the transcription to integers
        tokens = [vocab.get(char, vocab['<pad>']) for char in transcription]

        # Pad the sequence to the maximum length
        if waveform.size(1) < self.max_seq_length:
            padding = torch.zeros((waveform.size(0), self.max_seq_length - waveform.size(1)))
            waveform = torch.cat((waveform, padding), dim=1)

        return waveform, tokens

    def __len__(self):
        return len(self.data)

In [196]:
# Create the model and move it to the GPU (if available)
model = SpeechToTextModel(num_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

# Loss function and optimizer
criterion = nn.CTCLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Create the dataset with a specified maximum sequence length
dataset = CustomSTTDataset(csv_path="TEXT/AUDIO.csv", audio_dir="AUDIO", max_seq_length=max_seq_length)

def collate_fn(batch):
    # Find the maximum length of any waveform in this batch
    max_length = max(waveform.size(1) for waveform, _ in batch)

    # Pad all waveforms and targets to the same length
    waveforms = []
    targets = []

    for waveform, tokens in batch:
        waveform = waveform.mean(dim=0, keepdim=True)  # Convert to mono
        input_length = waveform.size(1)
        target_length = len(tokens)

        # Pad inputs to the maximum length (max_length)
        if input_length < max_length:
            padding = torch.zeros((waveform.size(0), max_length - input_length), device=waveform.device)
            waveform = torch.cat((waveform, padding), dim=1)

        # Pad targets to the maximum length (max_length)
        if target_length < max_length:
            tokens += [vocab['<pad>']] * (max_length - target_length)

        waveforms.append(waveform)
        targets.extend(tokens)  # Use extend to flatten the list of targets

    return torch.stack(waveforms), targets

# Use DataLoader with the modified dataset and the new collate_fn
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

NVIDIA GeForce RTX 3070 Ti


In [194]:
for epoch in range(num_epochs):
    model.train()
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)

        # Calculate input_lengths and target_lengths for the CTCLoss
        input_lengths = torch.LongTensor([inputs.size(2)] * inputs.size(0)).to(device)
        target_lengths = torch.LongTensor([len(targets)] * inputs.size(0)).to(device)  # Use len(targets)

        # Convert targets to tensor
        targets = torch.tensor(targets).to(device)

        # Pass the padded sequences through your model
        outputs = model(inputs)

        optimizer.zero_grad()
        loss = criterion(outputs, targets, input_lengths, target_lengths)
        loss.backward()
        optimizer.step()

        print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(dataloader)}], Loss: {loss.item():.4f}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x40897 and 87616x512)

In [12]:
# Save the trained model
torch.save(model.state_dict(), "stt_model.pth")