In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from torchaudio.datasets import SPEECHCOMMANDS
import os

# 1. Data Preprocessing
class SpeechDataset(Dataset):
    def __init__(self, subset):
        self.dataset = SPEECHCOMMANDS(root=".", download=True, subset=subset)

        self.audio_transforms = torchaudio.transforms.MFCC(
            sample_rate=16000,
            n_mfcc=40
        )

        self.labels = sorted(list(set(datapoint[2] for datapoint in self.dataset)))
        self.label_to_index = {label: index for index, label in enumerate(self.labels)}

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        waveform, sample_rate, label, _, _ = self.dataset[idx]

        # Extract MFCC features
        mfcc = self.audio_transforms(waveform)

        # Get label index
        label_index = self.label_to_index[label]

        return mfcc, label_index

# 2. Model Architecture
class SpeechRecognitionModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SpeechRecognitionModel, self).__init__()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True
        )

        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        # x shape: (batch, time, features)
        lstm_out, _ = self.lstm(x)
        output = self.fc(lstm_out[:, -1, :])  # Take the last time step
        return output

# 3. Training Function
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)

# 4. Evaluation Function
def evaluate(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)

            total_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    return total_loss / len(test_loader), correct / len(test_loader.dataset)

# 5. Main Training Loop
def main():
    # Hyperparameters
    input_dim = 40  # MFCC features
    hidden_dim = 256
    learning_rate = 0.001
    num_epochs = 10
    batch_size = 32

    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Create dataset and dataloaders
    try:
        train_dataset = SpeechDataset("training")
        test_dataset = SpeechDataset("testing")

        num_classes = len(train_dataset.labels)
        output_dim = num_classes

        print(f"Number of classes: {num_classes}")
        print(f"Number of training samples: {len(train_dataset)}")
        print(f"Number of test samples: {len(test_dataset)}")

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        return

    # Initialize model
    model = SpeechRecognitionModel(input_dim, hidden_dim, output_dim).to(device)
    print(model)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training history
    train_losses = []
    test_losses = []
    accuracies = []

    # Training loop
    try:
        for epoch in range(num_epochs):
            train_loss = train(model, train_loader, criterion, optimizer, device)
            test_loss, accuracy = evaluate(model, test_loader, criterion, device)

            train_losses.append(train_loss)
            test_losses.append(test_loss)
            accuracies.append(accuracy)

            print(f'Epoch {epoch+1}/{num_epochs}:')
            print(f'Train Loss: {train_loss:.4f}')
            print(f'Test Loss: {test_loss:.4f}')
            print(f'Accuracy: {accuracy:.4f}')

    except Exception as e:
        print(f"Error during training: {str(e)}")
        return

    # Plotting results
    try:
        plt.figure(figsize=(12, 4))

        plt.subplot(1, 2, 1)
        plt.plot(train_losses, label='Train Loss')
        plt.plot(test_losses, label='Test Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(accuracies)
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')

        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Error plotting results: {str(e)}")

if __name__ == '__main__':
    main()

Using device: cpu


100%|██████████| 2.26G/2.26G [00:28<00:00, 86.4MB/s]


Number of classes: 35
Number of training samples: 84843
Number of test samples: 11005
SpeechRecognitionModel(
  (lstm): LSTM(40, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=35, bias=True)
)
Error during training: LSTM: Expected input to be 2D or 3D, got 4D instead
