# Speech part

# In [None]:
# Supervised Fine-Tuning on RAVDESS (Speech-Only)

import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
from torch.utils.data import DataLoader, Dataset
import os
import librosa
import numpy as np

# Audio augmentation helper and Dataset class for RAVDESS
def augment_audio(audio, sr):
    """Apply SpecAugment-style time and frequency masking to a waveform.

    Time/frequency masking is defined on a spectrogram, not on raw samples.
    The waveform is therefore converted to a complex STFT, one random band of
    frames and one random band of frequency bins are zeroed, and the result
    is converted back to a waveform of the original length.

    Args:
        audio: Array-like of float samples; the last axis is time.
        sr: Sampling rate in Hz, used to size the 25 ms / 10 ms STFT window
            and hop.

    Returns:
        A float32 ``numpy.ndarray`` with the same shape as ``audio``.
        Signals too short for one STFT frame are returned unmodified.
    """
    time_mask_param = 80  # maximum number of consecutive frames to mask
    freq_mask_param = 30  # maximum number of consecutive frequency bins to mask

    n_fft = max(2, int(0.025 * sr))
    hop_length = max(1, int(0.010 * sr))

    waveform = torch.as_tensor(audio, dtype=torch.float32)
    num_samples = waveform.shape[-1] if waveform.dim() > 0 else 0

    # torch.stft reflect-pads n_fft // 2 samples on each side, which is only
    # possible when the signal is longer than the padding.
    if num_samples <= n_fft // 2:
        return waveform.clone().numpy()

    window = torch.hann_window(n_fft)
    spec = torch.stft(
        waveform,
        n_fft,
        hop_length=hop_length,
        window=window,
        return_complex=True,
    )
    n_freq, n_frames = spec.shape[-2], spec.shape[-1]

    # Cap every mask at half of its axis so that a short clip can never be
    # wiped out completely.
    max_time = min(time_mask_param, n_frames // 2)
    max_freq = min(freq_mask_param, n_freq // 2)

    time_width = int(torch.randint(0, max_time + 1, (1,)))
    time_start = int(torch.randint(0, n_frames - time_width + 1, (1,)))
    freq_width = int(torch.randint(0, max_freq + 1, (1,)))
    freq_start = int(torch.randint(0, n_freq - freq_width + 1, (1,)))

    spec[..., :, time_start:time_start + time_width] = 0
    spec[..., freq_start:freq_start + freq_width, :] = 0

    augmented = torch.istft(
        spec,
        n_fft,
        hop_length=hop_length,
        window=window,
        length=num_samples,
    )
    return augmented.numpy()

class RAVDESSDataset(Dataset):
    """Speech-only dataset over a directory of RAVDESS ``.wav`` files.

    The class label is taken from the file name: the third dash-separated
    field (the emotion code in the RAVDESS naming scheme) minus one, so that
    labels start at zero.

    Args:
        data_dir: Directory containing the ``.wav`` files.
        processor: Callable that turns a waveform into model inputs; it is
            invoked as ``processor(audio, return_tensors="pt",
            sampling_rate=16000)`` and must expose ``.input_values``.
        augment: When true (the default), every loaded clip is passed through
            ``augment_audio``. Set to false for validation or inference.

    Raises:
        ValueError: If a ``.wav`` file name does not carry an integer in its
            third dash-separated field.
    """

    def __init__(self, data_dir, processor, augment=True):
        # List the directory once and sort it: files and labels are derived
        # from the same names, and the index order is reproducible.
        wav_names = sorted(
            name for name in os.listdir(data_dir) if name.endswith('.wav')
        )
        self.files = [os.path.join(data_dir, name) for name in wav_names]
        self.labels = [self._label_from_name(name) for name in wav_names]
        self.processor = processor
        self.augment = augment

    @staticmethod
    def _label_from_name(name):
        """Return the zero-based label encoded in a RAVDESS file name."""
        try:
            return int(name.split('-')[2]) - 1
        except (IndexError, ValueError) as err:
            raise ValueError(
                f"Cannot parse an emotion label from file name {name!r}"
            ) from err

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        """Load clip ``idx`` at 16 kHz and return ``(input_values, label)``."""
        audio, sr = librosa.load(self.files[idx], sr=16000)
        if self.augment:
            audio = augment_audio(audio, sr)
        inputs = self.processor(audio, return_tensors="pt", sampling_rate=16000).input_values
        return inputs.squeeze(0), torch.tensor(self.labels[idx])

# Training Function
def _pad_speech_batch(samples):
    """Collate ``(waveform, label)`` pairs, zero-padding waveforms to one length."""
    from torch.nn.utils.rnn import pad_sequence

    waveforms, labels = zip(*samples)
    return pad_sequence(list(waveforms), batch_first=True), torch.stack(list(labels))


def train_speech_model():
    """Fine-tune ``facebook/wav2vec2-base`` for 8-class emotion classification.

    Reads ``.wav`` files from ``data/processed/RAVDESS``, trains for 10 epochs
    with AdamW and a cosine learning-rate schedule, prints the mean loss per
    epoch, and writes the model's ``state_dict`` to
    ``models/fine_tuned_speech_model.pth``.

    Raises:
        ValueError: If the data directory contains no ``.wav`` files.
    """
    # Only the audio feature extractor is needed for classification. Loading
    # it directly avoids any dependency on tokenizer files for the checkpoint.
    from transformers import Wav2Vec2FeatureExtractor

    output_path = "models/fine_tuned_speech_model.pth"
    num_epochs = 10

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base", num_labels=8).to(device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
    dataset = RAVDESSDataset("data/processed/RAVDESS", processor)
    if len(dataset) == 0:
        raise ValueError("No .wav files found in data/processed/RAVDESS")

    # Clips differ in length, so the default collate function cannot stack
    # them; pad every batch to its longest clip instead.
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=_pad_speech_batch)

    optimizer = optim.AdamW(model.parameters(), lr=3e-5)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    criterion = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch, labels in dataloader:
            batch, labels = batch.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        scheduler.step()
        print(f"Epoch {epoch+1}: Loss = {total_loss/len(dataloader)}")

    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    torch.save(model.state_dict(), output_path)
    print("Supervised Fine-Tuning Completed.")

# Run speech-only fine-tuning when this cell/script is executed directly.
if __name__ == "__main__":
    train_speech_model()


# Multi-modal part

# In [None]:
# Multimodal Emotion Recognition (Speech + Text)

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import Wav2Vec2Model, BertModel, Wav2Vec2Processor, BertTokenizer
from torch.utils.data import DataLoader, Dataset
import os
import librosa
import pandas as pd
import numpy as np

# Dataset Class for Multimodal Training
class MultimodalDataset(Dataset):
    """Paired audio/text dataset for multimodal emotion recognition.

    Audio clip ``i`` (in sorted file-name order) is paired with row ``i`` of
    the transcript CSV, which must provide ``text`` and ``label`` columns.
    NOTE(review): the pairing is purely positional -- confirm that the CSV
    rows are stored in sorted audio file-name order.

    Args:
        audio_dir: Directory containing the ``.wav`` files.
        text_path: Path of the CSV file with ``text`` and ``label`` columns.
        processor: Callable that turns a waveform into model inputs; it is
            invoked as ``processor(audio, return_tensors="pt",
            sampling_rate=16000)`` and must expose ``.input_values``.
        tokenizer: Callable that tokenizes one transcript and returns a
            mapping with ``input_ids`` and ``attention_mask`` tensors.
    """

    def __init__(self, audio_dir, text_path, processor, tokenizer):
        import warnings

        # os.listdir returns entries in arbitrary order; sort so the
        # positional pairing with the CSV rows is at least deterministic.
        self.audio_files = sorted(
            os.path.join(audio_dir, f) for f in os.listdir(audio_dir) if f.endswith('.wav')
        )
        self.transcripts = pd.read_csv(text_path)
        self.labels = self.transcripts['label'].values
        self.processor = processor
        self.tokenizer = tokenizer

        if len(self.audio_files) != len(self.transcripts):
            warnings.warn(
                f"Found {len(self.audio_files)} audio files but "
                f"{len(self.transcripts)} transcript rows; only the first "
                f"{len(self)} pairs will be used."
            )

    def __len__(self):
        # Never index past the shorter of the two sources.
        return min(len(self.audio_files), len(self.transcripts))

    def __getitem__(self, idx):
        """Return ``(input_values, input_ids, attention_mask, label)`` for ``idx``."""
        audio, sr = librosa.load(self.audio_files[idx], sr=16000)
        audio_inputs = self.processor(audio, return_tensors="pt", sampling_rate=16000).input_values
        # padding=True on a single sentence pads nothing; pad to a fixed
        # length so the text tensors of different samples can be stacked.
        text_inputs = self.tokenizer(
            self.transcripts.iloc[idx]['text'],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=128,
        )
        return (
            audio_inputs.squeeze(0),
            text_inputs['input_ids'].squeeze(0),
            text_inputs['attention_mask'].squeeze(0),
            torch.tensor(self.labels[idx]),
        )

# Multimodal Model (Wav2Vec2 + BERT + Cross-Modal Attention)
class MultimodalEmotionModel(nn.Module):
    """Wav2Vec2 + BERT emotion classifier fused with cross-modal attention.

    Audio frames act as attention queries over the BERT token states (keys
    and values). The attended text context is added to the audio frames,
    averaged over time, and projected to 8 emotion logits.
    """

    def __init__(self):
        super().__init__()
        self.audio_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
        self.text_model = BertModel.from_pretrained("bert-base-uncased")
        # batch_first=True: the encoders return (batch, seq, dim). With the
        # default layout the batch axis would be read as the sequence axis
        # and attention would mix different samples of the same batch.
        self.cross_attention = nn.MultiheadAttention(embed_dim=768, num_heads=8, batch_first=True)
        self.fc = nn.Linear(768, 8)

    def forward(self, audio_input, text_input, text_mask):
        """Compute emotion logits for a batch of audio/text pairs.

        Args:
            audio_input: Batch of waveform inputs passed to ``audio_model``.
            text_input: Batch of token ids passed to ``text_model``.
            text_mask: Attention mask for ``text_input`` (non-zero marks a
                real token), or ``None`` if no token is padding.

        Returns:
            Tensor of shape ``(batch, 8)`` holding unnormalized class scores.
        """
        audio_states = self.audio_model(audio_input).last_hidden_state
        text_states = self.text_model(text_input, attention_mask=text_mask).last_hidden_state

        # key_padding_mask uses the opposite convention of the attention
        # mask: True marks positions that must be ignored.
        ignore_tokens = None if text_mask is None else text_mask == 0

        # Attend over the full token sequence. With a single key/value
        # vector the softmax is always 1 and the query (audio) would have no
        # influence on the result.
        text_context, _ = self.cross_attention(
            audio_states,
            text_states,
            text_states,
            key_padding_mask=ignore_tokens,
            need_weights=False,
        )

        # Residual connection keeps the acoustic evidence next to the
        # attended text context; mean-pool the frames into one vector.
        fused = (audio_states + text_context).mean(dim=1)
        logits = self.fc(fused)
        return logits

# Training Function
def _pad_multimodal_batch(samples, text_pad_id=0):
    """Collate ``(audio, input_ids, attention_mask, label)`` tuples with padding.

    Waveforms and attention masks are padded with zeros; token ids are padded
    with ``text_pad_id``. Every tensor is padded to the longest item of the
    batch.
    """
    from torch.nn.utils.rnn import pad_sequence

    audio, input_ids, attention_mask, labels = zip(*samples)
    return (
        pad_sequence(list(audio), batch_first=True),
        pad_sequence(list(input_ids), batch_first=True, padding_value=text_pad_id),
        pad_sequence(list(attention_mask), batch_first=True),
        torch.stack(list(labels)),
    )


def train_multimodal_model():
    """Train the speech + text emotion model and save its weights.

    Reads audio from ``data/processed/audio`` and transcripts/labels from
    ``data/processed/text.csv``, trains for 10 epochs with AdamW and a cosine
    learning-rate schedule, prints the mean loss per epoch, and writes the
    model's ``state_dict`` to ``models/multimodal_emotion_model.pth``.

    Raises:
        ValueError: If the dataset contains no samples.
    """
    from functools import partial

    # Only the audio feature extractor is needed here. Loading it directly
    # avoids any dependency on tokenizer files for the Wav2Vec2 checkpoint.
    from transformers import Wav2Vec2FeatureExtractor

    output_path = "models/multimodal_emotion_model.pth"
    num_epochs = 10

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultimodalEmotionModel().to(device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    dataset = MultimodalDataset("data/processed/audio", "data/processed/text.csv", processor, tokenizer)
    if len(dataset) == 0:
        raise ValueError("No training samples found in data/processed/audio")

    # Clips (and possibly token sequences) differ in length, so the default
    # collate function cannot stack them; pad each batch instead.
    collate = partial(_pad_multimodal_batch, text_pad_id=tokenizer.pad_token_id or 0)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate)

    optimizer = optim.AdamW(model.parameters(), lr=3e-5)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    criterion = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for audio_batch, text_batch, text_mask, labels in dataloader:
            audio_batch, text_batch, text_mask, labels = audio_batch.to(device), text_batch.to(device), text_mask.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(audio_batch, text_batch, text_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        scheduler.step()
        print(f"Epoch {epoch+1}: Loss = {total_loss/len(dataloader)}")

    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    torch.save(model.state_dict(), output_path)
    print("Multimodal Training Completed.")

# Run multimodal (speech + text) training when this cell/script is executed directly.
if __name__ == "__main__":
    train_multimodal_model()
