In [None]:
from datasets import load_dataset

# Load the "narad/ravdess" dataset through the Hugging Face `datasets` loader.
# The result is indexed by split name further down (only 'train' is used).
dataset = load_dataset("narad/ravdess")


In [None]:
import pandas as pd
import numpy as np

# Convert the 'train' split to a pandas DataFrame and expose the on-disk
# location of every clip as a plain string column.
df = dataset['train'].to_pandas()
df['file_path'] = df['audio'].apply(lambda x: x['path'])

# Drop the columns that are not read anywhere else in this notebook. Assigning
# the result (instead of inplace=True) keeps the statement side-effect free.
df = df.drop(columns=["speaker_id", "speaker_gender", "audio", "text"])

# index=False: without it the positional index is written as an extra unnamed
# column, and the read_csv() below brings it back as a spurious "Unnamed: 0".
df.to_csv("ravdess_train.csv", index=False)

# Re-read the cached CSV so later cells work from exactly what is on disk.
df = pd.read_csv("ravdess_train.csv")


In [None]:
import torch, torchaudio

def load_audio(file_path, target_sample_rate=16000):
    """Load an audio file as a 1-D mono NumPy array at a known sample rate.

    Args:
        file_path: Path of the audio file, passed straight to ``torchaudio.load``.
        target_sample_rate: Sample rate (Hz) of the returned signal. The default
            matches the ``sampling_rate=16000`` that the dataset class passes to
            the feature extractor. Pass ``None`` to keep the file's native rate.

    Returns:
        numpy.ndarray: 1-D array of samples.
    """
    waveform, sample_rate = torchaudio.load(file_path)

    # torchaudio returns (channels, frames). Average the channels so that a
    # multi-channel file is not later flattened into channels laid end to end.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Previously the file's sample rate was discarded, so audio recorded at any
    # other rate was handed on as if it were already at the expected rate.
    if target_sample_rate is not None and sample_rate != target_sample_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_sample_rate)

    return waveform.reshape(-1).numpy()

In [None]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

# Checkpoint identifier shared by the feature extractor and the classifier so
# that both are loaded from the same source.
model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"

# Pre-processing (raw waveform -> model inputs) and the sequence-classification
# model itself; the two loads are independent of each other.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

In [None]:
# Module-level accumulator that a later cell reduces with np.array(lengths).max().
# NOTE(review): nothing in the visible cells appends to this list, so it stays
# empty unless it is filled from code outside this notebook.
lengths = []

class RAVDESSDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns DataFrame rows into Wav2Vec2 model inputs.

    Each row must provide a ``file_path`` column (audio file to load) and a
    ``labels`` column (class id). Every item is padded or truncated to exactly
    ``max_length`` samples so items can be stacked into a batch directly.

    Args:
        df: DataFrame with ``file_path`` and ``labels`` columns.
        feature_extractor: Callable feature extractor (e.g. a
            ``Wav2Vec2FeatureExtractor``) that accepts ``padding``,
            ``max_length``, ``truncation`` and ``return_attention_mask``.
        max_length: Fixed number of audio samples per item.
        sampling_rate: Sample rate (Hz) reported to the feature extractor.
    """

    def __init__(self, df, feature_extractor, max_length=246000, sampling_rate=16000):
        self.df = df
        self.feature_extractor = feature_extractor
        self.max_length = max_length
        self.sampling_rate = sampling_rate

    def __len__(self):
        """Return the number of rows (audio clips) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the clip at positional index ``idx`` and build the model inputs.

        Returns:
            dict with ``input_values`` and ``attention_mask`` (1-D tensors of
            length ``max_length``) and ``label`` (scalar tensor).
        """
        # Column-wise positional lookup keeps each value's own dtype instead of
        # materialising a mixed-dtype row.
        file_path = self.df['file_path'].iloc[idx]
        emotion = self.df['labels'].iloc[idx]

        # Guarantee a 1-D signal regardless of what load_audio returned.
        audio = np.asarray(load_audio(file_path)).reshape(-1)

        # Let the feature extractor pad/truncate. Zero-padding by hand before
        # this call made the extractor see the padding as real signal, so the
        # attention mask it returned covered the padded zeros as well.
        inputs = self.feature_extractor(
            audio,
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_attention_mask=True,
        )
        return {
            # squeeze(0) removes only the batch axis added by return_tensors="pt".
            'input_values': inputs.input_values.squeeze(0),
            'attention_mask': inputs.attention_mask.squeeze(0),
            'label': torch.tensor(emotion)
        }

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Wrap both partitions in the same dataset class, sharing one feature extractor.
train_dataset, val_dataset = (
    RAVDESSDataset(partition, feature_extractor) for partition in (train_df, val_df)
)

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation schedule.
    learning_rate=1e-4,
    num_train_epochs=3,
    # Batching: 4 items per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # Logging, evaluation and checkpointing all share a 10-step interval.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=10,
    save_steps=10,
    save_total_limit=2,
    # Model selection is driven by the "accuracy" value from compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)


In [None]:
from transformers import Trainer
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Score a batch of predictions for the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (reference class ids) and
            ``predictions`` (per-class scores along the last axis).

    Returns:
        dict: ``{"accuracy": <fraction of items whose top-scoring class
        equals the reference label>}``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the largest score on the last axis.
    y_pred = pred.predictions.argmax(axis=-1)
    return {"accuracy": accuracy_score(y_true, y_pred)}

# Assemble the Trainer: the model and its run configuration first, then the
# two datasets, then the metric callback used during evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)


In [None]:
# Start fine-tuning with the Trainer built in the previous cell.
trainer.train()

# Disabled debugging snippet (kept for reference): it walks the training set
# once through a DataLoader and discards every item.
# from torch.utils.data import DataLoader

# traindataloader = DataLoader(train_dataset)

# for data in traindataloader:
#     pass


In [None]:
# Evaluate on the validation set and report the accuracy from compute_metrics
# (the Trainer exposes it under the "eval_" prefix).
eval_results = trainer.evaluate()
print(f"Validation Accuracy: {eval_results['eval_accuracy']:.4f}")

# Persist the fine-tuned weights so a later cell can reload them from disk.
trainer.save_model("./ravdess_emotion_recognition_model")

# Longest recorded audio length, if any were recorded. Nothing in the visible
# cells appends to `lengths`, and np.array([]).max() raises ValueError, so the
# reduction is skipped when the list is empty instead of crashing the cell.
np.array(lengths).max() if lengths else None


In [None]:
# Rebind both names for inference. The feature extractor still comes from the
# original checkpoint named by `model_name`; the classifier weights come from
# the local directory written by trainer.save_model(). The two loads are
# independent of each other.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained("ravdess_emotion_recognition_model/")