# 🎯 **Step 0: Import library**
---

In [1]:
# !pip -q install torch_xla[tpu] -f https://storage.googleapis.com/tpu-pytorch/wheels/colab.html
# !pip -q install optuna
# !pip -q install gradio
!pip -q install wandb
!pip -q install datasets
!pip -q install evaluate
!pip -q install huggingface_hub
!pip -q install transformers[torch]

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import librosa
import evaluate
import wandb
# import gradio as gr
from datasets import load_from_disk

from datasets import Dataset
from IPython.display import Audio
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer, BitsAndBytesConfig, EarlyStoppingCallback

# 🎯 **Step 1: Authentication to Huggingface**
---

In [3]:
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

In [4]:
from huggingface_hub import login
login(hf_token)

In [5]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 🎯 **Step 2: Authentication to WanDB**
---

In [None]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# 🎯 **Step 3: Mount Google Drive**
---

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 🎯 **Step 4: Load and Prepare Dataset**
---

In [None]:
path =  '/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/df_combined.csv'

df_voice = pd.read_csv(path)

df_voice.head()

Unnamed: 0,Path,Emotion
0,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral
1,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral
2,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral
3,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral
4,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,calm


In [None]:
df_voice['Emotion'].value_counts()

Unnamed: 0_level_0,count
Emotion,Unnamed: 1_level_1
sad,752
happy,752
angry,752
neutral,716
disgust,652
fearful,652
surprised,652
calm,192


In [None]:
df_voice = df_voice[df_voice['Emotion'] != 'calm']

In [None]:
tags = np.unique(df_voice['Emotion']) # get unique category
num_tags = len(tags) # get the number of category, here we have 2 tags/categories
label2id = {t: i for i, t in enumerate(tags)} # make a dictionary to map label to id
id2label = {i: t for i, t in enumerate(tags)} # make a dictionary to map id to label

In [None]:
label2id, id2label

({'angry': 0,
  'disgust': 1,
  'fearful': 2,
  'happy': 3,
  'neutral': 4,
  'sad': 5,
  'surprised': 6},
 {0: 'angry',
  1: 'disgust',
  2: 'fearful',
  3: 'happy',
  4: 'neutral',
  5: 'sad',
  6: 'surprised'})

# 🎯 **Step 5: Split Dataset into Train and Test**
---

In [None]:
train_data = df_voice.sample(frac=0.8, random_state=42)

train_data.head()

Unnamed: 0,Path,Emotion
1857,/content/drive/MyDrive/Audio/Speech_TESS/OAF_h...,neutral
3784,/content/drive/MyDrive/Audio/Speech_TESS/YAF_r...,sad
2918,/content/drive/MyDrive/Audio/Speech_TESS/YAF_b...,disgust
2572,/content/drive/MyDrive/Audio/Speech_TESS/OAF_s...,surprised
780,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral


In [None]:
test_data = df_voice.drop(train_data.index)

test_data.head()

Unnamed: 0,Path,Emotion
3,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral
12,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,happy
13,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,happy
17,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,happy
18,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,happy


In [None]:
X_train = Dataset.from_pandas(train_data)
X_test = Dataset.from_pandas(test_data)

# 🎯 **Step 6: Read Audio and Convert Dataset**
---

In [None]:
import numpy as np
import librosa
from datasets import Dataset

def read_audio(path):
    array, sampling_rate = librosa.load(path, sr=None)
    return array, sampling_rate

def convert_example(example):
    audio_path = example['Path']
    array, sampling_rate = read_audio(audio_path)
    return {
        'audio': {
            'path': audio_path,
            'array': array,
            'sampling_rate': sampling_rate
        },
        'labels': label2id[example['Emotion']]
    }

def convert_dataset(dataset):
    converted_examples = []
    for example in dataset:
        converted_example = convert_example(example)
        converted_examples.append(converted_example)
    return Dataset.from_dict(converted_examples)

converted_train = X_train.map(convert_example)
converted_test = X_test.map(convert_example)

Map:   0%|          | 0/3942 [00:00<?, ? examples/s]

Map:   0%|          | 0/986 [00:00<?, ? examples/s]

In [None]:
converted_train, converted_test

(Dataset({
     features: ['Path', 'Emotion', '__index_level_0__', 'audio', 'labels'],
     num_rows: 3942
 }),
 Dataset({
     features: ['Path', 'Emotion', '__index_level_0__', 'audio', 'labels'],
     num_rows: 986
 }))

In [None]:
converted_train = converted_train.remove_columns(["Path", "Emotion", "__index_level_0__"])
converted_test = converted_test.remove_columns(["Path", "Emotion", "__index_level_0__"])

In [None]:
converted_train, converted_test

(Dataset({
     features: ['audio', 'labels'],
     num_rows: 3942
 }),
 Dataset({
     features: ['audio', 'labels'],
     num_rows: 986
 }))

In [None]:
converted_train.save_to_disk('/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/converted_train')
converted_test.save_to_disk('/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/converted_test')

Saving the dataset (0/4 shards):   0%|          | 0/3942 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/986 [00:00<?, ? examples/s]

In [None]:
converted_train = load_from_disk('/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/converted_train')
converted_test = load_from_disk('/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/converted_test')

# 🎯 **Step 7: Load Pretrained Wav2Vec2 Model**
---

In [None]:
# model_id = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
model_id = "facebook/wav2vec2-large-xlsr-53"

feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

In [None]:
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [None]:
sample = converted_train[0]["audio"]

print(f"Mean: {np.mean(sample['array']):.3}, Variance: {np.var(sample['array']):.3}")

Mean: -3.36e-05, Variance: 0.000273


# 🎯 **Step 8: Preprocess the Audio Data**
---

In [None]:
max_duration = 30.0

def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )
    return inputs

In [None]:
data_encoded_train = converted_train.map(
    preprocess_function,
    remove_columns=["audio"],
    batched=True,
    batch_size=100,
    num_proc=1,
)

data_encoded_train

Dataset({
    features: ['labels', 'input_values', 'attention_mask'],
    num_rows: 3942
})

In [None]:
data_encoded_test = converted_test.map(
    preprocess_function,
    remove_columns=["audio"],
    batched=True,
    batch_size=100,
    num_proc=1,
)

data_encoded_test

Dataset({
    features: ['labels', 'input_values', 'attention_mask'],
    num_rows: 986
})

In [None]:
num_labels = len(id2label)

# 🎯 **Step 9: Hyperparameter Tuning with Optuna (optional)**
---

In [None]:
# def model_init():
#     return AutoModelForAudioClassification.from_pretrained(
#         model_id,
#         num_labels=num_labels,
#         label2id=label2id,
#         id2label=id2label,
#     )

In [None]:
# def hp_space_optuna(trial):
#     return {
#         "learning_rate": trial.suggest_categorical("learning_rate", [1e-2, 1e-3, 1e-4, 5e-2, 5e-3, 5e-4]),
#         "gradient_accumulation_steps": trial.suggest_int("gradient_accumulation_steps", 1, 5),
#         "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [2, 4, 8, 16]),
#         "per_device_eval_batch_size": trial.suggest_categorical("per_device_train_batch_size", [2, 4, 8, 16]),
#         "warmup_ratio": trial.suggest_float("warmup_ratio", 0.0, 0.3),
#         "weight_decay": trial.suggest_float("weight_decay", 0.00, 0.3),
#         "logging_steps": trial.suggest_int("logging_steps", 5, 50, step=5),
#         "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine", "constant", "constant_with_warmup"])  # Tune lr scheduler type
#     }

In [None]:
# training_args = TrainingArguments(
#     # f"speech-emotion-recognition-with-wav2vec2-xlsr",
#     output_dir="/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/TUNING",
#     num_train_epochs=3,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     metric_for_best_model="accuracy",
#     fp16=True,
#     push_to_hub=False
# )

In [None]:
# def compute_metrics(eval_pred):
#     predictions = np.argmax(eval_pred.predictions, axis=1)
#     labels = eval_pred.label_ids

#     accuracy = accuracy_score(labels, predictions)

#     precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

#     return {
#         'accuracy': accuracy,
#         'precision': precision,
#         'recall': recall,
#         'f1': f1
#     }

In [None]:
# trainer = Trainer(
#     model_init=model_init,
#     args=training_args,
#     train_dataset=data_encoded_train,
#     eval_dataset=data_encoded_test,
#     tokenizer=feature_extractor,
#     compute_metrics=compute_metrics,
# )

In [None]:
# best_run = trainer.hyperparameter_search(
#     direction="maximize",
#     hp_space=hp_space_optuna,
#     n_trials=10,
#     backend="optuna",
#     compute_objective=lambda metrics: metrics["accuracy"],
#     study_name="speech-emotion-recognition-with-wav2vec2-xlsr",
# )

In [None]:
# print(f"Best hyperparameters: {best_run.hyperparameters}")

In [None]:
# best_hyperparameters = best_run.hyperparameters

In [None]:
# training_args = TrainingArguments(
#     f"speech-emotion-recognition-with-wav2vec2-xlsr",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=best_hyperparameters['learning_rate'],
#     gradient_accumulation_steps=best_hyperparameters['gradient_accumulation_steps'],
#     per_device_train_batch_size=best_hyperparameters['per_device_train_batch_size'],
#     num_train_epochs=best_hyperparameters['num_train_epochs'],
#     warmup_ratio=best_hyperparameters['warmup_ratio'],
#     logging_steps=best_hyperparameters['logging_steps'],
#     lr_scheduler_type=best_hyperparameters['lr_scheduler_type'],
#     load_best_model_at_end=True,
#     metric_for_best_model="accuracy",
#     evaluation_strategy='epoch',
#     save_strategy='epoch',
#     load_best_model_at_end=True,
#     push_to_hub=True,
#     fp16=True
# )

In [None]:
# trainer = Trainer(
#     model_init=model_init,
#     args=training_args,
#     train_dataset=data_encoded_train,
#     eval_dataset=data_encoded_test,
#     tokenizer=feature_extractor,
#     compute_metrics=compute_metrics,
# )

In [None]:
# trainer.train()

In [None]:
# evaluation_results = trainer.evaluate()
# print(evaluation_results)

# 🎯 **Step 10: Initialize Model for Audio Classification**
---

In [None]:
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
wandb.init(project="speech-emotion-recognition")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mfirdhoworking[0m ([33mfirdhoworking-sepuluh-nopember-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


# 🎯 **Step 11: Set Up Training Arguments**
---

In [None]:
batch_size = 2  # Updated batch size
gradient_accumulation_steps = 5  # Updated gradient accumulation steps
num_train_epochs = 25  # Updated number of epochs

training_args = TrainingArguments(
    f"speech-emotion-recognition-with-facebook-wav2vec2-large-xlsr-53",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    lr_scheduler_type="linear",
    report_to="wandb",
    push_to_hub=True
)



# 🎯 **Step 12: Set Up Optimizer and LR Scheduler**
---

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1e-08)
lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=len(train_data) * training_args.num_train_epochs)

# 🎯 **Step 13: Define Metrics for Evaluation**
---

In [None]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    labels = eval_pred.label_ids

    accuracy = accuracy_score(y_true=labels, y_pred=predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# 🎯 **Step 14: Initialize Trainer and Start Training**
---

In [None]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=data_encoded_train,
    eval_dataset=data_encoded_test,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,1.9343,1.927676,0.250507,0.142516,0.250507,0.169117
1,1.7944,1.644614,0.457404,0.57586,0.457404,0.421299
2,1.4601,1.324239,0.595335,0.618264,0.595335,0.57094
3,1.0551,1.076364,0.662272,0.665928,0.662272,0.644733
5,1.1156,0.829178,0.74645,0.763506,0.74645,0.744191
6,0.6307,0.643945,0.80426,0.809038,0.80426,0.801971
7,0.774,0.666576,0.792089,0.811697,0.792089,0.791564
8,0.5537,0.511074,0.824544,0.826824,0.824544,0.820529
10,0.716,0.549908,0.827586,0.846485,0.827586,0.826811
11,0.5372,0.546272,0.837728,0.860602,0.837728,0.840372


  _warn_prf(average, modifier, msg_start, len(result))
No files have been modified since last commit. Skipping to prevent empty commit.


TrainOutput(global_step=9850, training_loss=0.6424398968945527, metrics={'train_runtime': 19686.4884, 'train_samples_per_second': 5.006, 'train_steps_per_second': 0.5, 'total_flos': 2.4731500135938376e+19, 'train_loss': 0.6424398968945527, 'epoch': 24.987316083206494})

# 🎯 **Step 14: Evaluate the Model and Push to Huggingface**
---

In [None]:
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.49894458055496216, 'eval_accuracy': 0.9168356997971603, 'eval_precision': 0.9209485590714481, 'eval_recall': 0.9168356997971603, 'eval_f1': 0.9165686317462675, 'eval_runtime': 105.0503, 'eval_samples_per_second': 9.386, 'eval_steps_per_second': 4.693, 'epoch': 24.987316083206494}


In [None]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/firdhokk/speech-emotion-recognition-with-facebook-wav2vec2-large-xlsr-53/commit/b6d28839fc0361c8f0a4e793ffb733f821e61674', commit_message='End of training', commit_description='', oid='b6d28839fc0361c8f0a4e793ffb733f821e61674', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
wandb.finish()

VBox(children=(Label(value='0.024 MB of 0.024 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▃▅▅▆▆▇▇▇▇▇▇▇██▇██████████
eval/f1,▁▃▅▅▆▆▇▇▇▇▇▇▇█████████████
eval/loss,█▇▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▂▁▁▂▂▂
eval/precision,▁▅▅▆▆▇▇▇▇▇▇▇██████████████
eval/recall,▁▃▅▅▆▆▇▇▇▇▇▇▇██▇██████████
eval/runtime,▇▁▂▂▃▄▄▄▄▆▅▆█▅▅▅▆▄▅▄▅▅▅▅█▆
eval/samples_per_second,▁█▇▇▆▅▅▅▅▃▄▃▁▄▄▃▃▅▄▅▄▄▄▄▁▃
eval/steps_per_second,▁█▇▇▆▅▅▅▅▃▄▃▁▄▄▃▃▅▄▅▄▄▄▄▁▃
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
train/global_step,▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██

0,1
eval/accuracy,0.91684
eval/f1,0.91657
eval/loss,0.49894
eval/precision,0.92095
eval/recall,0.91684
eval/runtime,105.0503
eval/samples_per_second,9.386
eval/steps_per_second,4.693
total_flos,2.4731500135938376e+19
train/epoch,24.98732


# 🎯 **Step 15: Inference the Model**
---

In [37]:
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import librosa
import torch
import numpy as np

model_id = "firdhokk/speech-emotion-recognition-with-facebook-wav2vec2-large-xlsr-53"
model = AutoModelForAudioClassification.from_pretrained(model_id)

feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True, return_attention_mask=True)

In [38]:
id2label = model.config.id2label
id2label

{0: 'angry',
 1: 'disgust',
 2: 'fearful',
 3: 'happy',
 4: 'neutral',
 5: 'sad',
 6: 'surprised'}

In [39]:
def preprocess_audio(audio_path, feature_extractor, max_duration=30.0):
    audio_array, sampling_rate = librosa.load(audio_path, sr=feature_extractor.sampling_rate)

    max_length = int(feature_extractor.sampling_rate * max_duration)
    if len(audio_array) > max_length:
        audio_array = audio_array[:max_length]
    else:
        audio_array = np.pad(audio_array, (0, max_length - len(audio_array)))

    inputs = feature_extractor(
        audio_array,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=max_length,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    return inputs

In [49]:
def predict_emotion(audio_path, model, feature_extractor, id2label, max_duration=30.0):
    inputs = preprocess_audio(audio_path, feature_extractor, max_duration)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_id = torch.argmax(logits, dim=-1).item()
    predicted_label = id2label[predicted_id]

    return predicted_label

In [50]:
audio_path = "/content/drive/MyDrive/Audio/Speech_URDU/Happy/SM5_F4_H058.wav"

predicted_emotion = predict_emotion(audio_path, model, feature_extractor, id2label)
print(f"Predicted Emotion: {predicted_emotion}")

Predicted Emotion: happy
