# 🎯 **Step 0: Import library**
---

In [1]:
# !pip -q install torch_xla[tpu] -f https://storage.googleapis.com/tpu-pytorch/wheels/colab.html
# !pip -q install optuna
# !pip -q install gradio
!pip -q install wandb
!pip -q install datasets
!pip -q install evaluate
!pip -q install huggingface_hub
!pip -q install transformers[torch]

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import librosa
import evaluate
import wandb
# import gradio as gr
from datasets import load_from_disk

from datasets import Dataset
from IPython.display import Audio
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer, BitsAndBytesConfig, EarlyStoppingCallback

# 🎯 **Step 1: Authentication to Huggingface**
---

In [4]:
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

In [5]:
from huggingface_hub import login
login(hf_token)

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 🎯 **Step 2: Authentication to WanDB**
---

In [None]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mfirdhoworking[0m ([33mfirdhoworking-sepuluh-nopember-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


# 🎯 **Step 3: Mount Google Drive**
---

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 🎯 **Step 4: Load and Prepare Dataset**
---

In [None]:
path =  '/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/df_combined.csv'

df_voice = pd.read_csv(path)

df_voice.head()

Unnamed: 0,Path,Emotion
0,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral
1,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral
2,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral
3,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral
4,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,calm


In [None]:
df_voice['Emotion'].value_counts()

Unnamed: 0_level_0,count
Emotion,Unnamed: 1_level_1
sad,752
happy,752
angry,752
neutral,716
disgust,652
fearful,652
surprised,652
calm,192


In [None]:
df_voice = df_voice[df_voice['Emotion'] != 'calm']

In [None]:
tags = np.unique(df_voice['Emotion']) # get unique category
num_tags = len(tags) # get the number of category, here we have 2 tags/categories
label2id = {t: i for i, t in enumerate(tags)} # make a dictionary to map label to id
id2label = {i: t for i, t in enumerate(tags)} # make a dictionary to map id to label

In [None]:
label2id, id2label

({'angry': 0,
  'disgust': 1,
  'fearful': 2,
  'happy': 3,
  'neutral': 4,
  'sad': 5,
  'surprised': 6},
 {0: 'angry',
  1: 'disgust',
  2: 'fearful',
  3: 'happy',
  4: 'neutral',
  5: 'sad',
  6: 'surprised'})

# 🎯 **Step 5: Split Dataset into Train and Test**
---

In [None]:
train_data = df_voice.sample(frac=0.8, random_state=42)

train_data.head()

Unnamed: 0,Path,Emotion
1857,/content/drive/MyDrive/Audio/Speech_TESS/OAF_h...,neutral
3784,/content/drive/MyDrive/Audio/Speech_TESS/YAF_r...,sad
2918,/content/drive/MyDrive/Audio/Speech_TESS/YAF_b...,disgust
2572,/content/drive/MyDrive/Audio/Speech_TESS/OAF_s...,surprised
780,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral


In [None]:
test_data = df_voice.drop(train_data.index)

test_data.head()

Unnamed: 0,Path,Emotion
3,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,neutral
12,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,happy
13,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,happy
17,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,happy
18,/content/drive/MyDrive/Audio/Speech_RAVDESS/Ac...,happy


In [None]:
X_train = Dataset.from_pandas(train_data)
X_test = Dataset.from_pandas(test_data)

# 🎯 **Step 6: Read Audio and Convert Dataset**
---

In [None]:
import numpy as np
import librosa
from datasets import Dataset

# Fungsi untuk membaca audio dan mengubahnya menjadi array menggunakan librosa
def read_audio(path):
    # Membaca audio menggunakan librosa
    array, sampling_rate = librosa.load(path, sr=None)
    return array, sampling_rate

# Fungsi untuk mengonversi data dalam dataset menjadi format yang diinginkan
def convert_example(example):
    audio_path = example['Path']
    array, sampling_rate = read_audio(audio_path)
    return {
        'audio': {
            'path': audio_path,
            'array': array,
            'sampling_rate': sampling_rate
        },
        'labels': label2id[example['Emotion']]
    }

# Fungsi untuk mengonversi seluruh dataset
def convert_dataset(dataset):
    converted_examples = []
    for example in dataset:
        converted_example = convert_example(example)
        converted_examples.append(converted_example)
    return Dataset.from_dict(converted_examples)

converted_train = X_train.map(convert_example)
converted_test = X_test.map(convert_example)

Map:   0%|          | 0/3942 [00:00<?, ? examples/s]

Map:   0%|          | 0/986 [00:00<?, ? examples/s]

In [None]:
converted_train, converted_test

(Dataset({
     features: ['Path', 'Emotion', '__index_level_0__', 'audio', 'labels'],
     num_rows: 3942
 }),
 Dataset({
     features: ['Path', 'Emotion', '__index_level_0__', 'audio', 'labels'],
     num_rows: 986
 }))

In [None]:
converted_train = converted_train.remove_columns(["Path", "Emotion", "__index_level_0__"])
converted_test = converted_test.remove_columns(["Path", "Emotion", "__index_level_0__"])

In [None]:
converted_train, converted_test

(Dataset({
     features: ['audio', 'labels'],
     num_rows: 3942
 }),
 Dataset({
     features: ['audio', 'labels'],
     num_rows: 986
 }))

In [None]:
converted_train.save_to_disk('/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/converted_train')
converted_test.save_to_disk('/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/converted_test')

Saving the dataset (0/4 shards):   0%|          | 0/3942 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/986 [00:00<?, ? examples/s]

In [None]:
converted_train = load_from_disk('/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/converted_train')
converted_test = load_from_disk('/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/converted_test')

# 🎯 **Step 7: Load Pretrained Whisper Model**
---

In [None]:
# model_id = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
model_id = "openai/whisper-large-v3"

feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [None]:
sample = converted_train[0]["audio"]

print(f"Mean: {np.mean(sample['array']):.3}, Variance: {np.var(sample['array']):.3}")

Mean: -3.36e-05, Variance: 0.000273


# 🎯 **Step 8: Preprocess the Audio Data**
---

In [None]:
max_duration = 30.0

def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
    )
    return inputs

In [None]:
data_encoded_train = converted_train.map(
    preprocess_function,
    remove_columns=["audio"],
    batched=True,
    batch_size=100,
    num_proc=1,
)

data_encoded_train

Dataset({
    features: ['labels', 'input_features'],
    num_rows: 3942
})

In [None]:
data_encoded_test = converted_test.map(
    preprocess_function,
    remove_columns=["audio"],
    batched=True,
    batch_size=100,
    num_proc=1,
)

data_encoded_test

Dataset({
    features: ['labels', 'input_features'],
    num_rows: 986
})

In [None]:
num_labels = len(id2label)

# 🎯 **Step 9: Hyperparameter Tuning with Optuna (optional)**
---

In [None]:
# def model_init():
#     return AutoModelForAudioClassification.from_pretrained(
#         model_id,
#         num_labels=num_labels,
#         label2id=label2id,
#         id2label=id2label,
#     )

In [None]:
# def hp_space_optuna(trial):
#     return {
#         "learning_rate": trial.suggest_categorical("learning_rate", [1e-2, 1e-3, 1e-4, 5e-2, 5e-3, 5e-4]),
#         "gradient_accumulation_steps": trial.suggest_int("gradient_accumulation_steps", 1, 5),
#         "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [2, 4, 8, 16]),
#         "per_device_eval_batch_size": trial.suggest_categorical("per_device_train_batch_size", [2, 4, 8, 16]),
#         "warmup_ratio": trial.suggest_float("warmup_ratio", 0.0, 0.3),
#         "weight_decay": trial.suggest_float("weight_decay", 0.00, 0.3),
#         "logging_steps": trial.suggest_int("logging_steps", 5, 50, step=5),
#         "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine", "constant", "constant_with_warmup"])  # Tune lr scheduler type
#     }

In [None]:
# training_args = TrainingArguments(
#     # f"speech-emotion-recognition-with-wav2vec2-xlsr",
#     output_dir="/content/drive/MyDrive/SPEECH EMOTION RECOGNITION/TUNING",
#     num_train_epochs=3,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     metric_for_best_model="accuracy",
#     fp16=True,
#     push_to_hub=False
# )

In [None]:
# def compute_metrics(eval_pred):
#     predictions = np.argmax(eval_pred.predictions, axis=1)
#     labels = eval_pred.label_ids

#     accuracy = accuracy_score(labels, predictions)

#     precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

#     return {
#         'accuracy': accuracy,
#         'precision': precision,
#         'recall': recall,
#         'f1': f1
#     }

In [None]:
# trainer = Trainer(
#     model_init=model_init,
#     args=training_args,
#     train_dataset=data_encoded_train,
#     eval_dataset=data_encoded_test,
#     tokenizer=feature_extractor,
#     compute_metrics=compute_metrics,
# )

In [None]:
# best_run = trainer.hyperparameter_search(
#     direction="maximize",
#     hp_space=hp_space_optuna,
#     n_trials=10,
#     backend="optuna",
#     compute_objective=lambda metrics: metrics["accuracy"],
#     study_name="speech-emotion-recognition-with-wav2vec2-xlsr",
# )

In [None]:
# print(f"Best hyperparameters: {best_run.hyperparameters}")

In [None]:
# best_hyperparameters = best_run.hyperparameters

In [None]:
# training_args = TrainingArguments(
#     f"speech-emotion-recognition-with-wav2vec2-xlsr",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=best_hyperparameters['learning_rate'],
#     gradient_accumulation_steps=best_hyperparameters['gradient_accumulation_steps'],
#     per_device_train_batch_size=best_hyperparameters['per_device_train_batch_size'],
#     num_train_epochs=best_hyperparameters['num_train_epochs'],
#     warmup_ratio=best_hyperparameters['warmup_ratio'],
#     logging_steps=best_hyperparameters['logging_steps'],
#     lr_scheduler_type=best_hyperparameters['lr_scheduler_type'],
#     load_best_model_at_end=True,
#     metric_for_best_model="accuracy",
#     evaluation_strategy='epoch',
#     save_strategy='epoch',
#     load_best_model_at_end=True,
#     push_to_hub=True,
#     fp16=True
# )

In [None]:
# trainer = Trainer(
#     model_init=model_init,
#     args=training_args,
#     train_dataset=data_encoded_train,
#     eval_dataset=data_encoded_test,
#     tokenizer=feature_extractor,
#     compute_metrics=compute_metrics,
# )

In [None]:
# trainer.train()

In [None]:
# evaluation_results = trainer.evaluate()
# print(evaluation_results)

# 🎯 **Step 10: Initialize Model for Audio Classification**
---

In [None]:
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of WhisperForAudioClassification were not initialized from the model checkpoint at openai/whisper-large-v3 and are newly initialized: ['model.classifier.bias', 'model.classifier.weight', 'model.projector.bias', 'model.projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
wandb.init(project="speech-emotion-recognition-with-whisper")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mfirdhoworking[0m ([33mfirdhoworking-sepuluh-nopember-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


# 🎯 **Step 11: Set Up Training Arguments**
---

In [None]:
batch_size = 2  # Updated batch size
gradient_accumulation_steps = 5  # Updated gradient accumulation steps
num_train_epochs = 25  # Updated number of epochs

training_args = TrainingArguments(
    f"speech-emotion-recognition-with-openai-whisper-large-v3",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    lr_scheduler_type="linear",
    report_to="wandb",
    push_to_hub=True
)



# 🎯 **Step 12: Set Up Optimizer and LR Scheduler**
---

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1e-08)
lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=len(train_data) * training_args.num_train_epochs)

# 🎯 **Step 13: Define Metrics for Evaluation**
---

In [None]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    labels = eval_pred.label_ids

    accuracy = accuracy_score(y_true=labels, y_pred=predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# 🎯 **Step 14: Initialize Trainer and Start Training**
---

In [None]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=data_encoded_train,
    eval_dataset=data_encoded_test,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.4948,0.491117,0.8286,0.844875,0.8286,0.830209
1,0.6271,0.530747,0.822515,0.85595,0.822515,0.827671
2,0.2364,0.50758,0.869168,0.872654,0.869168,0.868435
3,0.0156,0.566859,0.873225,0.886819,0.873225,0.874537
5,0.0112,0.470067,0.910751,0.915914,0.910751,0.91142
6,0.0013,0.523163,0.913793,0.920424,0.913793,0.913685
7,0.1894,0.500845,0.919878,0.922969,0.919878,0.919849
8,0.0877,0.551722,0.913793,0.915193,0.913793,0.913813
10,0.0026,0.833375,0.877282,0.894921,0.877282,0.876995


Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'begi

TrainOutput(global_step=4336, training_loss=0.21827457035219747, metrics={'train_runtime': 21170.2062, 'train_samples_per_second': 4.655, 'train_steps_per_second': 0.465, 'total_flos': 6.347817440084885e+19, 'train_loss': 0.21827457035219747, 'epoch': 10.99949264332826})

# 🎯 **Step 14: Evaluate the Model and Push to Huggingface**
---

In [None]:
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.5008450746536255, 'eval_accuracy': 0.9198782961460447, 'eval_precision': 0.9229688914731574, 'eval_recall': 0.9198782961460447, 'eval_f1': 0.9198486992428615, 'eval_runtime': 250.7265, 'eval_samples_per_second': 3.933, 'eval_steps_per_second': 1.966, 'epoch': 10.99949264332826}


In [None]:
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}


CommitInfo(commit_url='https://huggingface.co/firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3/commit/ede9acef7d7ddecac806bce677640371b95cb186', commit_message='End of training', commit_description='', oid='ede9acef7d7ddecac806bce677640371b95cb186', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
wandb.finish()

VBox(children=(Label(value='0.025 MB of 0.025 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▁▄▅▇▇███▆▅█
eval/f1,▁▁▄▅▇▇███▆▅█
eval/loss,▂▂▂▃▁▁▂▂▃▃█▂
eval/precision,▁▂▃▅▇▇██▇▆▅█
eval/recall,▁▁▄▅▇▇███▆▅█
eval/runtime,▁▃▄▄▅▅▅▅▅▆█▆
eval/samples_per_second,█▆▅▅▄▄▄▄▄▃▁▃
eval/steps_per_second,█▆▅▅▄▄▄▄▄▃▁▃
train/epoch,▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███

0,1
eval/accuracy,0.91988
eval/f1,0.91985
eval/loss,0.50085
eval/precision,0.92297
eval/recall,0.91988
eval/runtime,250.7265
eval/samples_per_second,3.933
eval/steps_per_second,1.966
total_flos,6.347817440084885e+19
train/epoch,10.99949


# 🎯 **Step 15: Inference the Model**
---

In [7]:
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import librosa
import torch
import numpy as np

model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
model = AutoModelForAudioClassification.from_pretrained(model_id)

feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.55G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [8]:
id2label = model.config.id2label
id2label

{0: 'angry',
 1: 'disgust',
 2: 'fearful',
 3: 'happy',
 4: 'neutral',
 5: 'sad',
 6: 'surprised'}

In [9]:
def preprocess_audio(audio_path, feature_extractor, max_duration=30.0):
    audio_array, sampling_rate = librosa.load(audio_path, sr=feature_extractor.sampling_rate)

    max_length = int(feature_extractor.sampling_rate * max_duration)
    if len(audio_array) > max_length:
        audio_array = audio_array[:max_length]
    else:
        audio_array = np.pad(audio_array, (0, max_length - len(audio_array)))

    inputs = feature_extractor(
        audio_array,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    return inputs

In [10]:
def predict_emotion(audio_path, model, feature_extractor, id2label, max_duration=30.0):
    inputs = preprocess_audio(audio_path, feature_extractor, max_duration)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_id = torch.argmax(logits, dim=-1).item()
    predicted_label = id2label[predicted_id]

    return predicted_label

In [14]:
audio_path = "/content/drive/MyDrive/Audio/Speech_TESS/OAF_back_sad.wav"

predicted_emotion = predict_emotion(audio_path, model, feature_extractor, id2label)
print(f"Predicted Emotion: {predicted_emotion}")

Predicted Emotion: sad
