In [9]:
import torch
import torch.nn as nn
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder


Loading our trained model

In [13]:
from train_emotion_model import EmotionCNN

# Rebuild the network with 8 output classes before restoring its weights.
model = EmotionCNN(numclasses=8)
# map_location="cpu" lets a checkpoint that was saved from a CUDA device load on a
# CPU-only machine; weights_only=True keeps the restricted (safer) unpickler.
state_dict = torch.load("emotion_cnn.pth", map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)
# Evaluation mode: dropout is disabled and batch norm uses its running statistics.
model.eval()

EmotionCNN(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=6720, out_features=128, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=128, out_features=8, bias=True)
)

Defining feature extractor

In [14]:
def extract_features(file_path, max_len=174, sample_rate=22050, n_mfcc=40):
    """Load an audio file and return a normalized, fixed-width MFCC matrix.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        max_len: Number of time frames in the returned matrix. Shorter
            clips are zero-padded on the right; longer ones are truncated.
        sample_rate: Sampling rate requested from ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested per frame.

    Returns:
        A 2-D NumPy array whose second axis has exactly ``max_len`` frames.
    """
    audio, loaded_rate = librosa.load(file_path, sr=sample_rate)
    mfcc = librosa.feature.mfcc(y=audio, sr=loaded_rate, n_mfcc=n_mfcc)

    # Standardize with a single mean/std computed over the whole matrix.
    spread = np.std(mfcc)
    mfcc = mfcc - np.mean(mfcc)
    # A zero spread means every value is identical; dividing would yield
    # NaN/inf, so the centred (all-zero) values are kept instead.
    if spread > 0:
        mfcc = mfcc / spread

    # Force the time axis to exactly max_len frames.
    n_frames = mfcc.shape[1]
    if n_frames < max_len:
        mfcc = np.pad(mfcc, ((0, 0), (0, max_len - n_frames)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return mfcc

Preparing input

In [15]:
# New audio clip to classify.
file_path = "Emotionalspeech/Actor_04/03-01-03-01-02-01-04.wav"
mfcc = extract_features(file_path)
# Convert to float32 and prepend batch and channel axes: (1, 1, 40, time_steps).
input_tensor = torch.tensor(mfcc, dtype=torch.float32)[None, None]


Predicting output and decoding label

In [16]:
# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    output = model(input_tensor)
# Index of the highest-scoring class for the single item in the batch.
predicted_label = torch.argmax(output, dim=1).item()

# Rebuild a decoder from the hard-coded class names.
# NOTE(review): this order is assumed to match the encoder fitted during
# training — confirm against the training script.
le = LabelEncoder()
le.classes_ = np.array(['angry','calm','disgust','fearful','happy','neutral','sad','surprised'])

print("Predicted Emotion:", le.inverse_transform([predicted_label])[0])


Predicted Emotion: happy
