In [1]:
pip install librosa soundfile numpy scikit-learn pyaudio

In [2]:
# Import necessary libraries
import glob
import os
import pickle

import librosa
import librosa.display
import numpy as np
import soundfile
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
def extract_feature(file_name, mfcc, chroma, mel, tonnetz, pitch, spectral_centroid, spectral_bandwidth, spectral_rolloff, zero_crossing_rate, rms, intonation, rhythm):
    """Build a 1-D feature vector for a single audio file.

    Every argument after ``file_name`` is a boolean switch for one feature
    group. Enabled groups are concatenated in parameter order:

    - ``mfcc``: 40 MFCCs, averaged over frames.
    - ``chroma``: chroma features of the magnitude STFT, averaged over frames.
    - ``mel``: mel-spectrogram bands, averaged over frames.
    - ``tonnetz``: tonnetz of the harmonic component, averaged over frames.
    - ``pitch``: tuning estimate derived from ``piptrack`` pitches (1 value).
    - ``spectral_centroid``, ``spectral_bandwidth``, ``spectral_rolloff``,
      ``zero_crossing_rate``, ``rms``: mean over all frames (1 value each).
    - ``intonation``: mean and standard deviation of the tuning estimate
      (2 values).
    - ``rhythm``: tempo estimated from the onset-strength envelope.

    Parameters
    ----------
    file_name : str
        Path of an audio file that ``soundfile`` can open.

    Returns
    -------
    numpy.ndarray
        1-D array holding the enabled features; empty when every switch is
        false.
    """
    # Read the samples, then release the file handle before the heavy analysis.
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile returns multi-channel audio as (frames, channels), whereas
    # librosa expects time on the last axis. Downmix to mono so stereo
    # recordings are analysed as one signal instead of with swapped axes.
    if X.ndim > 1:
        X = X.mean(axis=1)

    # Seed with an empty array so the result is a valid (empty) vector even
    # when no feature group is enabled.
    parts = [np.array([])]

    if mfcc:
        parts.append(np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0))

    if chroma:
        # The magnitude STFT is only needed by the chroma features.
        stft = np.abs(librosa.stft(y=X))
        parts.append(np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0))

    if mel:
        parts.append(np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0))

    if tonnetz:
        harmonic = librosa.effects.harmonic(X)
        parts.append(np.mean(librosa.feature.tonnetz(y=harmonic, sr=sample_rate).T, axis=0))

    # Pitch tracking is shared by the `pitch` and `intonation` groups, so it is
    # computed once instead of once per use.
    if pitch or intonation:
        pitches, _ = librosa.piptrack(y=X, sr=sample_rate)
        tuning = librosa.pitch_tuning(pitches)

    if pitch:
        parts.append(np.mean(tuning))

    if spectral_centroid:
        parts.append(np.mean(librosa.feature.spectral_centroid(y=X, sr=sample_rate)))

    if spectral_bandwidth:
        parts.append(np.mean(librosa.feature.spectral_bandwidth(y=X, sr=sample_rate)))

    if spectral_rolloff:
        parts.append(np.mean(librosa.feature.spectral_rolloff(y=X, sr=sample_rate)))

    if zero_crossing_rate:
        parts.append(np.mean(librosa.feature.zero_crossing_rate(y=X)))

    if rms:
        parts.append(np.mean(librosa.feature.rms(y=X)))

    if intonation:
        # NOTE(review): pitch_tuning yields a single scalar, so this std is
        # always 0.0 and the mean repeats the `pitch` feature. Kept as-is so the
        # vector layout and values stay compatible; consider replacing with
        # statistics of the voiced pitch track.
        parts.append(np.mean(tuning))
        parts.append(np.std(tuning))

    if rhythm:
        onset_env = librosa.onset.onset_strength(y=X, sr=sample_rate)
        tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sample_rate)
        parts.append(tempo)

    return np.hstack(parts)


In [4]:
# RAVDESS emotion codes: two-digit, 1-based codes mapped to their labels,
# in code order ('01' -> 'neutral' ... '08' -> 'surprised').
emotions = {
    f"{code:02d}": label
    for code, label in enumerate(
        ("neutral", "calm", "happy", "sad", "angry", "fearful", "disgust", "surprised"),
        start=1,
    )
}

# Emotions kept for training: currently all eight labels, in code order.
observed_emotions = list(emotions.values())

In [5]:
# Default glob for the audio files: every .wav inside an Actor_* sub-folder.
RAVDESS_GLOB = "C:\\Users\\admin\\Desktop\\ravdess data\\Actor_*\\*.wav"


# Load the data and extract features for each sound file
def load_data(test_size=0.2, data_glob=RAVDESS_GLOB):
    """Extract features for every matching audio file and split train/test.

    The emotion label comes from the third dash-separated field of the file
    name, looked up in ``emotions``; files whose emotion is not listed in
    ``observed_emotions`` are skipped.

    Parameters
    ----------
    test_size : float
        Passed to ``train_test_split`` as the held-out share.
    data_glob : str
        Glob pattern selecting the audio files. Defaults to the original
        hard-coded location so existing calls keep working.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]`` as returned by
        ``train_test_split`` (split seeded with ``random_state=9``).

    Raises
    ------
    ValueError
        If ``data_glob`` matches no files.
    """
    # glob order depends on the filesystem; sort so the seeded split selects
    # the same files on every machine.
    paths = sorted(glob.glob(data_glob))
    if not paths:
        raise ValueError(f"No audio files matched {data_glob!r}; check the dataset location.")

    features, labels = [], []
    for path in paths:
        emotion = emotions[os.path.basename(path).split("-")[2]]
        if emotion not in observed_emotions:
            continue
        features.append(
            extract_feature(
                path,
                mfcc=True,
                chroma=True,
                mel=True,
                tonnetz=True,
                pitch=True,
                spectral_centroid=True,
                spectral_bandwidth=True,
                spectral_rolloff=True,
                zero_crossing_rate=True,
                rms=True,
                intonation=True,
                rhythm=True,
            )
        )
        labels.append(emotion)
    return train_test_split(np.array(features), labels, test_size=test_size, random_state=9)

In [6]:
# Split the dataset: load_data extracts features from every matching file,
# then holds out 25% of the samples (test_size=0.25) as the test set.
x_train, x_test, y_train, y_test = load_data(test_size=0.25)

In [7]:
# Standardize features: learn the scaling from the training split only,
# then apply that same transform to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [8]:
# Report how many samples ended up in the training and testing sets
print((len(x_train), len(x_test)))

In [9]:
# Report the length of each feature vector (columns of the training matrix)
print('Features extracted: {}'.format(x_train.shape[1]))

In [10]:
# Hyperparameter search over the MLP settings with 5-fold cross-validation.
# Each grid entry currently lists a single value; add candidates to widen it.
param_grid = {
    'alpha': [0.1],
    'hidden_layer_sizes': [(500,)],
    'max_iter': [1000]
}

# random_state seeds the network's weight initialisation and batch shuffling
# so repeated runs of the notebook give the same model.
# NOTE(review): learning_rate='adaptive' only applies to solver='sgd'; with
# the default solver it has no effect. Confirm which solver is intended.
grid_search = GridSearchCV(
    MLPClassifier(learning_rate='adaptive', batch_size=256, epsilon=1e-08, random_state=9),
    param_grid,
    cv=5,
)
grid_search.fit(x_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best model (already refit on the full training set by GridSearchCV)
model = grid_search.best_estimator_

In [11]:
# The model is already trained: GridSearchCV refits best_estimator_ on the
# whole training set by default (refit=True). Fitting it again here would
# only repeat that training from scratch, so just display the fitted model.
model

In [12]:
# Predict for the test set: x_test holds the held-out features after they were
# passed through the scaler fitted on the training split.
y_pred = model.predict(x_test)

In [13]:
# Fraction of test samples whose predicted emotion matches the true label
accuracy = accuracy_score(y_test, y_pred)

In [None]:
# Show the accuracy as a percentage with two decimals
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Per-class precision/recall/F1, followed by the confusion matrix
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print(report)
print(matrix)

In [None]:
# Save the model to a file. The context manager guarantees the handle is
# flushed and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# The model was trained on standardized features, so the fitted scaler is
# required to prepare any new input; persist it alongside the model.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)