In [1]:
# SOURCE: https://data-flair.training/blogs/python-mini-project-speech-emotion-recognition/

# Libraries

import librosa
import soundfile
import os, glob, pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


In [6]:

#DataFlair - Extract features (mfcc, chroma, mel) from a sound file
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate=sound_file.samplerate
        if chroma:
            stft=np.abs(librosa.stft(X))
        result=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result=np.hstack((result, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result=np.hstack((result, chroma))
    if mel:
        mel_spec = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel_spec))

    return result




#DataFlair - Emotions in the RAVDESS dataset
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

#DataFlair - Emotions to observe
observed_emotions=['calm', 'fearful', 'disgust', 'surprised', 'angry', 'sad']

def pad_features(feature, target_length):
    if len(feature) < target_length:
        return np.pad(feature, (0, target_length - len(feature)), 'constant')
    return feature

#DataFlair - Load the data and extract features for each sound file

def load_data(test_size=0.2):
    x,y=[],[]
    for file in glob.glob("audio-files/ravdess/Actor_*/*.wav"):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [7]:
#DataFlair - Split the dataset
x_train,x_test,y_train,y_test=load_data(test_size=0.20)

#DataFlair - Get the shape of the training and testing datasets
print((x_train.shape[0], x_test.shape[0]))

#DataFlair - Get the number of features extracted
print(f'Features extracted: {x_train.shape[1]}')

#DataFlair - Initialize the Multi Layer Perceptron Classifier
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)

#DataFlair - Train the model
model.fit(x_train,y_train)

#DataFlair - Predict for the test set
y_pred=model.predict(x_test)

#DataFlair - Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)

#DataFlair - Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

(537, 135)
Features extracted: 180
Accuracy: 50.37%


In [8]:
#DataFlair - Split the dataset
x_train,x_test,y_train,y_test=load_data(test_size=0.25)

#DataFlair - Get the shape of the training and testing datasets
print((x_train.shape[0], x_test.shape[0]))

#DataFlair - Get the number of features extracted
print(f'Features extracted: {x_train.shape[1]}')

alphas = [0.1, 0.05, 0.01, 0.001]
hidden_layer_size = [(300,), (300, 100), (300, 300)]
max_iter = [250, 500, 1000, 1500]


for i in alphas:
    for j in hidden_layer_size:
        for iter in max_iter:
#DataFlair - Initialize the Multi Layer Perceptron Classifier
            model=MLPClassifier(alpha=i, batch_size=256, epsilon=1e-08, hidden_layer_sizes=j, learning_rate='adaptive', max_iter=iter)

#DataFlair - Train the model
            model.fit(x_train,y_train)

#DataFlair - Predict for the test set
            y_pred=model.predict(x_test)

#DataFlair - Calculate the accuracy of our model
            accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)

#DataFlair - Print the accuracy
            print(f"Alpha: {i}, Hidden Layer Size: {j}, Max Iter: {iter}, Accuracy: {accuracy*100:.2f}%")



(504, 168)
Features extracted: 180




Alpha: 0.1, Hidden Layer Size: (300,), Max Iter: 250, Accuracy: 68.45%




Alpha: 0.1, Hidden Layer Size: (300,), Max Iter: 500, Accuracy: 72.02%
Alpha: 0.1, Hidden Layer Size: (300,), Max Iter: 1000, Accuracy: 70.24%
Alpha: 0.1, Hidden Layer Size: (300,), Max Iter: 1500, Accuracy: 66.67%
Alpha: 0.1, Hidden Layer Size: (300, 100), Max Iter: 250, Accuracy: 66.07%




Alpha: 0.1, Hidden Layer Size: (300, 100), Max Iter: 500, Accuracy: 69.64%
Alpha: 0.1, Hidden Layer Size: (300, 100), Max Iter: 1000, Accuracy: 71.43%
Alpha: 0.1, Hidden Layer Size: (300, 100), Max Iter: 1500, Accuracy: 70.24%
Alpha: 0.1, Hidden Layer Size: (300, 300), Max Iter: 250, Accuracy: 64.88%
Alpha: 0.1, Hidden Layer Size: (300, 300), Max Iter: 500, Accuracy: 59.52%
Alpha: 0.1, Hidden Layer Size: (300, 300), Max Iter: 1000, Accuracy: 70.83%
Alpha: 0.1, Hidden Layer Size: (300, 300), Max Iter: 1500, Accuracy: 56.55%




Alpha: 0.05, Hidden Layer Size: (300,), Max Iter: 250, Accuracy: 69.64%




Alpha: 0.05, Hidden Layer Size: (300,), Max Iter: 500, Accuracy: 70.83%
Alpha: 0.05, Hidden Layer Size: (300,), Max Iter: 1000, Accuracy: 73.21%
Alpha: 0.05, Hidden Layer Size: (300,), Max Iter: 1500, Accuracy: 65.48%




Alpha: 0.05, Hidden Layer Size: (300, 100), Max Iter: 250, Accuracy: 66.67%
Alpha: 0.05, Hidden Layer Size: (300, 100), Max Iter: 500, Accuracy: 72.02%
Alpha: 0.05, Hidden Layer Size: (300, 100), Max Iter: 1000, Accuracy: 75.00%
Alpha: 0.05, Hidden Layer Size: (300, 100), Max Iter: 1500, Accuracy: 58.33%
Alpha: 0.05, Hidden Layer Size: (300, 300), Max Iter: 250, Accuracy: 55.36%
Alpha: 0.05, Hidden Layer Size: (300, 300), Max Iter: 500, Accuracy: 58.33%
Alpha: 0.05, Hidden Layer Size: (300, 300), Max Iter: 1000, Accuracy: 63.69%
Alpha: 0.05, Hidden Layer Size: (300, 300), Max Iter: 1500, Accuracy: 50.60%




Alpha: 0.01, Hidden Layer Size: (300,), Max Iter: 250, Accuracy: 70.83%




Alpha: 0.01, Hidden Layer Size: (300,), Max Iter: 500, Accuracy: 68.45%
Alpha: 0.01, Hidden Layer Size: (300,), Max Iter: 1000, Accuracy: 65.48%
Alpha: 0.01, Hidden Layer Size: (300,), Max Iter: 1500, Accuracy: 69.64%




Alpha: 0.01, Hidden Layer Size: (300, 100), Max Iter: 250, Accuracy: 64.29%
Alpha: 0.01, Hidden Layer Size: (300, 100), Max Iter: 500, Accuracy: 63.10%
Alpha: 0.01, Hidden Layer Size: (300, 100), Max Iter: 1000, Accuracy: 66.67%
Alpha: 0.01, Hidden Layer Size: (300, 100), Max Iter: 1500, Accuracy: 70.83%




Alpha: 0.01, Hidden Layer Size: (300, 300), Max Iter: 250, Accuracy: 72.02%
Alpha: 0.01, Hidden Layer Size: (300, 300), Max Iter: 500, Accuracy: 64.88%
Alpha: 0.01, Hidden Layer Size: (300, 300), Max Iter: 1000, Accuracy: 67.86%
Alpha: 0.01, Hidden Layer Size: (300, 300), Max Iter: 1500, Accuracy: 50.60%




Alpha: 0.001, Hidden Layer Size: (300,), Max Iter: 250, Accuracy: 66.07%




Alpha: 0.001, Hidden Layer Size: (300,), Max Iter: 500, Accuracy: 66.07%
Alpha: 0.001, Hidden Layer Size: (300,), Max Iter: 1000, Accuracy: 72.02%
Alpha: 0.001, Hidden Layer Size: (300,), Max Iter: 1500, Accuracy: 65.48%




Alpha: 0.001, Hidden Layer Size: (300, 100), Max Iter: 250, Accuracy: 69.05%
Alpha: 0.001, Hidden Layer Size: (300, 100), Max Iter: 500, Accuracy: 64.88%
Alpha: 0.001, Hidden Layer Size: (300, 100), Max Iter: 1000, Accuracy: 69.05%
Alpha: 0.001, Hidden Layer Size: (300, 100), Max Iter: 1500, Accuracy: 70.24%
Alpha: 0.001, Hidden Layer Size: (300, 300), Max Iter: 250, Accuracy: 55.95%
Alpha: 0.001, Hidden Layer Size: (300, 300), Max Iter: 500, Accuracy: 67.86%
Alpha: 0.001, Hidden Layer Size: (300, 300), Max Iter: 1000, Accuracy: 40.48%
Alpha: 0.001, Hidden Layer Size: (300, 300), Max Iter: 1500, Accuracy: 57.14%


In [21]:
speech_list = []
speech_names = []

for file in glob.glob("audio-files/*.wav"):
    file_name=os.path.basename(file)
    speech_names.append(file_name)
    feature=extract_feature(file, mfcc=True, chroma=True, mel=True)
    speech_list.append(feature)

  return pitch_tuning(


In [22]:
#model.predict(speech_list)

In [134]:
def final_load_data():
    x,y=[],[]
    for file in glob.glob("audio-files/ravdess/Actor_*/*.wav"):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return x, y

In [135]:
#DataFlair - Split the dataset
x, y =final_load_data()

#DataFlair - Initialize the Multi Layer Perceptron Classifier
final_model=MLPClassifier(alpha=0.05, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,100), learning_rate='adaptive', max_iter=1000)

#DataFlair - Train the model
final_model.fit(x,y)

speech_predictions = final_model.predict(speech_list)

In [136]:
len(speech_predictions)

538

In [137]:
df = pd.read_csv("data-files/transcripts_w_lies.csv")

In [138]:
pred_df = pd.DataFrame({"names": speech_names, "emotion": speech_predictions})

In [139]:
pred_df.head()

Unnamed: 0,names,emotion
0,Donald Trump speaks at the RNC： FULL SPEECH_ch...,angry
1,Donald Trump speaks at the RNC： FULL SPEECH_ch...,angry
2,Donald Trump speaks at the RNC： FULL SPEECH_ch...,angry
3,Donald Trump speaks at the RNC： FULL SPEECH_ch...,angry
4,Donald Trump speaks at the RNC： FULL SPEECH_ch...,angry


In [140]:
df["names"] = df["Chunk Filename"].str.replace("/Users/milanvaghani/Desktop/Unstructed Machine Learning/", "")

In [141]:
df_ordered = df.sort_values(by = "names")
pred_df_ordered = pred_df.sort_values(by = "names")
emotion_ordered = pred_df_ordered["emotion"]
df_ordered["emotion"] = emotion_ordered

In [143]:
final_df = df_ordered.drop(columns = ["Unnamed: 0", "YouTube URL", "Chunk Filename"])

In [166]:
speech_number = []

for line in final_df["names"]:
    if line.startswith("Audio Files/Donald Trump speaks"):
        speech_number.append(3)
    elif line.startswith("VP Kamala Harris full speech"):
        speech_number.append(1)
    elif line.startswith("Audio Files/Donald Trump full"):
        speech_number.append(2)
    else:
        speech_number.append(4)

final_df["speech_number"] = speech_number

In [169]:
final_df.groupby(["speech_number", "lie"])["emotion"].value_counts()

speech_number  lie    emotion
1              False  angry       61
                      disgust      4
                      fearful      3
                      calm         1
               True   angry       10
                      disgust      1
2              False  angry      156
                      disgust     13
                      fearful      3
                      calm         1
               True   angry        8
3              False  angry      149
                      disgust     20
                      fearful      2
               True   angry       14
4              False  angry       81
                      disgust      6
                      fearful      2
               True   angry        3
Name: count, dtype: int64