# In [None]:
# libraries
import numpy as np
import pandas as pd
import os
import pathlib
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
import torchaudio
import librosa.display
import IPython.display as ipd
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
############################################################################################
##########
# input data
# Every .wav file below the dataset root is a candidate sample; the name of the
# directory that directly contains a file is used as its emotion label.
paths = pathlib.Path("/kaggle/input/dataset/Acted Emotional Speech Dynamic Database").glob("**/*.wav")
paths_list = list(paths)
# extract dataset
data = []
skipped = []  # files torchaudio could not decode, reported after the loop
for path in tqdm(paths_list):
    try:
        # Decode only to validate the file; the decoded audio is not kept.
        torchaudio.load(path)
    except Exception as e:
        # Best effort: an unreadable file must not abort the whole run, but it
        # is recorded instead of being dropped silently.
        skipped.append((path, e))
        continue
    # pathlib accessors instead of str.split('/') so parsing does not depend
    # on the path separator. NOTE: for a file name containing extra dots
    # (e.g. "a.b.wav") `stem` yields "a.b" where the old split yielded "a".
    data.append({"name": path.stem, "speech": path, "label": path.parent.name})
if skipped:
    # Reported with print() because warnings are globally silenced above.
    print(f"Skipped {len(skipped)} unreadable audio file(s):")
    for bad_path, error in skipped:
        print(f"  {bad_path}: {error!r}")
# Explicit columns keep df["label"] / df["speech"] valid even when no file was
# loaded; a DataFrame built from a list already has a fresh 0..n-1 index.
df = pd.DataFrame(data, columns=["name", "speech", "label"])
############################################################################################
##########
# data analysis
# Report how many rows carry each label, in order of first appearance.
labels = df["label"]
for category in labels.unique():
    n_rows = labels.eq(category).sum()
    print("label ", category, " has in total ", n_rows, " features")
############################################################################################
##########
# display the waveform of the audio file
def waveplot(data, sr, emotion):
    """Plot the waveform of one audio clip, titled with its emotion label.

    Args:
        data: Audio samples, passed straight to ``librosa.display.waveshow``.
        sr: Sampling rate of ``data``; used to scale the time axis.
        emotion: Text shown as the figure title.
    """
    plt.figure(figsize=(10, 4))
    plt.title(emotion, size=20)
    # Use the `sr` argument; the previous code read the module-level
    # `sampling_rate`, so the parameter was silently ignored.
    librosa.display.waveshow(data, sr=sr)
    plt.show()
# Plot and play the first clip of each emotion. This replaces five copy-pasted
# stanzas with a single loop; output order is unchanged.
emotions = ('fear', 'sadness', 'disgust', 'happiness', 'anger')
for position, emotion in enumerate(emotions):
    if position:
        # Blank separator between consecutive previews (none before the
        # first, none after the last).
        print("\n\n")
    # First file path whose label matches; raises IndexError if the dataset
    # has no clip for this emotion.
    path = df.loc[df['label'] == emotion, 'speech'].iloc[0]
    # `data` and `sampling_rate` are deliberately kept as module-level names,
    # in case other cells of the notebook read them.
    data, sampling_rate = librosa.load(path)
    waveplot(data, sampling_rate, emotion)
    display(ipd.Audio(path))
############################################################################################
##########
# Feature Extraction
def extract_mfcc(filename):
    """Return one averaged MFCC vector for an audio file.

    Loads at most 3 seconds of audio starting 0.5 seconds into ``filename``,
    computes 40 MFCCs with librosa, and averages the transposed coefficient
    matrix along its first axis, giving one value per coefficient.
    """
    signal, sample_rate = librosa.load(filename, duration=3, offset=0.5)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    # Transpose first, then reduce over axis 0.
    transposed = coefficients.T
    return np.mean(transposed, axis=0)
# One averaged MFCC vector per audio file.
X_mfcc = df['speech'].apply(extract_mfcc)
# Stack the per-file vectors into an (n_files, n_mfcc) matrix, then add a
# trailing axis of size 1 so each coefficient becomes one step of a
# single-feature sequence.
X = np.array(X_mfcc.tolist())
X = np.expand_dims(X, -1)
# One-hot encode the labels. Column order follows enc.categories_[0], which
# OneHotEncoder derives by sorting the label values. With the five labels
# anger / disgust / fear / happiness / sadness this gives e.g.
#   anger: 1 0 0 0 0
# (print enc.categories_ after fitting to confirm the order for this dataset).
enc = OneHotEncoder()
y = enc.fit_transform(df[['label']])
y = y.toarray()  # fit_transform returns a sparse matrix by default
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
############################################################################################
##########
# neural network
# Layer sizes are read from the prepared arrays instead of being hard-coded,
# so the network keeps matching the data if the number of MFCCs or the set of
# labels changes. With the current pipeline these are (40, 1) and 5.
sequence_shape = tuple(x_train.shape[1:])
n_classes = y_train.shape[1]

model = Sequential([
    LSTM(256, return_sequences=False, input_shape=sequence_shape),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    # One softmax output per one-hot label column.
    Dense(n_classes, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# 20% of the training split is held out by Keras for per-epoch validation.
history = model.fit(x_train, y_train, validation_split=0.2, epochs=50, batch_size=64)
# evaluate() returns [loss, accuracy] for the metrics compiled above.
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
############################################################################################
##########
# Plot the results
# Training curves recorded by model.fit().
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']
# Take the x-axis from the recorded history rather than a hard-coded 50, so
# the plots still work if the number of epochs is changed or training stops
# early.
epochs = list(range(len(acc)))

plt.plot(epochs, acc, label='train accuracy')
plt.plot(epochs, val_acc, label='val accuracy')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()

plt.plot(epochs, loss, label='train loss')
plt.plot(epochs, val_loss, label='val loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
# Show the loss figure explicitly, as is done for the accuracy figure;
# without this it is never displayed when run outside an inline notebook.
plt.show()
############################################################################################
##########
# actual results
# Run inference once and reuse the result; the previous code called
# model.predict(x_test) twice and never used the second result.
pred_test = model.predict(x_test)
# Map network outputs and one-hot targets back to label strings.
# NOTE(review): this relies on inverse_transform picking the largest entry of
# each row of class scores - confirm for the installed scikit-learn version.
y_pred = enc.inverse_transform(pred_test)
# NOTE: y_test is rebound from one-hot rows to label strings here, so this
# cell cannot be re-run without first re-creating the train/test split.
y_test = enc.inverse_transform(y_test)
# NOTE: this rebinds `df`, replacing the dataset table with the comparison
# table of predicted vs. actual labels.
df = pd.DataFrame({
    'Predicted Labels': y_pred.flatten(),
    'Actual Labels': y_test.flatten(),
})
print(df.head())