# Demo Emotion Recognition: audio + video

## Libraries and parameters

In [47]:
# Utilities
import os
import subprocess
import numpy as np
from tensorflow import keras

# Audio and video manipulation
import moviepy.editor as mp
import cv2
import librosa
from sklearn.preprocessing import StandardScaler
from joblib import load
import tkinter as tk

In [48]:
# Labels dictionary
# `emotions_tras` remaps the emotion code read from a clip's file name
# (third '-'-separated field, minus one) onto the class indices that are the
# keys of `emotions`.
emotions_tras = {1:1, 2:4, 3:5, 4:0, 5:3, 6:2, 7:6}
emotions = {0:'angry', 1:'calm', 2:'disgust', 3:'fear', 4:'happy', 5:'sad', 6:'surprise'}

# Paths (relative to the notebook's working directory)
dataset_path = "Datasets/Demo/"
parameters_path = 'Datasets/Audio_Speech/std_scaler.bin'
models_video_path = "Models/Video_stream/"
models_audio_path = "Models/Audio_stream/"
# The VLC location is machine-specific: allow overriding it through the
# VLC_PATH environment variable, keeping the previous value as the default.
vlc_path = os.environ.get("VLC_PATH", "C:/Program Files/VideoLAN/VLC/vlc.exe")

# Audio video parameters
height_targ = 112   # height of the face crops, in pixels
width_targ = 112    # width of the face crops, in pixels
sr = 48000          # audio sampling rate, in Hz

## Select Clip

In [49]:
# Small Tk dialog asking which demo clip to analyse.
# On success the chosen index is left in the global `example` as an int.
root = tk.Tk()

canvas1 = tk.Canvas(root, width=400, height=300, relief='raised')
canvas1.pack()

label1 = tk.Label(root, text='Select clip to analize')
label1.config(font=('helvetica', 16))
canvas1.create_window(200, 25, window=label1)

label2 = tk.Label(root, text='Number from 0 to 6:')
label2.config(font=('helvetica', 11))
canvas1.create_window(200, 100, window=label2)

# The entry widget has its own name so that `example` only ever holds the
# parsed selection; it is placed by the canvas alone (no extra pack()).
entry = tk.Entry(root)
canvas1.create_window(200, 140, window=entry)

example = None  # filled in by display_text() once a valid number is entered


def display_text():
    """Store the typed clip index in the global `example` and close the dialog.

    If the entry does not contain an integer, the prompt is highlighted in
    red and the dialog stays open so the user can correct the input.
    """
    global example
    try:
        example = int(entry.get())
    except ValueError:
        label2.config(fg='red')
        return
    root.destroy()


button1 = tk.Button(root, text='Select', command=display_text, font=('helvetica', 12, 'bold'))
canvas1.create_window(200, 180, window=button1)

root.mainloop()

# The window may have been closed without pressing 'Select'.
if example is None:
    raise RuntimeError('No clip selected: re-run this cell and enter a clip number')

In [50]:
# Resolve the selected index to a clip path and its ground-truth label.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# index -> clip mapping reproducible.
fn = sorted(os.listdir(dataset_path))
if not 0 <= example < len(fn):
    raise IndexError(
        f'Clip index {example} is out of range: {len(fn)} clip(s) found in {dataset_path!r}'
    )
filename = os.path.join(dataset_path, fn[example])
# The third '-'-separated field of the file name carries the emotion code.
label = emotions_tras[int(fn[example].split('-')[2]) - 1] # transposition of the emotions

In [51]:
# Preview the selected clip in VLC; subprocess.call() blocks until the player
# process ends. The preview is not needed by the analysis below, so a missing
# VLC executable is reported instead of aborting the notebook.
try:
    player = subprocess.call([vlc_path, filename, '--play-and-exit'])
except FileNotFoundError:
    player = None
    print(f'VLC not found at {vlc_path!r}: skipping clip playback')

## Data preparation

### Video

In [52]:
# Build the video-model input: grayscale face crops sampled from the clip,
# resized, centre-cropped and scaled to [0, 1].
haar_cascade = cv2.CascadeClassifier('./Other/haarcascade_frontalface_default.xml')
if haar_cascade.empty():
    raise FileNotFoundError('Could not load ./Other/haarcascade_frontalface_default.xml')

cap = cv2.VideoCapture(filename)
if not cap.isOpened():
    raise FileNotFoundError(f'Could not open video file: {filename}')

frames = []
count = 0
skip = 3

try:
    # Loop through all frames
    while True:
        # Capture frame; stop as soon as the stream is exhausted
        ret, frame = cap.read()
        if not ret:
            break
        # Ignore the first frames of the clip, then sample one frame every `skip`
        if count % skip == 0 and count > 20:
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # detect and crop face
            faces = haar_cascade.detectMultiScale(gray, scaleFactor=1.12, minNeighbors=9)
            if len(faces) != 1:
                # NOTE(review): the counter is deliberately left unchanged here,
                # as in the original code, so the very next frame is tried
                # instead of waiting `skip` frames. Confirm this matches the
                # preprocessing used to train the video model.
                continue
            x, y, w, h = faces[0]
            face = gray[y:y + h, x:x + w]

            # cv2.resize expects the target size as (width, height); resize
            # 10 px larger than the target, then trim a 5 px border.
            face = cv2.resize(face, (width_targ + 10, height_targ + 10))
            face = face[5:-5, 5:-5]
            face = face/255.
            frames.append(face)
        count += 1
finally:
    # Always give the video file handle back, even if processing fails
    cap.release()

frames = np.array(frames)
num_frames = len(frames)
if num_frames == 0:
    raise RuntimeError(f'No usable face frame found in {filename}')
labels = [label] * num_frames
print('shape frames:', frames.shape)

shape frames: (34, 112, 112)


### Audio

In [53]:
# Decode the clip's audio track at `sr` Hz, then keep a 3-second window that
# starts half a second into the clip.
audiofile = mp.AudioFileClip(filename).set_fps(sr)
try:
    audio = audiofile.to_soundarray()
finally:
    # Release the underlying audio reader once the samples are in memory
    audiofile.close()
audio = audio[int(sr/2):int(sr/2 + sr*3)]
# Keep only the first channel, as a contiguous 1-D array
audio = np.ascontiguousarray(audio[:, 0])

In [54]:
# Log-scaled mel spectrogram of the audio window. The signal is passed by
# keyword (`y=`), which is accepted by every librosa release and required by
# recent ones; the sampling rate comes from the shared `sr` setting.
mel = librosa.power_to_db(
    librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=1024, n_mels=128, fmin=50, fmax=sr // 2
    )
)

# Standardise with the scaler stored at `parameters_path`.
# joblib.load unpickles the file: only load scaler files you trust.
scaler = load(parameters_path)
mel = scaler.transform(mel)

# Add a trailing and a leading singleton axis before feeding the audio model
mel = np.expand_dims(mel, axis = 2)
mel = np.expand_dims(mel, axis = 0)
mel.shape

(1, 128, 282, 1)

## Load models

### Video

In [55]:
# Load the video checkpoint whose file name advertises the highest accuracy.
models_list = os.listdir(models_video_path)

# The accuracy is the number written between '[' and ']' in each file name
acc = []
for model_name in models_list:
    after_bracket = model_name.split('[')[1]
    acc.append(float(after_bracket.split(']')[0]))

# index of best model (the first one when several share the top accuracy)
idx = max(range(len(acc)), key=acc.__getitem__)

model_video = keras.models.load_model(f'{models_video_path}{models_list[idx]}')
# model_video.summary()

### Audio

In [56]:
# Load the audio model. os.listdir() gives no ordering guarantee, so sort the
# entries to always pick the same file when the directory holds several.
models_list = sorted(os.listdir(models_audio_path))
if not models_list:
    raise FileNotFoundError(f'No audio model found in {models_audio_path!r}')
model_audio = keras.models.load_model(os.path.join(models_audio_path, models_list[0]))
# model_audio.summary()

## Predictions

### Video

In [57]:
# Score every face crop, then average over the frame axis to obtain a single
# score vector for the whole clip.
pred = model_video.predict(frames)
pred_video = pred.mean(axis=0)
pred_video



array([1.52530810e-02, 3.20698947e-01, 1.20977624e-04, 1.18637853e-03,
       3.03598702e-01, 8.11402686e-03, 3.51027817e-01], dtype=float32)

### Audio

In [58]:
# Score the mel spectrogram batch and collapse the batch axis by averaging,
# which leaves one score vector for the clip.
pred = model_audio.predict(mel)
pred_audio = pred.mean(axis=0)
pred_audio



array([7.1527094e-05, 9.9606210e-01, 1.5615559e-03, 2.6035459e-06,
       4.6694062e-05, 2.2554970e-03, 1.2913418e-09], dtype=float32)

### Global

In [59]:
# Late fusion: average the two streams' score vectors. Dividing by 2 makes the
# result a true mean of the two inputs and leaves the argmax unchanged.
pred_global = (pred_video + pred_audio) / 2 # mean

In [60]:
# Report the top-scoring emotion of each stream and of their fusion,
# followed by the clip's true label.
for caption, scores in (
    ('Video prediction:\t', pred_video),
    ('Audio prediction:\t', pred_audio),
    ('Global prediction:\t', pred_global),
):
    print(caption, emotions[scores.argmax()])

print('Ground truth:\t\t', emotions[label])

Video prediction:	 surprise
Audio prediction:	 calm
Global prediction:	 calm
Ground truth:		 calm


In [61]:
# Print Results
root = tk.Tk()

# root window title and dimension
root.title("Results predictions")

canvas1 = tk.Canvas(root, width=400, height=300, relief='raised')
canvas1.pack()

# One row per result: (text, vertical position on the canvas, extra options).
# The ground-truth row is greyed out to set it apart from the predictions.
result_rows = [
    (f'Video prediction:\t{emotions[pred_video.argmax()]}', 25, {}),
    (f'Audio prediction:\t{emotions[pred_audio.argmax()]}', 50, {}),
    (f'Global prediction:\t{emotions[pred_global.argmax()]}', 75, {}),
    (f'Ground truth:\t{emotions[label]}', 120, {'fg': 'gray'}),
]

for row_text, row_y, row_options in result_rows:
    row_label = tk.Label(root, text=row_text)
    row_label.config(font=('helvetica', 14), **row_options)
    canvas1.create_window(20, row_y, window=row_label, anchor='w')

# The button closes the window, which ends the main loop below
button1 = tk.Button(text='Close', command=root.destroy, font=('helvetica', 12, 'bold'))
canvas1.create_window(200, 200, window=button1)

root.mainloop()