In [1]:
# Utilities
import os
import numpy as np
from tensorflow import keras

# Audio and video manipulation
import moviepy.editor as mp
import cv2
import librosa
from sklearn.preprocessing import StandardScaler

In [2]:
# Label mappings
# emotions_tras: (emotion code parsed from the file name, minus one) -> model class index
emotions_tras = dict(zip(range(1, 8), (1, 4, 5, 0, 3, 2, 6)))
# emotions: model class index -> readable emotion name
emotions = dict(enumerate(('angry', 'calm', 'disgust', 'fear', 'happy', 'sad', 'surprise')))

# Locations of the demo clips and of the saved models for each stream
dataset_path = "Datasets/Demo/"
models_audio_path = "Models/Audio_stream/"
models_video_path = "Models/Video_stream/"

# Face crops are resized to a square of this many pixels per side
height_targ = width_targ = 112
# Audio sample rate (Hz) used when decoding the clip's sound track
sr = 48000

In [18]:
# Select video
example = 0

# os.listdir returns entries in arbitrary order, so sort to make `example`
# always refer to the same clip; hidden entries (e.g. .DS_Store,
# .ipynb_checkpoints) are skipped because they are not demo clips.
fn = sorted(name for name in os.listdir(dataset_path) if not name.startswith('.'))
filename = os.path.join(dataset_path, fn[example])
# The third dash-separated field of the file name is read as an integer emotion
# code; (code - 1) is then remapped to the class index used by `emotions`.
# A code whose (code - 1) is not a key of `emotions_tras` raises KeyError.
label = emotions_tras[int(fn[example].split('-')[2]) - 1] # transposition of the emotions

## Data preparation

### Video

In [19]:
# Build a stack of normalised, face-cropped grayscale frames from the selected clip.
cap = cv2.VideoCapture(filename)
haar_cascade = cv2.CascadeClassifier('./Other/haarcascade_frontalface_default.xml')
frames = []
count = 0
skip = 3

try:
    # Loop through all frames
    while True:
        # Capture frame
        ret, frame = cap.read()
        if not ret:
            # End of stream (or the file could not be read): stop right away
            # rather than looping until `count` next meets the sampling condition.
            break
        # Sample every `skip`-th frame, ignoring the first 21 frames of the clip
        if (count % skip == 0 and count > 20):
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # detect and crop face
            faces = haar_cascade.detectMultiScale(frame, scaleFactor=1.12, minNeighbors=9)
            if len(faces) != 1:
                # NOTE(review): `count` is not advanced here, so the very next
                # frame is tried as a replacement and the sampling phase shifts.
                # Kept as-is to leave the selected frames unchanged -- confirm
                # this matches the preprocessing used at training time.
                continue
            x, y, w, h = faces[0]
            face = frame[y:y + h, x:x + w]

            # cv2.resize takes dsize as (width, height); resize slightly larger
            # than the target and trim a 5-pixel border on every side.
            face = cv2.resize(face, (width_targ + 10, height_targ + 10))
            face = face[5:-5, 5:-5]
            face = face/255.  # scale pixel values to [0, 1]
            frames.append(face)
        count += 1
finally:
    # Always free the decoder handle, even if face detection raises
    cap.release()

if not frames:
    raise RuntimeError(f"No usable face frames could be extracted from {filename!r}")

frames = np.array(frames)
num_frames = len(frames)
labels = [label] * num_frames
print('shape frames:', frames.shape)

shape frames: (34, 112, 112)


In [20]:
# plt.imshow(frames[20], cmap='gray')

In [21]:
# frames_tf = tf.data.Dataset.from_tensor_slices(frames)
# labels_tf = tf.data.Dataset.from_tensor_slices(labels)

### Audio

In [22]:
# Open the sound track of the selected clip, then set its sampling rate to `sr`
audiofile = mp.AudioFileClip(filename)
audiofile = audiofile.set_fps(sr)

In [23]:
# Decode the clip to a NumPy array (rows are samples; each row is indexable by channel)
audio = audiofile.to_soundarray()
# Keep a 3-second excerpt that starts half a second into the clip
excerpt_start = int(sr/2)
excerpt_stop = int(sr/2 + sr*3)
audio = audio[excerpt_start:excerpt_stop]
# Keep the first channel only. A column slice replaces the per-sample Python
# loop; ascontiguousarray gives the same standalone 1-D buffer the list-based
# construction produced (a plain slice would be a strided view).
audio = np.ascontiguousarray(audio[:, 0])

In [24]:
# # plt.figure(figsize = (15, 5))
# import librosa.display
# import IPython.display as ipd
# librosa.display.waveshow(audio, sr = sr)
# ipd.Audio(audio, )

In [25]:
# Mel spectrogram of the audio excerpt, converted from power to decibels.
# The signal is passed as `y=`: librosa >= 0.10 made it keyword-only, so the
# positional form raises TypeError there, while the keyword form also works on
# older releases. `sr` replaces the duplicated literal and fmax is its Nyquist
# frequency (24000 Hz at 48 kHz).
mel = librosa.power_to_db(
    librosa.feature.melspectrogram(y = audio, sr = sr, n_fft = 1024, n_mels = 128, fmin = 50, fmax = sr // 2)
)
# plt.figure()
# librosa.display.specshow(mel, sr = 48000, x_axis = 'time', y_axis = 'mel', cmap = 'magma')
# plt.colorbar(label = 'dB')
# plt.title('Mel-Spectrogram (dB)', fontdict = dict(size = 15))
# plt.xlabel('Time', fontdict = dict(size = 12))
# plt.ylabel('Frequency', fontdict = dict(size = 12))
# plt.show()

# Standardise the spectrogram column by column.
# NOTE(review): the scaler is fitted on this single clip -- confirm the audio
# model was trained on inputs normalised the same way.
scaler = StandardScaler()
mel = scaler.fit_transform(mel)

# Add a trailing channel axis and a leading batch axis
mel = np.expand_dims(mel, axis = 2)
mel = np.expand_dims(mel, axis = 0)
mel.shape

(1, 128, 282, 1)

## Load best models

In [26]:
# Saved video models carry a number in square brackets in their name, which is
# parsed below and used to pick the best one. Entries without brackets (hidden
# files, checkpoints folders, ...) are ignored so they cannot break the parsing,
# and the list is sorted because os.listdir order is arbitrary (stable tie-break).
models_list = sorted(
    name for name in os.listdir(models_video_path)
    if '[' in name and ']' in name
)
if not models_list:
    raise FileNotFoundError(f"No '[accuracy]'-tagged model found in {models_video_path!r}")

acc = [float(model.split('[')[1].split(']')[0]) for model in models_list]
idx = acc.index(max(acc))                                                       # index of best model

model_video = keras.models.load_model(os.path.join(models_video_path, models_list[idx]))
# model_video.summary()

In [27]:
# Load the audio-stream model: the first entry of the models folder.
# Sorted and filtered for hidden files so the choice is deterministic
# (os.listdir order is arbitrary and may start with e.g. .DS_Store).
models_list = sorted(name for name in os.listdir(models_audio_path) if not name.startswith('.'))
if not models_list:
    raise FileNotFoundError(f"No model found in {models_audio_path!r}")
model_audio = keras.models.load_model(os.path.join(models_audio_path, models_list[0]))
# model_audio.summary()

## Predictions

### Video

In [28]:
# One score vector per extracted frame, averaged over frames into a clip-level vector
pred = model_video.predict(frames)
pred_video = pred.mean(axis=0)
pred_video



array([1.5253071e-02, 3.2069898e-01, 1.2097736e-04, 1.1863795e-03,
       3.0359864e-01, 8.1140250e-03, 3.5102782e-01], dtype=float32)

### Audio

In [30]:
# Score the spectrogram batch and collapse the batch axis into one vector
pred = model_audio.predict(mel)
pred_audio = pred.mean(axis=0)
pred_audio



array([3.4228640e-07, 9.9562365e-01, 9.4374154e-06, 6.2736740e-07,
       2.8438035e-05, 4.3374286e-03, 7.2007183e-12], dtype=float32)

### Global

In [31]:
# Late fusion: average the two streams' score vectors. Dividing by 2 makes this
# the mean the comment promised (it was a plain sum); the argmax is unchanged.
pred_global = (pred_video + pred_audio) / 2 # mean

In [32]:
# Report the top class of each stream, then the fused decision and the true label
for title, scores in (('Video prediction:', pred_video), ('Audio prediction:', pred_audio)):
    print(title, emotions[scores.argmax()])

print('Global prediction:', emotions[pred_global.argmax()])
print('Ground truth:', emotions[label])

Video prediction: surprise
Audio prediction: calm
Global prediction: calm
Ground truth: calm
