In [58]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
import pydub
from pydub import AudioSegment, effects
import librosa
from librosa import display
import noisereduce as nr
import tensorflow as tf
import keras
import sklearn

from tqdm import tqdm

import skimage.io
from skimage.transform import rescale, resize, downscale_local_mean
from skimage.util import img_as_ubyte

In [59]:
# Side lengths of the prepared images (presumably pixels); IMG_SIZE[0] is
# embedded in the dataset folder name.
IMG_SIZE = (224, 224)
# Segment length and step between segments; the 's' suffix used in TAG
# suggests seconds — TODO confirm against the preparation notebook.
SEGMENT_DURATION = 5
SEGMENT_STEP = 2

# Noise-reduction flag. "NOICE" is a typo for "NOISE"; the name is left
# unchanged because other cells may reference it.
NOICE_REDUCTION = True

# Run tag built from the settings above; it is appended to the saved model's
# directory name.
TAG = f'nr{NOICE_REDUCTION}_step{SEGMENT_STEP}s_len{SEGMENT_DURATION}s'

In [60]:
# Folder holding one sub-folder of prepared images per emotion.
# NOTE: despite the name, the cells shown here only read from this folder.
# NOTE(review): the path does not include the noise-reduction flag that TAG
# carries — confirm both settings of that flag are not written to the same place.
OUTPUT_FOLDER = f'../../data/prepared/mel-spectrogram/combined/prepared_images_{IMG_SIZE[0]}_step{SEGMENT_STEP}s_len{SEGMENT_DURATION}s'
# Class names; the position of a name in this list is its integer class id.
EMOTIONS = [
    'happy',
    'surprise',
    'anger',
    'sad',
    'neutral',
    'disgust',
    'fear',
]

# Name -> class id lookup, derived from EMOTIONS so the two cannot drift apart.
EMOTIONS_MAP = {emotion: class_id for class_id, emotion in enumerate(EMOTIONS)}
#
# IMG_SIZE = (299, 299)
# AUDIO_LENGTH = 5
#
# if not os.path.isdir(OUTPUT_FOLDER):
#     os.mkdir(OUTPUT_FOLDER)
#
# for emotion in EMOTIONS:
#     if not os.path.isdir(f'{OUTPUT_FOLDER}/{emotion}'):
#         os.mkdir(f'{OUTPUT_FOLDER}/{emotion}')

# Modelling

In [61]:
import random

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

from sklearn.model_selection import train_test_split

In [62]:
# Epoch budget for training.
# NOTE(review): the model.fit call visible in this notebook passes epochs=20
# directly and does not read this constant — confirm which value is intended.
EPOCHS = 50

In [63]:
# Load every prepared image from OUTPUT_FOLDER/<emotion>/ and record the
# emotion name as its label. `images` and `labels` stay index-aligned.
images = []
labels = []

for emotion in EMOTIONS:
    emotion_dir = os.path.join(OUTPUT_FOLDER, emotion)
    # os.listdir returns entries in arbitrary order; sorting makes the dataset
    # order (and everything derived from it) reproducible across runs/machines.
    for file_name in tqdm(sorted(os.listdir(emotion_dir))):
        # np.asarray avoids copying when imread already returns an ndarray.
        images.append(np.asarray(skimage.io.imread(os.path.join(emotion_dir, file_name))))
        labels.append(emotion)

100%|██████████| 345/345 [00:00<00:00, 487.29it/s]
100%|██████████| 371/371 [00:00<00:00, 445.38it/s]
100%|██████████| 412/412 [00:00<00:00, 507.04it/s]
100%|██████████| 397/397 [00:00<00:00, 634.38it/s]
100%|██████████| 469/469 [00:00<00:00, 631.47it/s]
100%|██████████| 439/439 [00:00<00:00, 643.49it/s]
100%|██████████| 352/352 [00:00<00:00, 631.96it/s]


In [64]:
# Total number of loaded images across all emotion folders.
len(images)

2785

In [65]:
# Sanity check: print the shape and index of every image whose height/width
# differs from the configured IMG_SIZE (no output means all images match).
expected_height, expected_width = IMG_SIZE
for index, img in enumerate(images):
    if img.shape[0] != expected_height or img.shape[1] != expected_width:
        print(img.shape)
        print(index)

In [66]:
# NOTE: this cell rebinds `images`/`labels`; re-running it without re-running
# the loading cell would rescale and re-expand already-processed data.

# Stack the images into one array and keep the labels index-aligned with it.
images = np.asarray(images)
labels = np.asarray(labels)

# Shuffle images and labels together with a fixed seed. Keras takes the
# validation_split samples from the end of the arrays, so the order decides
# which samples are held out; seeding keeps that hold-out set reproducible.
shuffled_order = np.random.default_rng(42).permutation(len(images))
labels = labels[shuffled_order]

# Scale pixel values to [0, 1]. float32 is what the network computes in and
# needs half the memory of the default float64.
images = images[shuffled_order].astype(np.float32) / 255

# Replicate the single channel three times for the RGB-pretrained backbone.
images = np.repeat(images[..., np.newaxis], 3, -1)  # (n_images, height, width, 3)
print(images.shape)

(2785, 224, 224, 3)


In [67]:
# Convert emotion names to class ids, then to one-hot vectors.
# A direct dict lookup raises KeyError on an unknown name instead of silently
# producing NaN (which is what Series.map does for unmapped values).
# NOTE: this rebinds `labels`; re-running the cell on already-encoded labels fails.
label_ids = np.array([EMOTIONS_MAP[label] for label in labels])
labels = tf.keras.utils.to_categorical(label_ids, len(EMOTIONS))

In [68]:
# X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.33, random_state=42)

In [71]:
def build_model(num_classes=None, learning_rate=0.001):
    """Build and compile the transfer-learning classifier.

    The network is an ImageNet-pretrained ResNet50V2 backbone (global average
    pooled, frozen) followed by a trainable dense head: 512 -> 256 ->
    ``num_classes`` (softmax).

    Args:
        num_classes: Number of output classes. Defaults to ``len(EMOTIONS)``.
        learning_rate: Learning rate for the Nadam optimizer.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy on
        probabilities (``from_logits=False``) and tracking accuracy.

    NOTE(review): Keras provides ``resnet_v2.preprocess_input`` for this
    backbone; this notebook feeds inputs scaled by 1/255 instead — confirm
    that is intentional.
    """
    if num_classes is None:
        num_classes = len(EMOTIONS)

    backbone = tf.keras.applications.ResNet50V2(
        include_top=False,
        weights="imagenet",
        pooling="avg"
    )
    # The pretrained ResNet50V2 weights stay fixed; only the dense head trains.
    backbone.trainable = False

    model = Sequential([
        backbone,
        Dense(512, activation='relu'),
        Dense(256, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])

    model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=learning_rate),
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                  metrics=['accuracy'])

    return model

In [72]:
# Instantiate the classifier and print its layer/parameter overview.
model = build_model()
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet50v2 (Functional)     (None, 2048)              23564800  
                                                                 
 dense_11 (Dense)            (None, 512)               1049088   
                                                                 
 dense_12 (Dense)            (None, 256)               131328    
                                                                 
 dense_13 (Dense)            (None, 7)                 1799      
                                                                 
Total params: 24,747,015
Trainable params: 1,182,215
Non-trainable params: 23,564,800
_________________________________________________________________


In [73]:
# Stop training once val_loss has not improved for 5 consecutive epochs and
# roll the model back to the weights of the best val_loss epoch. Without
# restore_best_weights the model keeps (and later saves) the weights from the
# final epoch, i.e. `patience` epochs past the best one.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [74]:
# Run a full garbage-collection pass before training; the displayed value is
# the number of unreachable objects the collector found.
import gc
gc.collect()

8817

In [75]:
# Train on the full dataset. Keras holds out 25% of the samples for validation
# and the early-stopping callback may end the run before the epoch limit.
# NOTE(review): the epoch limit is given literally here; the EPOCHS constant
# defined earlier in the notebook is not used.
train_history = model.fit(
    images,
    labels,
    epochs=20,
    validation_split=0.25,
    callbacks=[stop_early],
    shuffle=True,
    batch_size=6,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [76]:
# Find the (1-based) epoch with the highest validation accuracy; on ties the
# earliest epoch wins.
val_acc_per_epoch = train_history.history['val_accuracy']
best_index, best_val_acc = max(enumerate(val_acc_per_epoch), key=lambda pair: pair[1])
best_epoch = best_index + 1
print(f'Best epoch: {best_epoch}')

Best epoch: 6


In [77]:
# Persist the trained model; the directory name ends with the run TAG.
# NOTE(review): "acc92" is a fixed part of the name, not computed from this run.
model_dir = f'models/ResNet50V2_Dense256_combined_acc92_{TAG}'
model.save(model_dir)



INFO:tensorflow:Assets written to: models/ResNet50V2_Dense256_combined_acc92_nrTrue_step2s_len5s\assets


INFO:tensorflow:Assets written to: models/ResNet50V2_Dense256_combined_acc92_nrTrue_step2s_len5s\assets


In [328]:
# Predict class probabilities for the first image; a leading batch axis is
# added because the model expects a batch of images.
first_image_batch = images[0][np.newaxis, ...]
a = model.predict(first_image_batch)



In [333]:
# Show the predicted class id and the emotion name mapped to that id.
predicted_index = int(np.argmax(a))
predicted_emotion = next(
    emotion
    for emotion, class_id in EMOTIONS_MAP.items()
    if class_id == predicted_index
)
print(predicted_index)
print(predicted_emotion)

3
sad


In [334]:
# One-hot ground-truth label of the first image, for comparison with the
# prediction printed above.
print(labels[0])

[0. 0. 0. 1. 0. 0. 0.]
