In [1]:
import json
import os
import random

import matplotlib as plt  # NOTE(review): rebound to pyplot by the import on the next line
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.optimizers import Adam

tf.keras.backend.clear_session()
%precision 4

L2_WEIGHT_DECAY = 0.01
L1_WEIGHT_DECAY = 0.003

In [2]:
# Enable on-demand memory growth on every GPU that TensorFlow reports.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # With no GPUs the loop body never runs, so no separate emptiness check is needed.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Report the failure and carry on rather than aborting the notebook.
    print(err)

In [3]:
def load_json_data(path):
    """Read MFCC features and labels from a JSON file.

    The file must contain a JSON object with the keys "mfcc" and "label".
    The shape of the returned feature array is printed as a side effect.

    Args:
        path: Location of the JSON file to read.

    Returns:
        A ``(mfcc, label)`` tuple of NumPy arrays. ``mfcc`` has one extra
        trailing axis of length 1 compared with the data stored in the file.
    """
    with open(path, "r") as handle:
        payload = json.load(handle)

    features = np.array(payload["mfcc"])
    labels = np.array(payload["label"])

    # Append a singleton axis at the end, e.g. (N, 87) -> (N, 87, 1).
    features = features[..., np.newaxis]
    print(features.shape)

    return features, labels

In [4]:
train_path = "../parse_dataset_labels/parse_sound_files/tess_ravdess_train_norm.json"
validate_path = "../parse_dataset_labels/parse_sound_files/tess_ravdess_validation_norm.json"
test_path = "../parse_dataset_labels/parse_sound_files/tess_ravdess_test_norm.json"


def shuffle_in_unison(features, labels):
    """Reorder features and labels with one shared random permutation.

    A list of row indices is shuffled with ``random.shuffle`` and both arrays
    are indexed with it, so every feature row stays paired with its label
    without first converting the arrays into Python tuples.

    Args:
        features: NumPy array whose first axis enumerates the samples.
        labels: NumPy array with one entry per sample.

    Returns:
        A ``(features, labels)`` tuple of new arrays in the shuffled order.

    Raises:
        ValueError: If the two arrays do not hold the same number of samples.
    """
    if len(features) != len(labels):
        raise ValueError(
            "features and labels differ in length: {} != {}".format(
                len(features), len(labels)
            )
        )
    order = list(range(len(features)))
    random.shuffle(order)
    return features[order], labels[order]


# Load every split from its JSON file (each call prints the feature shape).
train_mfcc_list, train_label_list = load_json_data(train_path)
validate_mfcc_list, validate_label_list = load_json_data(validate_path)
test_mfcc_list, test_label_list = load_json_data(test_path)

# Shuffle each split independently, keeping features and labels aligned.
train_mfcc, train_label = shuffle_in_unison(train_mfcc_list, train_label_list)
validate_mfcc, validate_label = shuffle_in_unison(validate_mfcc_list, validate_label_list)
test_mfcc, test_label = shuffle_in_unison(test_mfcc_list, test_label_list)


(9660, 87, 1)
(1260, 87, 1)
(1121, 87, 1)


In [5]:
# Number of target classes; matches the 5-unit softmax layer of the model.
NUM_CLASSES = 5

# One-hot encode the integer class ids. num_classes is passed explicitly so
# every split gets the same width even if one of them happens to lack the
# highest class id (otherwise the width is inferred from each array's maximum).
# Note: the label variables are overwritten, so re-running this cell without
# re-running the loading cell would encode the already one-hot arrays again.
train_label = tf.keras.utils.to_categorical(train_label, num_classes=NUM_CLASSES)
test_label = tf.keras.utils.to_categorical(test_label, num_classes=NUM_CLASSES)
validate_label = tf.keras.utils.to_categorical(validate_label, num_classes=NUM_CLASSES)
print(test_label.shape)

(1121, 5)


In [6]:
# AlexNet-style network. Each sample is declared as 87 time steps x 1 value x 1 channel.
# NOTE(review): the loading cell prints arrays of shape (N, 87, 1), i.e. one axis fewer
# than this input declares — presumably Keras adds the trailing axis; confirm.
input_layer = layers.Input(shape=(87, 1, 1))


def _conv_block(tensor, filters, kernel_size, strides=1, pool=False):
    """Apply Conv2D -> BatchNormalization -> ReLU, then optional 3x3 / stride-2 max pooling.

    NOTE(review): he_normal is also used as the bias initializer, which is unusual — confirm it is intended.
    """
    tensor = layers.Conv2D(
        filters,
        kernel_size,
        strides=strides,
        padding="same",
        kernel_initializer='he_normal',
        bias_initializer="he_normal",
        kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
    )(tensor)
    tensor = layers.BatchNormalization()(tensor)
    tensor = layers.Activation("relu")(tensor)
    if pool:
        tensor = layers.MaxPool2D((3, 3), strides=2, padding="same")(tensor)
    return tensor


def _hidden_dense(tensor):
    """Apply one 4096-unit, L2-regularised, ReLU-activated fully connected layer."""
    return layers.Dense(
        4096,
        activation="relu",
        kernel_initializer='he_normal',
        bias_initializer="he_normal",
        kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
    )(tensor)


# Convolutional feature extractor: five stages, pooled after the 1st, 2nd and 5th.
x = _conv_block(input_layer, 96, (11, 11), strides=4, pool=True)
x = _conv_block(x, 256, (5, 5), pool=True)
x = _conv_block(x, 384, (5, 5))
x = _conv_block(x, 384, (5, 5))
x = _conv_block(x, 256, (5, 5), pool=True)

# Classifier head: two hidden layers followed by an unregularised 5-way softmax.
x = layers.Flatten()(x)
x = _hidden_dense(x)
x = _hidden_dense(x)
x = layers.Dense(
    5,
    activation="softmax",
    kernel_initializer='he_normal',
    bias_initializer="he_normal",
)(x)

model = Model(input_layer, x, name='alexNet')

model.summary()

Model: "alexNet"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 87, 1, 1)]        0         
_________________________________________________________________
conv2d (Conv2D)              (None, 22, 1, 96)         11712     
_________________________________________________________________
batch_normalization (BatchNo (None, 22, 1, 96)         384       
_________________________________________________________________
activation (Activation)      (None, 22, 1, 96)         0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 11, 1, 96)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 11, 1, 256)        614656    
_________________________________________________________________
batch_normalization_1 (Batch (None, 11, 1, 256)        1024

In [7]:
# input_layer = tf.keras.layers.Input(shape=(train_mfcc.shape[1], train_mfcc.shape[2], train_mfcc.shape[3]))
# x = tf.keras.layers.Conv2D(96, (3,3), activation="relu", kernel_initializer='he_normal', bias_initializer="he_normal")(input_layer)
# x = tf.keras.layers.MaxPool2D((3,3), strides=2, padding="same")(x)
# # Batch normalization standardizes the activations of the current layer before they are passed on to the next layer. This helps
# # the model converge a lot faster because normalized values flow through the model.
# x = tf.keras.layers.BatchNormalization()(x)

# x = tf.keras.layers.Conv2D(256, (3,3), activation="relu", kernel_initializer='he_normal', bias_initializer="he_normal")(x)
# x = tf.keras.layers.MaxPool2D((3,3), strides=2, padding="same")(x)
# x = tf.keras.layers.BatchNormalization()(x)

# x = tf.keras.layers.Conv2D(512, (3,3), activation="relu", kernel_initializer='he_normal', bias_initializer="he_normal")(x)
# x = tf.keras.layers.MaxPool2D((3,3), strides=2, padding="same")(x)
# x = tf.keras.layers.BatchNormalization()(x)

# x = tf.keras.layers.Flatten()(x)

In [8]:
# Train with momentum SGD on categorical cross-entropy. The accuracy metric is
# registered under the short name 'acc', so its validation counterpart is
# logged as 'val_acc'.
model.compile(
    loss='categorical_crossentropy',
    metrics=['acc'],
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, decay=0.0001),
)

In [9]:
# Where the best model seen so far is saved during training.
check_points = "../checkpoint/checkpoint_sound.hb/"
# dirname() of a path that ends in "/" returns the same path without the
# trailing separator, i.e. "../checkpoint/checkpoint_sound.hb".
check_point_dir = os.path.dirname(check_points)

# Save only when validation accuracy improves. Higher is better, so the
# direction is stated explicitly instead of being inferred from the metric name.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=check_point_dir,
    monitor="val_acc",
    mode="max",
    save_best_only=True,
    verbose=1,
)

In [10]:
# Training hyper-parameters for this run.
BATCH_SIZE = 32
EPOCHS = 15

# Keep the object returned by fit() so the per-epoch metrics remain available
# after training (e.g. for plotting) instead of being discarded.
history = model.fit(
    train_mfcc,
    train_label,
    validation_data=(validate_mfcc, validate_label),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[cp_callback],  # saves the model whenever val_acc improves
    verbose=1,
)

Epoch 1/15
Epoch 00001: val_acc improved from -inf to 0.27063, saving model to ../checkpoint\checkpoint_sound.hb
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ../checkpoint\checkpoint_sound.hb\assets
Epoch 2/15
Epoch 00002: val_acc did not improve from 0.27063
Epoch 3/15
Epoch 00003: val_acc did not improve from 0.27063
Epoch 4/15
Epoch 00004: val_acc did not improve from 0.27063
Epoch 5/15
Epoch 00005: val_acc did not improve from 0.27063
Epoch 6/15
Epoch 00006: val_acc improved from 0.27063 to 0.32698, saving model to ../checkpoint\checkpoint_sound.hb
INFO:tensorflow:Assets written to: ../checkpoint\checkpoint_sound.hb\assets
Epoch 7/15
 43/302 [===>..........................] - ETA: 3:18 - loss: 1.6811 - acc: 0.4077

KeyboardInterrupt: 