In [None]:
import tensorflow as tf
import numpy as np
import os
import random
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
plt.rc('font', size=16)
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold 
import warnings
# Suppress Python warnings and TensorFlow log messages below the ERROR level
warnings.filterwarnings('ignore')
tf.get_logger().setLevel('ERROR')

# Short aliases for the Keras namespaces used in the cells below
tfk = tf.keras
tfkl = tf.keras.layers
# Show the installed TensorFlow version
print(tf.__version__)

2.7.0


In [None]:
# Fix the seed of every random number generator in use, for reproducibility
seed = 42

os.environ['PYTHONHASHSEED'] = str(seed)
# Python, NumPy and TensorFlow (v2 and v1-compat APIs) all receive the same seed
for set_seed in (random.seed, np.random.seed, tf.random.set_seed, tf.compat.v1.set_random_seed):
    set_seed(seed)

In [None]:
# Read the CSV file
dataset = pd.read_csv('Training.csv')

# Print the shape of dataset
print(dataset.shape)

# Show the first rows of dataset
dataset.head()

(68528, 7)


Unnamed: 0,Sponginess,Wonder level,Crunchiness,Loudness on impact,Meme creativity,Soap slipperiness,Hype root
0,7.97698,4.33494,10.67282,1.76692,3.2244,51.68146,3.65434
1,8.07824,4.44616,10.5616,1.70716,3.32566,51.563598,3.47672
2,8.02844,4.22372,10.5616,1.64906,3.1746,50.86308,3.47672
3,8.02844,4.22372,10.5616,1.70716,3.1746,45.841581,3.47672
4,7.87572,4.44616,10.45038,1.70716,3.27586,47.126421,3.47672


In [None]:
# Build sequence 
def build_sequences(df, target_labels, window=200, stride=20, telescope=100):
    """Slice a time-ordered DataFrame into (input window, future target) pairs.

    The data is left-padded with zero rows until its length is a multiple of
    `window`. A window of `window` rows is then taken every `stride` rows and
    paired with the `telescope` rows that immediately follow it.

    Args:
        df: pandas DataFrame whose rows are consecutive time steps.
        target_labels: list-like of column labels selected from `df` as targets.
        window: number of rows in each input sequence.
        stride: number of rows between the starts of consecutive windows.
        telescope: number of rows following each window used as its target.

    Returns:
        A tuple `(dataset, labels)` of NumPy arrays with shapes
        `(n, window, df.shape[1])` and `(n, telescope, len(target_labels))`,
        where `n` is the number of extracted windows.

    Raises:
        AssertionError: if `window` is not a multiple of `stride`.
    """
    # Sanity check to avoid runtime errors
    assert window % stride == 0
    dataset = []
    labels = []
    temp_df = df.copy().values
    temp_label = df[target_labels].copy().values

    remainder = len(temp_df) % window
    if remainder != 0:
        # Zero rows are prepended (not appended) so the most recent samples
        # stay at the end of the series
        padding_len = window - remainder
        padding = np.zeros((padding_len, temp_df.shape[1]), dtype='float64')
        temp_df = np.concatenate((padding, temp_df))
        padding = np.zeros((padding_len, temp_label.shape[1]), dtype='float64')
        temp_label = np.concatenate((padding, temp_label))

        assert len(temp_df) % window == 0

    # Last start index for which both the window and its full telescope fit.
    # The stop of arange is exclusive, hence the +1: without it the final
    # complete pair is dropped whenever last_start is a multiple of stride.
    last_start = len(temp_df) - window - telescope
    for idx in np.arange(0, last_start + 1, stride):
        dataset.append(temp_df[idx:idx + window])
        labels.append(temp_label[idx + window:idx + window + telescope])

    dataset = np.array(dataset)
    labels = np.array(labels)
    return dataset, labels

In [None]:
# Window size: number of time steps in each input sequence
window = 300
# Stride size: number of time steps between the starts of consecutive windows
stride = 20

In [None]:
# Use every column of the dataset as a prediction target
target_labels = dataset.columns
# Telescope size: number of future time steps paired with each window as its target
telescope = 50

In [None]:
# Build the input windows X and the matching future targets y, then show their shapes
X, y = build_sequences(dataset, target_labels, window, stride, telescope)
X.shape, y.shape

((3418, 300, 7), (3418, 50, 7))

In [None]:
# Apply 5-fold cross validation (shuffle=False, so the windows keep their original order)
kf = KFold(n_splits=5, random_state=None, shuffle=False)
indexes = kf.split(X,y)
# For each fold, print the first ten train/test indices and the size of each split
for train_index, test_index in indexes:
    print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
    print("TRAIN:", train_index.shape, "TEST:", test_index.shape)

TRAIN: [684 685 686 687 688 689 690 691 692 693] TEST: [0 1 2 3 4 5 6 7 8 9]
TRAIN: (2734,) TEST: (684,)
TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [684 685 686 687 688 689 690 691 692 693]
TRAIN: (2734,) TEST: (684,)
TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [1368 1369 1370 1371 1372 1373 1374 1375 1376 1377]
TRAIN: (2734,) TEST: (684,)
TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [2052 2053 2054 2055 2056 2057 2058 2059 2060 2061]
TRAIN: (2735,) TEST: (683,)
TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [2735 2736 2737 2738 2739 2740 2741 2742 2743 2744]
TRAIN: (2735,) TEST: (683,)


In [None]:
# Set input shape (shape of one sample of X, without the leading sample axis)
input_shape = X.shape[1:]
# Set output shape (shape of one sample of y, without the leading sample axis)
output_shape = y.shape[1:]
# Set the batch size
batch_size = 64
# Set the maximum number of epochs for training (early stopping may end a fit sooner)
epochs = 200

In [None]:
# Build the Bidirectional recurrent neural network
def build_CONV_Bidirectional_model(input_shape, output_shape):
    """Create and compile the bidirectional-LSTM + Conv1D forecasting network.

    Args:
        input_shape: shape of one input sample, passed to the Keras Input layer.
        output_shape: shape of one target sample; its last two entries are used
            as (number of predicted steps, number of predicted channels).

    Returns:
        A compiled Keras Model using mean squared error as loss, the Adam
        optimizer and the 'mae' and 'mse' metrics.
    """
    horizon, n_channels = output_shape[-2], output_shape[-1]

    inputs = tfkl.Input(shape=input_shape, name='Input')

    # First recurrent + convolutional stage, followed by temporal max-pooling
    features = tfkl.Bidirectional(tfkl.LSTM(64, return_sequences=True))(inputs)
    features = tfkl.Conv1D(128, 3, padding='same', activation='relu')(features)
    features = tfkl.MaxPool1D()(features)

    # Second recurrent + convolutional stage, collapsed over time and regularised
    features = tfkl.Bidirectional(tfkl.LSTM(128, return_sequences=True))(features)
    features = tfkl.Conv1D(256, 3, padding='same', activation='relu')(features)
    features = tfkl.GlobalAveragePooling1D()(features)
    features = tfkl.Dropout(.5)(features)

    # Head: one dense unit per predicted value, reshaped to (steps, channels)
    # and refined by a kernel-size-1 convolution
    flat_forecast = tfkl.Dense(n_channels * horizon, activation='relu')(features)
    forecast = tfkl.Reshape((horizon, n_channels))(flat_forecast)
    forecast = tfkl.Conv1D(n_channels, 1, padding='same')(forecast)

    # Tie input and output together and compile
    network = tfk.Model(inputs=inputs, outputs=forecast, name='model')
    network.compile(
        loss=tfk.losses.MeanSquaredError(),
        optimizer=tfk.optimizers.Adam(),
        metrics=['mae', 'mse'],
    )
    return network

In [None]:
# Per-fold results, filled by the cross-validation loop
mse = []     # mean squared error of each fold
mae = []     # mean absolute error of each fold
models = []  # trained model of each fold

In [None]:
# One iteration per cross-validation fold; k is the 1-based fold number
for k, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
  # Fresh, untrained network for this fold
  model = build_CONV_Bidirectional_model(input_shape, output_shape)

  training_callbacks = [
      # Stop when the validation loss stops improving and restore the best weights
      tfk.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=10, restore_best_weights=True),
      # Halve the learning rate when the validation loss has stopped improving
      tfk.callbacks.ReduceLROnPlateau(monitor='val_loss', mode='min', patience=5, factor=0.5, min_lr=1e-5),
  ]

  # Fit on the training part of the fold, holding out 10% of it for validation
  history = model.fit(
      x=X[train_index],
      y=y[train_index],
      batch_size=batch_size,
      epochs=epochs,
      validation_split=.1,
      callbacks=training_callbacks,
  ).history

  # Predict the held-out part of the fold
  predictions = model.predict(X[test_index])
  fold_targets = y[test_index].flatten()
  fold_predictions = predictions.flatten()

  # Mean squared / mean absolute error over all flattened test values of the fold
  mean_squared_error = tfk.metrics.mse(fold_targets, fold_predictions)
  mean_absolute_error = tfk.metrics.mae(fold_targets, fold_predictions)

  mse.append(mean_squared_error)
  mae.append(mean_absolute_error)
  models.append(model)

  # Save the five models, one for each step of cross validation. Only the model
  # with the best rmse is submitted to Codalab: we noticed that the best fold model
  # gave a better performance on Codalab than the one retrained on the entire dataset.
  model.save('Bidirectional_tel50_ver' + str(k))



Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200




Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200




Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200




Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200




Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200




In [None]:
# Cross validation, with this data, is pessimistically biased: the rmse score
# given by this fit is worse than the score given by the submission on Codalab.

# Print the mean squared error of each model first, then the mean absolute
# error of each model, one value per line
for fold_scores in (mse, mae):
  for score in fold_scores:
    print(score.numpy())


55.298748
19.436071
35.455498
22.377926
40.232815
4.796805
2.8483741
3.6275663
2.6400597
3.7797348


In [None]:
# Compute the mean of mse for the models
mse_cv = tf.math.reduce_mean(mse)

# Compute the rmse as the square root of the averaged mse
# (note: this is not the mean of the per-model rmse values)
rmse_cv = tf.math.sqrt(mse_cv.numpy())

# Compute the mean of mae for the models
mae_cv = tf.math.reduce_mean(mae)

print('mse average: ', mse_cv.numpy())
print('rmse average: ', rmse_cv.numpy())
print('mae average: ', mae_cv.numpy())

mse average:  34.56021
rmse average:  5.8787937
mae average:  3.538508


### Model Ensemble

During this phase we also tried a model ensemble approach. We started by increasing the window size, looking for models with a good score on the training set and a poor one on the test set. Once we had obtained these overfitting models, we averaged their 5 predictions. This approach gave us worse results than the single model, probably because the models were still too similar to each other.