In [1]:
from google.colab import drive
import os
import pandas as pd
from pathlib import Path

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Mount Google Drive at /content/gdrive so that the shared project folder
# (entered by the os.chdir call further down) is reachable from this runtime.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
# Make the shared "emo-challenge" drive the working directory, so that the
# relative paths used below (data/..., checkpoints/...) resolve inside it.
os.chdir("/content/gdrive/Shareddrives/emo-challenge")

In [6]:
# Sanity check: the printed path should end in .../emo-challenge, otherwise
# the relative data paths used by the following cells will not resolve.
cwd = os.getcwd()
print(cwd)


/content/gdrive/Shareddrives/emo-challenge


In [118]:
### Load in training data - separately assembled
# Both tables are filtered on their 'sub' and 'vid' columns by the training
# data cell; `annot` additionally supplies the 'valence' and 'arousal' targets.

annot = pd.read_csv("data/scenario_1/train/annotations_train.csv")

phys = pd.read_csv('data/scenario_1/train/physiology_train.csv')

In [33]:
### Load in test data - modified provided function

def load_data_no_folds(scenario_dir_path, dataset_type):
    """Assemble all physiology/annotation CSV pairs of one dataset split into one DataFrame.

    The files under ``<scenario_dir_path>/<dataset_type>/physiology`` and
    ``.../annotations`` are paired by sorted file name. Each pair is outer-merged
    on ``time`` and tagged with the subject and video number parsed from the file
    name (expected pattern: ``sub_<subject>_vid_<video>.csv``).

    Args:
        scenario_dir_path: Path of the scenario directory (str or Path).
        dataset_type: Name of the split sub-directory, e.g. "train" or "test".

    Returns:
        A DataFrame with a fresh 0..n-1 row index holding the merged columns of
        every file pair plus the string columns "sub" and "vid". An empty
        DataFrame is returned when the directories contain no files.
        NOTE(review): ``time`` is merged as an index level and the final concat
        uses ``ignore_index=True``, so the result presumably carries no time
        information - confirm this is intended.

    Raises:
        ValueError: If the two directories hold a different number of files
            (``zip`` would otherwise silently ignore the surplus ones).
        AssertionError: If a paired physiology and annotation file differ in name.
    """
    # make paths for the specified dataset
    annotations_dir = Path(scenario_dir_path, dataset_type, "annotations")
    physiology_dir = Path(scenario_dir_path, dataset_type, "physiology")

    # sort contents of dirs, so that physiology and annotations are in the same order
    physiology_files = sorted(physiology_dir.iterdir())
    annotation_files = sorted(annotations_dir.iterdir())

    if len(physiology_files) != len(annotation_files):
        raise ValueError(
            f"Found {len(physiology_files)} physiology files but "
            f"{len(annotation_files)} annotation files for '{dataset_type}'"
        )

    # Collect the per-file frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all previous rows each time.
    frames = []
    for physiology_file_path, annotations_file_path in zip(physiology_files, annotation_files):
        # make sure that we load corresponding physiology and annotations
        assert physiology_file_path.name == annotations_file_path.name, "Order mismatch"

        # load data from files
        df_physiology = pd.read_csv(physiology_file_path, index_col="time")
        df_annotations = pd.read_csv(annotations_file_path, index_col="time")

        # "sub_<subject>_vid_<video>.csv" -> ["sub", <subject>, "vid", <video>]
        split_name = annotations_file_path.name.split('.')[0].split('_')
        sub_no = split_name[1]
        vid_no = split_name[3]

        current_df = pd.merge(df_physiology, df_annotations, on="time", how="outer")
        # A scalar is broadcast to every row, so no per-row list is needed.
        current_df["sub"] = sub_no
        current_df["vid"] = vid_no
        frames.append(current_df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
      
# specify scenario path (relative to the working directory set above)
scenario_dir = "data/scenario_1"

# train data - not loaded here; the training tables are read from the
# separately assembled CSV files instead
#print("Loading train data")
#load_data_no_folds(scenario_dir, "train")

# test data - one DataFrame holding every test file pair, tagged with
# the 'sub' and 'vid' parsed from the file names
print("Loading test data")
test_data = load_data_no_folds(scenario_dir, "test")

Loading test data


In [38]:
### Save assembled test data  for later
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column - account for it when reloading.

test_data.to_csv('data/scenario_1/test/test_data.csv')

In [7]:
#####################################
##### TRAINING DATA PROCESSING ######
#####################################

# Set window size and define maximum number of windows
window_size = 100
max_n_windows = 2538

# Layout constants (these were repeated as bare literals before)
hop_size = window_size // 2      # window i starts at physiology row hop_size * i
n_val_windows = 300              # last 300 ratings (15s) of each recording -> validation set
n_subjects = 30
n_videos = 8
n_phys_channels = 8
n_features = n_phys_channels + n_subjects + n_videos   # 46 values per timepoint

# Initialize training and validation datasets with zeros
X_train = np.zeros((n_subjects * n_videos, max_n_windows - n_val_windows, n_features * window_size))
Y_train = np.zeros((n_subjects * n_videos, max_n_windows - n_val_windows, 2))
X_val = np.zeros((n_subjects * n_videos, n_val_windows, n_features * window_size))
Y_val = np.zeros((n_subjects * n_videos, n_val_windows, 2))

# Build the id lists once; an id's position in its list is its one-hot column.
# NOTE(review): set() iteration order is not guaranteed to be stable between
# interpreter sessions (notably for string ids); the ordering is left as-is so
# it keeps matching the lists derived from `annot` the same way elsewhere.
sub_ids = list(set(annot['sub']))
vid_ids = list(set(annot['vid']))
if len(sub_ids) > n_subjects or len(vid_ids) > n_videos:
    raise ValueError(
        f"Expected at most {n_subjects} subjects and {n_videos} videos, "
        f"got {len(sub_ids)} and {len(vid_ids)}"
    )

for s, sub in enumerate(sub_ids):
    for v, vid in enumerate(vid_ids):

        # Load physiological data. A single combined mask replaces the chained
        # .loc calls, which applied a full-length mask to an already filtered frame.
        # Columns 2..9 are presumably the eight physiological channels - TODO confirm.
        phys_data = np.array(
            phys[(phys['sub'] == sub) & (phys['vid'] == vid)].iloc[:, 2:2 + n_phys_channels]
        )

        # Create one-hot encodings for subject and video
        one_hot = np.zeros((len(phys_data), n_subjects + n_videos))
        one_hot[:, s] = 1
        one_hot[:, n_subjects + v] = 1

        # Stack phys data with one-hot
        phys_data = np.column_stack((phys_data, one_hot))

        # Load ratings of valence and arousal
        ratings = np.array(
            annot.loc[(annot['sub'] == sub) & (annot['vid'] == vid), ['valence', 'arousal']]
        )

        # Stack physiological data and one-hot with ratings across time - define time windows
        num_windows = ratings.shape[0] - 2
        if not n_val_windows <= num_windows <= max_n_windows:
            # Fail with a readable message instead of a broadcasting error below.
            raise ValueError(
                f"sub {sub}, vid {vid}: {num_windows} windows, expected between "
                f"{n_val_windows} and {max_n_windows}"
            )

        stacked_data = np.zeros((num_windows, n_features * window_size + 2))
        for i in range(num_windows):
            start = hop_size * i
            # NOTE(review): window i spans rows [start, start + window_size) and is
            # paired with ratings[i]; confirm this is the intended alignment.
            stacked_data[i] = np.hstack((phys_data[start:start + window_size].flatten(), ratings[i]))

        # Assign batch to the full dataset with pre-padding (leaving 0s in the beginning)
        # Also, use last 300 ratings (15s) as validation set
        row = s * n_videos + v
        first_row = max_n_windows - num_windows

        X_train[row, first_row:] = stacked_data[:-n_val_windows, :-2]
        X_val[row] = stacked_data[-n_val_windows:, :-2]

        Y_train[row, first_row:] = stacked_data[:-n_val_windows, -2:]
        Y_val[row] = stacked_data[-n_val_windows:, -2:]

In [116]:
### Save subject and video orderings to match the test data
# The position of an id in these lists is the one-hot column it gets in the
# test data cell, so they must list ids in the same order as used for training.
# NOTE(review): set() iteration order is not guaranteed to be stable (for
# string ids it can change between interpreter sessions) - consider sorting
# or persisting these lists alongside the trained weights.

sub_list_train = list(set(annot['sub']))

vid_list_train = list(set(annot['vid']))

['12',
 '33',
 '32',
 '18',
 '34',
 '22',
 '8',
 '31',
 '4',
 '6',
 '35',
 '37',
 '9',
 '30',
 '41',
 '13',
 '14',
 '43',
 '17',
 '1',
 '29',
 '26',
 '19',
 '7',
 '11',
 '28',
 '36',
 '45',
 '38',
 '20']

In [141]:
#####################################
##### TEST DATA PROCESSING ######
#####################################

# Feature-only test tensor: one row per (subject, video), 601 windows each,
# every window flattened to 100 timepoints x 46 features = 4600 values.
X_test = np.zeros((240,601,4600))
window_size = 100

for s, sub in enumerate(sub_list_train):
    for v, vid in enumerate(vid_list_train):
        print('Subject #',sub,'; video #',vid)

        # Rows of the current recording; ids in test_data are strings.
        is_current = (test_data['sub'] == str(sub)) & (test_data['vid'] == str(vid))

        # Positional rows 9950..40049 and the first eight columns of the recording.
        phys_data = np.array(test_data[is_current].iloc[9950:40050, :8])

        # One-hot columns: 0..29 mark the subject, 30..37 mark the video.
        one_hot = np.zeros((len(phys_data), 38))
        one_hot[:, [s, 30 + v]] = 1

        # Append the one-hot columns to the physiological columns.
        phys_data = np.concatenate((phys_data, one_hot), axis=1)

        # Sliding windows of 100 rows, advancing by 50 rows per window.
        num_windows = 601
        stacked_data = np.zeros((num_windows, int(46*window_size)))
        for i in range(num_windows):
            first = 50 * i
            stacked_data[i] = phys_data[first:first + 100].ravel()

        # Store the windows in the slot reserved for this (subject, video).
        X_test[s*8 + v, :, :] = stacked_data


Subject # 1 ; video # 1
Subject # 1 ; video # 9
Subject # 1 ; video # 10
Subject # 1 ; video # 11
Subject # 1 ; video # 13
Subject # 1 ; video # 14
Subject # 1 ; video # 18
Subject # 1 ; video # 20
Subject # 4 ; video # 1
Subject # 4 ; video # 9
Subject # 4 ; video # 10
Subject # 4 ; video # 11
Subject # 4 ; video # 13
Subject # 4 ; video # 14
Subject # 4 ; video # 18
Subject # 4 ; video # 20
Subject # 6 ; video # 1
Subject # 6 ; video # 9
Subject # 6 ; video # 10
Subject # 6 ; video # 11
Subject # 6 ; video # 13
Subject # 6 ; video # 14
Subject # 6 ; video # 18
Subject # 6 ; video # 20
Subject # 7 ; video # 1
Subject # 7 ; video # 9
Subject # 7 ; video # 10
Subject # 7 ; video # 11
Subject # 7 ; video # 13
Subject # 7 ; video # 14
Subject # 7 ; video # 18
Subject # 7 ; video # 20
Subject # 8 ; video # 1
Subject # 8 ; video # 9
Subject # 8 ; video # 10
Subject # 8 ; video # 11
Subject # 8 ; video # 13
Subject # 8 ; video # 14
Subject # 8 ; video # 18
Subject # 8 ; video # 20
Subject # 

In [None]:
# Drop the raw tables and the per-recording intermediates of the processing
# loops. After this cell, cells that read `phys` or `annot` cannot be re-run
# without reloading the training CSVs first.
del phys, annot, phys_data, ratings, one_hot, stacked_data #to save memory

In [63]:
# Unflatten every 4600-long window into 100 timepoints x 46 features so the
# arrays can be fed to the recurrent model.
train_shape = (240, 2238, 100, 46)
val_shape = (240, 300, 100, 46)

X_train = X_train.reshape(train_shape)
X_val = X_val.reshape(val_shape)

# The targets keep their shape; this only binds them to the lower-case names
# used by the fitting and evaluation cells.
y_train = Y_train.reshape(train_shape[:2] + (2,))
y_val = Y_val.reshape(val_shape[:2] + (2,))

NameError: ignored

In [142]:
# Same unflattening for the test windows: 4600 -> 100 timepoints x 46 features
X_test = np.reshape(X_test, (240, 601, 100, 46))

In [None]:
X_train.shape  # check shapes

(240, 2538, 100, 46)

In [66]:
X_test.shape  # check shapes

(240, 601, 100, 46)

In [29]:
############################
#### MODEL ARCHITECTURE ####
############################

# Every sample is a single window: 100 timepoints x 46 features.
input_window_shape = (100, 46)

model = keras.Sequential()

# Timepoints whose features are all exactly 0.0 are masked for the layers below.
masking_layer = keras.layers.Masking(mask_value=0.0, input_shape=input_window_shape)
model.add(masking_layer)

# One LSTM layer that emits a single 128-dimensional vector per window.
recurrent_layer = keras.layers.LSTM(units=128)
model.add(recurrent_layer)
print(model.output_shape)

# Linear read-out with two outputs per window (valence and arousal).
output_layer = keras.layers.Dense(units=2, activation='linear')
model.add(output_layer)
print(model.output_shape)

# Mean squared error regression objective, optimised with Adam.
model.compile(loss="mse", optimizer="adam")

(None, 128)
(None, 2)


In [30]:
############################
###### MODEL FITTING #######
############################

# Create an empty dictionary to store the history
# (one entry per fit call, keyed 'batch_<recording>_epoch<pass>')
history_dict = {}

# Create callbacks
# NOTE(review): the same ModelCheckpoint instance is passed to every fit call
# below, and each call validates on a different recording (X_val[i]). With
# save_best_only=True the "best" model is therefore presumably chosen by
# comparing val_loss across different validation sets - confirm this is intended.
callbacks = [
    keras.callbacks.ModelCheckpoint("scen1_lstm_128_gradual_epochs.keras",
                                   save_best_only=True)
]

# Fitting batches serially, one epoch at a time to avoid overfitting (due to resource constraints could not fit in parallel)
# Each fit call sees one recording (2238 windows) as a single batch of size 2238,
# so it performs exactly one gradient step; 20 passes are made over all 240 recordings.
# NOTE(review): the training arrays are pre-padded with all-zero windows whose
# targets are also zero; those rows still enter the MSE unless they are
# excluded (e.g. via sample_weight) - confirm this is intended.
for e in range(20):
  for i in range(240):
      # NOTE(review): the LSTM in the architecture cell is not created with
      # stateful=True, so this call likely has nothing to reset - confirm.
      model.reset_states()  # reset the LSTM layer state between batches
      history = model.fit(X_train[i], y_train[i], batch_size=2238, epochs=1,
                        validation_data=(X_val[i], y_val[i]),
                        callbacks=callbacks, shuffle=False)
      history_dict[f'batch_{i+1}_epoch{e+1}'] = history.history
    #break



In [31]:
# Save the weights as they are after the last fit call. This file is separate
# from the one written by the ModelCheckpoint callback during fitting.
model.save_weights('checkpoints/scen1_end_lstm_128_gradual_epochs')

In [69]:
####################
#### LOAD MODEL ####
####################

# Rebuild the training architecture layer for layer so the stored weights fit.
model1 = keras.Sequential([
    keras.layers.Masking(mask_value=0.0, input_shape=(100, 46)),
    keras.layers.LSTM(units=128),
    keras.layers.Dense(units=2, activation='linear'),
])

# Same objective and optimizer as used for training.
model1.compile(loss="mse", optimizer="adam")

# Restore the weights written at the end of training.
model1.load_weights('checkpoints/scen1_end_lstm_128_gradual_epochs')

(None, 128)
(None, 2)


In [74]:


# Per-recording validation loss of the reloaded model (model1).
mse_list = []

for i in range(240):
    # One recording = 300 validation windows, evaluated as a single batch.
    mse = model1.evaluate(X_val[i], y_val[i], verbose=2, batch_size=300)
    # Rows were filled as s*8+v; integer division keeps the subject index an
    # int, so it prints as "Sub #0" rather than "Sub #0.0".
    print('Sub #'+str(i // 8),' Vid #'+str(i % 8))
    rmse = np.sqrt(mse)
    print("Model RMSE:", rmse)
    mse_list.append(mse)

1/1 - 0s - loss: 1.6619 - 128ms/epoch - 128ms/step
Sub #0.0  Vid #0
Model RMSE: 1.2891324111013773
1/1 - 0s - loss: 0.3211 - 128ms/epoch - 128ms/step
Sub #0.0  Vid #1
Model RMSE: 0.5666706388932672
1/1 - 0s - loss: 2.0603 - 128ms/epoch - 128ms/step
Sub #0.0  Vid #2
Model RMSE: 1.4353819460422965
1/1 - 0s - loss: 1.3378 - 129ms/epoch - 129ms/step
Sub #0.0  Vid #3
Model RMSE: 1.1566158952939047
1/1 - 0s - loss: 1.7211 - 129ms/epoch - 129ms/step
Sub #0.0  Vid #4
Model RMSE: 1.3119148584919704
1/1 - 0s - loss: 0.9915 - 129ms/epoch - 129ms/step
Sub #0.0  Vid #5
Model RMSE: 0.9957597958109019
1/1 - 0s - loss: 0.4816 - 132ms/epoch - 132ms/step
Sub #0.0  Vid #6
Model RMSE: 0.6939895130513942
1/1 - 0s - loss: 1.2735 - 129ms/epoch - 129ms/step
Sub #0.0  Vid #7
Model RMSE: 1.1285162123232615
1/1 - 0s - loss: 0.1944 - 135ms/epoch - 135ms/step
Sub #1.0  Vid #0
Model RMSE: 0.44092868187817275
1/1 - 0s - loss: 0.0767 - 129ms/epoch - 129ms/step
Sub #1.0  Vid #1
Model RMSE: 0.2769468314850631
1/1 - 0s 

KeyboardInterrupt: ignored

In [32]:
##################################
### EVALUATE ON VALIDATION SET ###
##################################

# Per-recording validation loss of the model trained in this session.
mse_list = []

for i in range(240):
    model.reset_states()
    # One recording = 300 validation windows, evaluated as a single batch.
    mse = model.evaluate(X_val[i], y_val[i], verbose=2, batch_size=300)
    # Rows were filled as s*8+v; integer division keeps the subject index an
    # int, so it prints as "Sub #0" rather than "Sub #0.0".
    print('Sub #'+str(i // 8),' Vid #'+str(i % 8))
    rmse = np.sqrt(mse)
    print("Model RMSE:", rmse)
    mse_list.append(mse)

1/1 - 0s - loss: 1.6619 - 128ms/epoch - 128ms/step
Sub #0.0  Vid #0
Model RMSE: 1.2891324111013773
1/1 - 0s - loss: 0.3211 - 134ms/epoch - 134ms/step
Sub #0.0  Vid #1
Model RMSE: 0.5666706388932672
1/1 - 0s - loss: 2.0603 - 124ms/epoch - 124ms/step
Sub #0.0  Vid #2
Model RMSE: 1.4353819460422965
1/1 - 0s - loss: 1.3378 - 134ms/epoch - 134ms/step
Sub #0.0  Vid #3
Model RMSE: 1.1566158952939047
1/1 - 0s - loss: 1.7211 - 132ms/epoch - 132ms/step
Sub #0.0  Vid #4
Model RMSE: 1.3119148584919704
1/1 - 0s - loss: 0.9915 - 126ms/epoch - 126ms/step
Sub #0.0  Vid #5
Model RMSE: 0.9957597958109019
1/1 - 0s - loss: 0.4816 - 136ms/epoch - 136ms/step
Sub #0.0  Vid #6
Model RMSE: 0.6939895130513942
1/1 - 0s - loss: 1.2735 - 134ms/epoch - 134ms/step
Sub #0.0  Vid #7
Model RMSE: 1.1285162123232615
1/1 - 0s - loss: 0.1944 - 130ms/epoch - 130ms/step
Sub #1.0  Vid #0
Model RMSE: 0.44092868187817275
1/1 - 0s - loss: 0.0767 - 133ms/epoch - 133ms/step
Sub #1.0  Vid #1
Model RMSE: 0.2769468314850631
1/1 - 0s 

In [35]:
# Overall validation RMSE: square root of the mean of the per-recording MSEs
# (every recording contributes the same number of validation windows, 300).
np.sqrt(np.mean(mse_list)) # RMSE for validation set

1.318983333146173

In [152]:
##########################################
#### PREDICT TEST VALENCE AND AROUSAL ####
##########################################
preds_dict = []

# One time value per predicted window: 10000, 10050, ..., 40000 (601 values).
time = np.arange(10000, 40050, 50)

for i in range(240):
    # Row i of X_test was filled as s*8+v, so split it back into both indices.
    sub_no = sub_list_train[i // 8]
    vid_no = vid_list_train[i % 8]

    print('Subject ', sub_no,'; vid ',vid_no)

    # Two outputs per window, in the order used for the training targets.
    preds = model1.predict_on_batch(X_test[i])

    df = pd.DataFrame(
        np.column_stack((time, preds)),
        columns=['time', 'valence', 'arousal'],
    )

    # One CSV per recording, written into the test annotations folder.
    filename = f'data/scenario_1/test/annotations/sub_{sub_no!s}_vid_{vid_no!s}.csv'
    df.to_csv(filename, index=False)


Subject  1 ; vid  1
Subject  1 ; vid  9
Subject  1 ; vid  10
Subject  1 ; vid  11
Subject  1 ; vid  13
Subject  1 ; vid  14
Subject  1 ; vid  18
Subject  1 ; vid  20
Subject  4 ; vid  1
Subject  4 ; vid  9
Subject  4 ; vid  10
Subject  4 ; vid  11
Subject  4 ; vid  13
Subject  4 ; vid  14
Subject  4 ; vid  18
Subject  4 ; vid  20
Subject  6 ; vid  1
Subject  6 ; vid  9
Subject  6 ; vid  10
Subject  6 ; vid  11
Subject  6 ; vid  13
Subject  6 ; vid  14
Subject  6 ; vid  18
Subject  6 ; vid  20
Subject  7 ; vid  1
Subject  7 ; vid  9
Subject  7 ; vid  10
Subject  7 ; vid  11
Subject  7 ; vid  13
Subject  7 ; vid  14
Subject  7 ; vid  18
Subject  7 ; vid  20
Subject  8 ; vid  1
Subject  8 ; vid  9
Subject  8 ; vid  10
Subject  8 ; vid  11
Subject  8 ; vid  13
Subject  8 ; vid  14
Subject  8 ; vid  18
Subject  8 ; vid  20
Subject  9 ; vid  1
Subject  9 ; vid  9
Subject  9 ; vid  10
Subject  9 ; vid  11
Subject  9 ; vid  13
Subject  9 ; vid  14
Subject  9 ; vid  18
Subject  9 ; vid  20
Subj