# Import dependencies and determine working directory

In [1]:
# Import libraries
import numpy as np
from os import path
import cv2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

In [2]:
# Working directory of the notebook. The name is quoted, so abspath() resolves
# the literal string '__file__' against the current working directory; the head
# of the split path is therefore the working directory itself.
# NOTE(review): `dir` shadows the built-in of the same name; the name is kept
# in case other cells reference it.
dir = path.split(path.abspath('__file__'))[0]

# Load processed dataframe

In [3]:
# Read the processed-image table and store the emotion label as a categorical column
processed_image_csv_path = path.join('..', '04_processed_data', 'processed_images.csv')
df = pd.read_csv(processed_image_csv_path, index_col=False).astype({'emotion': 'category'})

In [4]:
# One row per sequence: keep the first row encountered for every sequence id
unique_sequences = df.drop_duplicates(subset='sequence', keep='first')

In [5]:
# define function to create a dictionary of images for sequences
def create_max_images_in_sequence_dict(df, unique_sequences):
    """Map each sequence id to the highest image number recorded for it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sequence`` column and an ``image_number`` column.
    unique_sequences : pandas.DataFrame
        Frame whose ``sequence`` column lists the ids to include. Keys are
        inserted into the result in the order they appear there.

    Returns
    -------
    dict
        ``{sequence: largest image_number among that sequence's rows in df}``.

    Raises
    ------
    ValueError
        If an id listed in ``unique_sequences`` has no rows in ``df``.
    """
    # One grouped pass over df instead of re-filtering the whole frame for
    # every sequence, which scaled as (rows x sequences).
    max_by_sequence = df.groupby('sequence')['image_number'].max().to_dict()

    max_images_dict = {}
    for sequence in unique_sequences.sequence:
        if sequence not in max_by_sequence:
            # Keep the exception type that max() raised on an empty selection.
            raise ValueError('no rows in df for sequence {!r}'.format(sequence))
        max_images_dict[sequence] = max_by_sequence[sequence]
    return max_images_dict

# define function to match images in sequence to image
def label_max_images(df, unique_sequences):
    """Attach a ``max_images`` column giving each row its sequence's top image number.

    The column is written onto ``df`` itself (the frame is modified in place)
    and that same frame is returned. Values come from mapping the ``sequence``
    column through the dictionary built by
    ``create_max_images_in_sequence_dict(df, unique_sequences)``.
    """
    df['max_images'] = df['sequence'].map(
        create_max_images_in_sequence_dict(df, unique_sequences)
    )
    return df

In [6]:
# Add the per-sequence maximum image number to every row as `max_images`
df = label_max_images(df=df, unique_sequences=unique_sequences)

In [7]:
# Neutral frames: rows whose image_number is at most 2
# (the first two images of a sequence, assuming numbering starts at 1 -- TODO confirm)
neutral_filter = df['image_number'].le(2)

# Expressive frames: rows whose image_number is within one of the sequence's
# maximum, i.e. the last two images of the sequence
non_neutral_filter = df['image_number'].ge(df['max_images'] - 1)

In [8]:
# Keep only rows flagged as neutral or expressive
mini_df = df.loc[neutral_filter | non_neutral_filter]

# Split train and test data

In [9]:
# Hold out 10% of the sequences (not individual images) so that every image of
# a sequence lands on the same side of the split. The emotion halves returned
# by the split are discarded.
sequence_train, sequence_test, _, _ = train_test_split(
    unique_sequences['sequence'],
    unique_sequences['emotion'],
    test_size=0.1,
    random_state=42,
)

In [10]:
# Route each selected image row to train or test according to its sequence
mini_df_train = mini_df.loc[mini_df['sequence'].isin(sequence_train)]
mini_df_test = mini_df.loc[mini_df['sequence'].isin(sequence_test)]

In [12]:
# Pull out the image paths (X), emotion labels (y) and test-set sequence ids
mini_X_train = mini_df_train['processed_image_path']
mini_y_train = mini_df_train['emotion']
mini_X_test = mini_df_test['processed_image_path']
mini_y_test = mini_df_test['emotion']
mini_sequence_test = mini_df_test['sequence']

# Report the size and label distribution of each split
print('Number of images in training set:', len(mini_X_train))
print(mini_y_train.value_counts())
print('Number of images in test set:', len(mini_X_test))
print(mini_y_test.value_counts())

Number of images in training set: 1176
0    588
7    154
5    124
3    106
1     80
6     48
4     44
2     32
Name: emotion, dtype: int64
Number of images in test set: 132
0    66
5    14
7    12
3    12
1    10
6     8
4     6
2     4
Name: emotion, dtype: int64


# Import and flatten image data 

In [13]:
def flattened_image_data(image_paths):
    """Load images, convert them to grayscale and flatten each to a 1-D pixel row.

    Parameters
    ----------
    image_paths : iterable of str
        Paths passed to ``cv2.imread`` one at a time, in order.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array holding one flattened grayscale image per input path,
        in input order. An empty input yields an empty array of shape ``(0,)``.
        The shape of the result is also printed.

    Raises
    ------
    OSError
        If ``cv2.imread`` returns ``None`` for a path, which is how OpenCV
        reports a missing or unreadable image file.
    """
    images_data = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # imread does not raise on failure; without this check the error
            # surfaces inside cvtColor and never names the offending file.
            raise OSError('cv2.imread could not read image: {!r}'.format(image_path))
        images_data.append(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY).flatten())
    images_data = np.array(images_data).astype(np.uint8)
    print(images_data.shape)
    return images_data

In [14]:
# Load pixel data for both splits; each call prints the shape of its result
mini_images_train = flattened_image_data(image_paths=mini_X_train)
mini_images_test = flattened_image_data(image_paths=mini_X_test)

(1176, 16384)
(132, 16384)


# Save data files

In [17]:
# Persist the pixel arrays, emotion labels and test-set sequence ids with joblib.
# NOTE(review): `joblib` is imported from `sklearn.externals`, which was removed
# in scikit-learn 0.23; on newer installs the standalone `joblib` package is needed.
processed_data_dir = path.join('..', '04_processed_data')

# Output locations, one pickle per artefact
filename_out__mini_images_train = path.join(processed_data_dir, 'mini_images_train.pkl')
filename_out__mini_images_test = path.join(processed_data_dir, 'mini_images_test.pkl')
filename_out__mini_emotions_train = path.join(processed_data_dir, 'mini_emotions_train.pkl')
filename_out__mini_emotions_test = path.join(processed_data_dir, 'mini_emotions_test.pkl')
filename_out__mini_sequence_test = path.join(processed_data_dir, 'mini_sequence_test.pkl')

# Pixel data
joblib.dump(mini_images_train, filename_out__mini_images_train)
joblib.dump(mini_images_test, filename_out__mini_images_test)

# Labels
joblib.dump(mini_y_train, filename_out__mini_emotions_train)
joblib.dump(mini_y_test, filename_out__mini_emotions_test)

# Sequence ids of the test rows; left as the final bare expression so the cell
# still shows the list of written file names
joblib.dump(mini_sequence_test, filename_out__mini_sequence_test)

['../04_processed_data/mini_sequence_test.pkl']