# Import dependencies and determine working directory

In [None]:
# Import libraries
import numpy as np
from os import path
import cv2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

In [2]:
# Resolve the directory the notebook is running from.
# '__file__' is deliberately a string literal: abspath() joins it onto the
# current working directory, and taking the head of that path yields the
# working directory itself as an absolute path.
# NOTE(review): the name `dir` shadows the Python builtin of the same name.
dir = path.split(path.abspath('__file__'))[0]

# Load processed dataframe

In [3]:
# Read the processed-image table from CSV (path is relative to the working
# directory) and store the emotion labels as a pandas categorical column.
processed_image_csv_path = path.join('..', '04_processed_data', 'processed_images.csv')
df = pd.read_csv(processed_image_csv_path, index_col=False)
df = df.astype({'emotion': 'category'})

# Split train and test data

In [4]:
# One representative row per sequence, taken from rows whose image_number is
# above 4 (sequences with no such row are left out entirely).
# NOTE(review): the threshold 4 is a magic number; presumably it skips the
# earliest frames of each sequence -- TODO confirm and name it.
has_late_frame = df['image_number'] > 4
unique_sequences = df[has_late_frame].drop_duplicates(subset=['sequence'])

In [5]:
# Split at the sequence level rather than the image level: 10% of the
# sequence ids go to the test side, with a fixed seed for reproducibility.
# The emotion labels are passed along but their split halves are discarded,
# and no `stratify` argument is given, so class balance is not enforced.
sequence_train, sequence_test, _, _ = train_test_split(
    unique_sequences['sequence'],
    unique_sequences['emotion'],
    test_size=0.1,
    random_state=42,
)

In [6]:
# Assign every row of the full dataframe to train or test according to which
# side its sequence id landed on; rows whose sequence is in neither set are
# dropped from both frames.
in_train_sequences = df['sequence'].isin(sequence_train)
in_test_sequences = df['sequence'].isin(sequence_test)
df_train = df[in_train_sequences]
df_test = df[in_test_sequences]

In [7]:
# Pull out the model inputs (image paths) and targets (emotion labels)
# for each side of the split.
X_train, y_train = df_train['processed_image_path'], df_train['emotion']
X_test, y_test = df_test['processed_image_path'], df_test['emotion']

# Report the size and per-emotion class counts of each set.
print('Number of images in training set:', len(X_train))
print(y_train.value_counts())
print('Number of images in test set:', len(X_test))
print(y_test.value_counts())

Number of images in training set: 5297
0    1176
5     949
7     923
1     739
3     568
4     413
6     378
2     151
Name: emotion, dtype: int64
Number of images in test set: 579
0    132
5    106
1    103
7     74
3     64
6     57
4     33
2     10
Name: emotion, dtype: int64


# Import and flatten image data 

In [8]:
def flattened_image_data(image_paths):
    """Load images as grayscale and flatten each into a 1-D pixel vector.

    Parameters
    ----------
    image_paths : iterable of str
        Paths of the image files to read with ``cv2.imread``.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array holding one flattened grayscale image per path, in
        input order. The shape of the array is printed before returning.
        An empty input yields an empty array of shape ``(0,)``.
        Assumes every image has the same dimensions so the rows stack into
        a 2-D array -- TODO confirm for the dataset in use.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot read one of the paths.
    """
    images_data = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing or undecodable file by returning
            # None instead of raising; fail here with the offending path
            # rather than with an opaque assertion inside cv2.cvtColor.
            raise OSError('Could not read image: {!r}'.format(image_path))
        images_data.append(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY).flatten())
    images_data = np.array(images_data).astype(np.uint8)
    print(images_data.shape)
    return images_data

In [9]:
# Build the flattened pixel matrices, training set first and then test set
# (each call prints the shape of the array it produced).
images_train, images_test = map(flattened_image_data, (X_train, X_test))

(5297, 16384)
(579, 16384)


# Save data files

In [10]:
# Persist the train/test pixel arrays and emotion label series as pickles
# next to the other processed data.
# NOTE(review): this notebook imports joblib from sklearn.externals, which
# newer scikit-learn releases no longer provide -- confirm the pinned version.
processed_data_dir = path.join('..', '04_processed_data')
filename_out__images_train = path.join(processed_data_dir, 'images_train.pkl')
filename_out__images_test = path.join(processed_data_dir, 'images_test.pkl')
filename_out__emotions_train = path.join(processed_data_dir, 'emotions_train.pkl')
filename_out__emotions_test = path.join(processed_data_dir, 'emotions_test.pkl')

joblib.dump(images_train, filename_out__images_train)
joblib.dump(images_test, filename_out__images_test)
joblib.dump(y_train, filename_out__emotions_train)
joblib.dump(y_test, filename_out__emotions_test)

['../04_processed_data/emotions_test.pkl']