In [1]:
# Load original dataset
import pandas as pd

file_path = r'../../data/lm3.csv'
data_df = pd.read_csv(file_path)
print('Num of features: %s' % len(data_df.columns.values))
data_df.head()

Num of features: 79


Unnamed: 0,Label,Outer left eyebrow-x,Outer left eyebrow-y,Outer left eyebrow-z,Middle left eyebrow-x,Middle left eyebrow-y,Middle left eyebrow-z,Inner left eyebrow-x,Inner left eyebrow-y,Inner left eyebrow-z,...,Lower lip outer middle-z,Chin middle-x,Chin middle-y,Chin middle-z,Left ear lobe-x,Left ear lobe-y,Left ear lobe-z,Right ear lobe-x,Right ear lobe-y,Right ear lobe-z
0,ANGER,-72.961,-1.725,22.958,-55.678,4.591,38.791,-31.92,-1.929,36.645,...,39.421,-9.84,-112.234,31.313,,,,,,
1,DISGUST,-76.565,-0.458,6.126,-62.086,9.454,24.055,-35.614,2.066,25.073,...,24.378,-13.583,-109.568,19.583,,,,,,
2,FEAR,-76.163,6.39,10.784,-57.083,16.865,30.162,-33.708,14.082,32.408,...,42.1,-13.02,-110.107,33.429,,,,,,
3,HAPPY,-72.14,8.896,9.353,-54.721,22.38,27.474,-29.789,15.802,28.937,...,38.018,-9.03,-97.687,36.058,,,,,,
4,SADNESS,-73.743,3.099,12.438,-58.607,15.144,29.381,-32.979,11.594,30.048,...,41.452,-10.731,-100.452,35.956,,,,,,


In [7]:
from random import uniform

def apply_rand(x):
    '''
    Applies some noise to a datacell, this is probably best kept <0.1
    '''
    if type(x) is str:
        return x
    return x + uniform(-0.05, 0.05)

def augment_data(dataframe, n=2):
    '''
    Mulitplies the size of the passed dataframe by n
    '''
    if n < 1: 
        raise ValueError('n must be >= 1')
    modded = dataframe.copy() 
    for i in range(1,n):
        frames = [dataframe, modded.applymap(apply_rand)]
        dataframe = pd.concat(frames, axis = 0)
    return dataframe

In [8]:
def get_train_test_split(df, column_names):
    '''
    Split data into training and test sets
    '''
    # Shuffle data frame
    df = df.sample(frac=1)
    # Select same num per class, remaining go to test set
    rows, _ = df.shape
    num_of_inputs = int(rows * 0.8 / 6) # this was formerly the magic number 63.
    cols = ['Label'] +  column_names
    train_df, test_df = (pd.DataFrame(columns=cols), pd.DataFrame(columns=cols))
    for x in get_all_emotions() :
        train_df = train_df.append(df.loc[df['Label'] == x][0:num_of_inputs], ignore_index=True)
        test_df = test_df.append(df.loc[df['Label'] == x][num_of_inputs:], ignore_index=True)

    # Shuffle data frames
    train_df = train_df.sample(frac=1)
    test_df = test_df.sample(frac=1)
    
    return (train_df, test_df)

In [9]:
from preprocess import preprocess_data, reduce_features

column_names, principal_labels_df = reduce_features(preprocess_data(data_df))
# rows, _ = principal_labels_df.shape
# print('before %d' % rows)
# principal_labels_df = augment_data(principal_labels_df, 8)
# rows, _ = principal_labels_df.shape
# print('after: %d' % rows)

In [23]:
# Prepare data for classification
from constants import get_all_emotions

from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
    
verbose = False
runs = 5
mlp_scores = []
for index in range(runs):
    train_df, test_df = get_train_test_split(principal_labels_df, column_names)

    # augment train_df, increase size
    train_df = augment_data(train_df, 4)
    
    # Split train and test labels/data
    train_data   = train_df.iloc[:,1:].values
    train_labels = train_df.iloc[:,:1].values.ravel()

    test_data   = test_df.iloc[:,1:].values
    test_labels = test_df.iloc[:,:1].values.ravel()
    
    # Just print once
    if index == 0 and verbose:
        # Take a look at the labels distribution
        print('--------------------Training--------------------')
        rows, cols = train_df.shape
        print(train_df.groupby('Label').count())
        print('Total number of inputs: %s' % rows)

        print('--------------------Testing--------------------')
        rows, cols = test_df.shape
        print(test_df.groupby('Label').count())
        print('Total number of inputs: %s' % rows)
        
    # MLP classifier to predict
    mlp = MLPClassifier(solver='adam', activation='relu', \
                        hidden_layer_sizes=(10,10,10), max_iter=2000)
    mlp.fit(train_data, train_labels)
    predicted_labels = mlp.predict(test_data)
    acc_mlp = accuracy_score(test_labels, predicted_labels)
    mlp_scores.append(acc_mlp)

print('---Data---')
print('train: %d\ntest:  %d' % (len(train_data), len(test_data)))
    
print('--------------------Result--------------------')
print('deep MLP classifier accuracy: %.02f%%' % (sum(mlp_scores) / runs))

---Data---
train: 1440
test:  93
--------------------Result--------------------
deep MLP classifier accuracy: 0.51%


## Comments

- 100 runs with this data took quite some time, you get about the same result with 10.
- By augmenting the data we can create tons more samples for supervised training. It also opens up the possibility for some severe overfitting. Don't make the dataset too large similarly don't make the noise we're applying too large either. 
- It appears the larger the dataset the better the accuracy, that is enought to be suspect about this method. Augmenting data like this should definitely not be seen as a silver bullet, especially doing it like this where we already have some tagged features. For example, if we were doing some classification on cats and we were given a dataset of [tabby cat](https://en.wikipedia.org/wiki/Tabby_cat) landmarks, data augmentation wouldn't help at all for making a generalized model for all cats, just tabbey's. Supplying another cat breed to the dataset would not do much help. Back to our case, we only have maybe a dozen(?) distinct faces, can we create every face in existence from these faces? Probably definitely not. This method is flawed and this notebook is a demonstration in poor data-practices at best.