This notebook generates the training, test, and validation datasets for all models (PCA, KMeans, Autoencoder) used during analysis.

# All Data

In [1]:
# Label for this dataset variant; it becomes the filename prefix of the saved CSVs.
task_name = "alt_all_data"

In [2]:
import os
import numpy as np
import pandas as pd

import warnings
from tqdm import tqdm_notebook as tqdm

In [3]:
# Root directory of the input CSVs and the task sub-directories to scan.
path = './data/alt_openface_old_protocol'
tasks = ['Task1','Task2','Task3']

# Collect every regular file under each task directory, stored relative to
# `path` as "<task>/<filename>".
files = []
for task in tasks:
    subpath = os.path.join(path,task)
    # os.listdir returns entries in arbitrary order; sorting makes the file
    # order (and anything derived from it) identical across machines and runs.
    files.extend(
        os.path.join(task, name)
        for name in sorted(os.listdir(subpath))
        if os.path.isfile(os.path.join(subpath, name))
    )

print('Num Files:',len(files))

Num Files: 223


In [4]:
from sklearn.model_selection import train_test_split

# Columns kept from every input CSV: the frame counter plus the x_0..x_67 and
# y_0..y_67 columns (presumably the 68 facial landmark coordinates -- confirm
# against the CSV schema).
indicies = ['frame'] + ['x_{0}'.format(i) for i in range(68)] + ['y_{0}'.format(i) for i in range(68)]

# Fixed seed so the per-file splits and the final shuffle are reproducible
# under "Restart Kernel -> Run All".
SPLIT_SEED = 42

# Per-file subsets are collected in lists and concatenated once at the end;
# growing a DataFrame with .append() inside the loop re-copies all accumulated
# rows on every iteration and is no longer available in recent pandas.
train_parts, test_parts, valid_parts = [], [], []

for f in tqdm(files):

    # Load CSV
    print(f)
    # NOTE(review): error_bad_lines / warn_bad_lines are deprecated in newer
    # pandas in favour of on_bad_lines='warn'. They are kept unchanged here
    # because the installed pandas version is not visible from this notebook.
    df = pd.read_csv(
            os.path.join(path,f),
            error_bad_lines=False,
            warn_bad_lines=True)
    # Strip spaces from the header names so they match `indicies`.
    df.columns = df.columns.str.replace(' ', '')

    # Fail immediately, naming the file and the missing columns, instead of
    # printing and then crashing later on an undefined or stale `face_lmk`.
    missing = [c for c in indicies if c not in df.columns]
    if missing:
        print(list(df.columns.values))
        raise KeyError('{0} is missing expected columns: {1}'.format(f, missing))

    # Work on an independent copy so the edits below never act on a view of df.
    face_lmk = df[indicies].copy()

    # Filter NaN rows
    nan_rows = face_lmk.index[face_lmk.isnull().any(axis=1)].tolist()
    if nan_rows:
        for r in nan_rows:
            warnings.warn('Dropping row due to NaN, {0}'.format(r))
        face_lmk = face_lmk.drop(nan_rows)

    # Tag every row with its source file; a scalar is broadcast to all rows.
    face_lmk.insert(0, 'filename', f)

    print('Full:',face_lmk.shape)

    # Create Training, Test, Validation subsets (roughly 67% / 16.5% / 16.5%).
    # NOTE(review): rows of the same file end up in all three subsets --
    # confirm this per-row split is intended rather than a per-file split.
    train, tmp = train_test_split(face_lmk, test_size=0.33, random_state=SPLIT_SEED)
    test, val = train_test_split(tmp, test_size=0.5, random_state=SPLIT_SEED)
    print('Train:',train.shape,'Test:',test.shape,'Validation',val.shape)

    train_parts.append(train)
    test_parts.append(test)
    valid_parts.append(val)

if not train_parts:
    raise ValueError('No input files were processed; check `path` and `tasks`.')

# Final Dataset (shuffled)
Train = pd.concat(train_parts).sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
Test = pd.concat(test_parts).sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
Valid = pd.concat(valid_parts).sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

print('\n\nFINAL')
print('Train:',Train.shape,'Test:',Test.shape,'Validation',Valid.shape)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/223 [00:00<?, ?it/s]

Task1/01_0_1.csv
Full: (4396, 138)
Train: (2945, 138) Test: (725, 138) Validation (726, 138)
Task1/02_0_1.csv
Full: (4404, 138)
Train: (2950, 138) Test: (727, 138) Validation (727, 138)
Task1/03_1_1.csv
Full: (5314, 138)
Train: (3560, 138) Test: (877, 138) Validation (877, 138)
Task1/04_1_1.csv
Full: (5391, 138)
Train: (3611, 138) Test: (890, 138) Validation (890, 138)
Task1/05_1_1.csv
Full: (8967, 138)
Train: (6007, 138) Test: (1480, 138) Validation (1480, 138)
Task1/07_0_1.csv
Full: (8283, 138)
Train: (5549, 138) Test: (1367, 138) Validation (1367, 138)
Task1/08_0_1.csv
Full: (7886, 138)
Train: (5283, 138) Test: (1301, 138) Validation (1302, 138)
Task1/09_0_1.csv
Full: (6800, 138)
Train: (4556, 138) Test: (1122, 138) Validation (1122, 138)
Task1/100_0_1.csv
Full: (4289, 138)
Train: (2873, 138) Test: (708, 138) Validation (708, 138)
Task1/101_0_1.csv
Full: (4370, 138)
Train: (2927, 138) Test: (721, 138) Validation (722, 138)
Task1/102_0_1.csv
Full: (4258, 138)
Train: (2852, 138) Test:

Task2/09_0_2.csv
Full: (10915, 138)
Train: (7313, 138) Test: (1801, 138) Validation (1801, 138)
Task2/101_0_2.csv
Full: (5468, 138)
Train: (3663, 138) Test: (902, 138) Validation (903, 138)
Task2/102_0_2.csv
Full: (5478, 138)
Train: (3670, 138) Test: (904, 138) Validation (904, 138)
Task2/103_1_2.csv
Full: (9139, 138)
Train: (6123, 138) Test: (1508, 138) Validation (1508, 138)
Task2/104_1_2.csv
Full: (9102, 138)
Train: (6098, 138) Test: (1502, 138) Validation (1502, 138)
Task2/105_0_2.csv
Full: (17803, 138)
Train: (11928, 138) Test: (2937, 138) Validation (2938, 138)
Task2/106_0_2.csv
Full: (17742, 138)
Train: (11887, 138) Test: (2927, 138) Validation (2928, 138)
Task2/107_1_2.csv
Full: (14499, 138)
Train: (9714, 138) Test: (2392, 138) Validation (2393, 138)
Task2/108_1_2.csv
Full: (14330, 138)
Train: (9601, 138) Test: (2364, 138) Validation (2365, 138)
Task2/10_0_2.csv
Full: (10801, 138)
Train: (7236, 138) Test: (1782, 138) Validation (1783, 138)
Task2/15_0_2.csv
Full: (15587, 138)
Tr

Task3/26_1_3.csv
Full: (8963, 138)
Train: (6005, 138) Test: (1479, 138) Validation (1479, 138)
Task3/27_0_3.csv
Full: (9215, 138)
Train: (6174, 138) Test: (1520, 138) Validation (1521, 138)
Task3/28_0_3.csv
Full: (9178, 138)
Train: (6149, 138) Test: (1514, 138) Validation (1515, 138)
Task3/29_1_3.csv
Full: (8835, 138)
Train: (5919, 138) Test: (1458, 138) Validation (1458, 138)
Task3/30_1_3.csv
Full: (8856, 138)
Train: (5933, 138) Test: (1461, 138) Validation (1462, 138)
Task3/31_1_3.csv
Full: (8930, 138)
Train: (5983, 138) Test: (1473, 138) Validation (1474, 138)
Task3/32_1_3.csv
Full: (8824, 138)
Train: (5912, 138) Test: (1456, 138) Validation (1456, 138)
Task3/35_1_3.csv
Full: (9009, 138)
Train: (6036, 138) Test: (1486, 138) Validation (1487, 138)
Task3/36_1_3.csv
Full: (5637, 138)
Train: (3776, 138) Test: (930, 138) Validation (931, 138)
Task3/37_0_3.csv
Full: (8967, 138)
Train: (6007, 138) Test: (1480, 138) Validation (1480, 138)
Task3/38_0_3.csv
Full: (8930, 138)
Train: (5983, 138

In [5]:
# Preview the first five rows of the shuffled training set.
Train.head(n=5)

Unnamed: 0,filename,frame,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,...,y_58,y_59,y_60,y_61,y_62,y_63,y_64,y_65,y_66,y_67
0,Task3/60_1_3.csv,1373,404.555,401.43,400.485,402.306,407.21,416.87,429.42,444.36,...,628.345,620.622,607.475,611.181,613.607,614.268,615.297,620.542,620.016,617.4
1,Task3/76_1_3.csv,7602,237.776,237.752,240.315,244.228,252.111,267.379,287.687,311.492,...,685.459,676.209,660.563,657.141,659.99,660.159,666.139,670.698,670.479,667.159
2,Task2/44_1_2.csv,4029,775.189,771.471,772.501,776.772,777.276,778.777,780.727,786.34,...,473.38,468.392,460.129,456.621,458.218,458.067,463.795,460.822,460.339,457.962
3,Task1/36_1_1.csv,3715,784.563,783.634,787.208,794.842,802.974,813.964,824.78,841.008,...,515.601,509.953,499.575,501.388,502.587,502.292,503.342,503.797,503.625,502.357
4,Task3/39_0_3.csv,589,672.413,686.212,703.018,724.545,759.962,798.916,837.138,883.66,...,816.567,820.823,814.941,799.056,793.615,785.255,772.065,790.053,799.035,804.719


In [6]:
# Preview the first five rows of the shuffled test set.
Test.head(n=5)

Unnamed: 0,filename,frame,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,...,y_58,y_59,y_60,y_61,y_62,y_63,y_64,y_65,y_66,y_67
0,Task2/87_1_2.csv,3392,430.175,430.126,433.957,440.151,444.71,450.061,456.041,463.564,...,439.267,436.095,429.5,426.723,427.142,426.072,426.632,428.179,429.062,428.156
1,Task2/56_0_2.csv,8562,1156.89,1160.15,1168.36,1182.68,1200.43,1223.25,1250.3,1282.17,...,676.593,688.996,699.576,669.012,664.38,660.769,686.944,667.031,670.631,675.103
2,Task2/24_1_2.csv,2568,620.756,620.302,624.151,633.391,646.784,663.19,682.528,704.926,...,490.404,477.492,453.228,454.369,457.153,455.203,456.22,473.905,475.533,471.68
3,Task1/15_0_1.csv,6170,777.888,772.38,771.702,776.387,780.935,786.789,793.432,803.042,...,580.831,578.565,572.691,568.731,568.789,567.455,568.925,569.478,570.065,569.232
4,Task1/52_1_1.csv,2779,811.952,812.464,816.311,825.059,834.357,850.497,866.28,886.488,...,543.764,537.051,526.041,528.072,531.184,531.406,528.463,531.695,531.729,528.364


In [7]:
# Preview the first five rows of the shuffled validation set.
Valid.head(n=5)

Unnamed: 0,filename,frame,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,...,y_58,y_59,y_60,y_61,y_62,y_63,y_64,y_65,y_66,y_67
0,Task3/11_0_3.csv,3836,418.719,420.463,423.483,427.405,432.382,438.96,446.259,453.954,...,403.213,402.832,399.3,396.289,395.733,394.144,392.556,395.651,397.218,397.697
1,Task3/37_0_3.csv,7588,833.998,833.436,837.18,841.604,846.929,855.454,863.699,875.771,...,867.055,858.906,846.423,853.145,856.016,856.688,860.449,858.865,858.267,854.593
2,Task3/51_1_3.csv,6962,779.146,778.372,782.651,791.977,802.551,815.555,832.259,854.179,...,1021.16,1019.11,1006.04,1008.24,1008.8,1006.91,1001.71,1008.47,1009.38,1007.77
3,Task3/60_1_3.csv,3970,352.368,353.072,355.795,360.58,368.474,380.35,394.062,408.004,...,607.093,602.012,593.396,596.238,597.58,596.653,592.685,597.908,598.625,597.047
4,Task2/97_0_2.csv,1411,734.979,738.396,746.021,756.921,776.61,803.812,832.798,865.327,...,815.28,813.997,805.553,800.371,800.034,797.212,798.662,796.653,800.105,800.838


In [8]:
# Output locations: ./data/tmp_analysis/<task_name>_{train,test,valid}.csv
save_path = './data/tmp_analysis/{0}'.format(task_name)
save_train_path = save_path + '_train.csv'
save_test_path = save_path + '_test.csv'
save_valid_path = save_path + '_valid.csv'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# NOTE(review): the default index=True writes the row index as an unnamed first
# column. Left unchanged because downstream readers may rely on it -- confirm.
Train.to_csv(save_train_path)
Test.to_csv(save_test_path)
Valid.to_csv(save_valid_path)