This notebook generates the training data for all models (PCA, KMeans, Autoencoder) used during analysis

# All Data

In [6]:
task_name = 'all_data'

In [7]:
model_path = './models/{0}/'.format(task_name)

In [8]:
import os
import numpy as np
import pandas as pd

import warnings
from tqdm import tqdm_notebook as tqdm

In [9]:
path = './data/openface'
tasks = ['task1_sandwich_openface','task2_bart_openface','task3_jenga_openface']

files = []
for task in tasks:
    subpath = os.path.join(path,task)
    files += [os.path.join(task,f) for f in os.listdir(subpath) if os.path.isfile(os.path.join(subpath,f))]

print('Num Files:',len(files))

Num Files: 93


In [10]:
from sklearn.model_selection import train_test_split

indicies = ['frame'] + ['x_{0}'.format(i) for i in range(0,68)] + ['y_{0}'.format(i) for i in range(0,68)]

flag = True
Train = None
Test = None
Valid = None

for i in tqdm(range(0,len(files))):
    
    f = files[i]
    
    # Load CSV
    print(f)
    df = pd.read_csv(
            os.path.join(path,f), 
            error_bad_lines=False, 
            warn_bad_lines=True)
    
    face_lmk = df[indicies]
    
    # Filter NaN rows
    nans = df[df[indicies].isnull().any(axis=1)]
    if not nans.empty:
        dropList = nans.index.tolist()
        for r in dropList:
            warnings.warn('Dropping row due to NaN, {0}'.format(r))
        face_lmk.drop(dropList, inplace=True)
        
    face_lmk.insert(0,'filename',[f for i in range(0,face_lmk.shape[0])],True)
        
    print('Full:',face_lmk.shape)
    
    # Create Training, Test, Validation subsets
    train, tmp = train_test_split(face_lmk, test_size=0.33)
    test, val = train_test_split(tmp, test_size=0.5)
    print('Train:',train.shape,'Test:',test.shape,'Validation',val.shape)
    
    if flag:
        flag = False
        Train = train
        Test = test
        Valid = val
    else:
        Train = Train.append(train)
        Test = Test.append(test)
        Valid = Valid.append(val)

# Final Dataset (shuffled)
Train = Train.sample(frac=1).reset_index(drop=True)
Test = Test.sample(frac=1).reset_index(drop=True)
Valid = Valid.sample(frac=1).reset_index(drop=True)

print('\n\nFINAL')
print('Train:',Train.shape,'Test:',Test.shape,'Validation',Valid.shape)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/93 [00:00<?, ?it/s]

task1_sandwich_openface/0101_sandwich_cut.csv
Full: (5732, 138)
Train: (3840, 138) Test: (946, 138) Validation (946, 138)
task1_sandwich_openface/0102_sandwich_cut.csv
Full: (5692, 138)
Train: (3813, 138) Test: (939, 138) Validation (940, 138)
task1_sandwich_openface/0201_sandwich_cut.csv
Full: (9953, 138)
Train: (6668, 138) Test: (1642, 138) Validation (1643, 138)
task1_sandwich_openface/0301_sandwich_cut.csv
Full: (8261, 138)
Train: (5534, 138) Test: (1363, 138) Validation (1364, 138)
task1_sandwich_openface/0302_sandwich_cut.csv
Full: (8333, 138)
Train: (5583, 138) Test: (1375, 138) Validation (1375, 138)
task1_sandwich_openface/0402_sandwich_cut.csv
Full: (4974, 138)
Train: (3332, 138) Test: (821, 138) Validation (821, 138)
task1_sandwich_openface/0501_sandwich_cut.csv
Full: (9426, 138)
Train: (6315, 138) Test: (1555, 138) Validation (1556, 138)
task1_sandwich_openface/0502_sandwich_cut.csv
Full: (9457, 138)
Train: (6336, 138) Test: (1560, 138) Validation (1561, 138)
task1_sandwich

b'Skipping line 4053: expected 714 fields, saw 1233\nSkipping line 4064: expected 714 fields, saw 722\n'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Full: (6570, 138)
Train: (4401, 138) Test: (1084, 138) Validation (1085, 138)
task1_sandwich_openface/1401_sandwich_cut.csv
Full: (7476, 138)
Train: (5008, 138) Test: (1234, 138) Validation (1234, 138)
task1_sandwich_openface/1402_sandwich_cut.csv
Full: (7518, 138)
Train: (5037, 138) Test: (1240, 138) Validation (1241, 138)
task1_sandwich_openface/1501_sandwich_cut.csv
Full: (7224, 138)
Train: (4840, 138) Test: (1192, 138) Validation (1192, 138)
task1_sandwich_openface/1502_sandwich_cut.csv
Full: (7219, 138)
Train: (4836, 138) Test: (1191, 138) Validation (1192, 138)
task1_sandwich_openface/1701_sandwich_Cut.csv
Full: (6185, 138)
Train: (4143, 138) Test: (1021, 138) Validation (1021, 138)
task1_sandwich_openface/1702_sandwich_Cut.csv
Full: (6674, 138)
Train: (4471, 138) Test: (1101, 138) Validation (1102, 138)
task1_sandwich_openface/Sona0101_sandwich_cut.csv
Full: (4700, 138)
Train: (3149, 138) Test: (775, 138) Validation (776, 138)
task1_sandwich_openface/Sona0102_sandwich_cut.csv
Fu

task3_jenga_openface/Sona0101_jenga_cut.csv
Full: (7591, 138)
Train: (5085, 138) Test: (1253, 138) Validation (1253, 138)
task3_jenga_openface/Sona0102_jenga_cut.csv
Full: (7790, 138)
Train: (5219, 138) Test: (1285, 138) Validation (1286, 138)
task3_jenga_openface/Sona0201_jenga_cut.csv
Full: (8814, 138)
Train: (5905, 138) Test: (1454, 138) Validation (1455, 138)
task3_jenga_openface/Sona0202_jenga_cut.csv
Full: (8981, 138)
Train: (6017, 138) Test: (1482, 138) Validation (1482, 138)
task3_jenga_openface/Sona0302_jenga_cut.csv
Full: (9019, 138)
Train: (6042, 138) Test: (1488, 138) Validation (1489, 138)
task3_jenga_openface/Sona_jenga_0301.csv
Full: (9043, 138)
Train: (6058, 138) Test: (1492, 138) Validation (1493, 138)


FINAL
Train: (712824, 138) Test: (175560, 138) Validation (175605, 138)


In [11]:
Train.head()

Unnamed: 0,filename,frame,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,...,y_58,y_59,y_60,y_61,y_62,y_63,y_64,y_65,y_66,y_67
0,task2_bart_openface/1002_bart_cut.csv,3424.0,700.9,692.0,689.1,693.5,698.0,702.8,708.0,717.1,...,644.2,639.2,629.8,631.1,632.4,632.8,637.7,634.2,632.5,630.4
1,task3_jenga_openface/Sona_jenga_0301.csv,7141.0,798.2,795.4,798.1,805.1,819.6,844.2,872.6,906.8,...,851.9,844.6,826.4,825.2,827.5,826.4,829.1,825.5,826.5,824.2
2,task1_sandwich_openface/Sona0201_sandwich_cut.csv,2281.0,472.1,476.8,487.0,501.8,517.5,536.3,554.5,576.6,...,584.1,577.4,561.1,563.2,563.9,561.4,558.5,561.8,564.6,563.4
3,task3_jenga_openface/Sona0302_jenga_cut.csv,4928.0,445.6,432.7,425.4,423.9,428.0,444.0,457.4,475.4,...,953.1,932.7,908.4,925.2,936.9,943.8,949.8,944.5,938.0,926.1
4,task2_bart_openface/1702_Bart_Cut.csv,17328.0,713.7,716.9,727.3,747.3,774.4,808.5,843.8,880.6,...,620.8,616.3,593.5,576.6,571.8,559.2,539.6,582.7,594.5,598.7


In [12]:
Test.head()

Unnamed: 0,filename,frame,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,...,y_58,y_59,y_60,y_61,y_62,y_63,y_64,y_65,y_66,y_67
0,task2_bart_openface/1001_bart_cut.csv,18433.0,814.5,812.7,817.0,828.2,840.8,855.9,872.8,891.4,...,607.2,601.0,582.2,574.5,574.1,571.0,571.2,590.7,594.1,594.0
1,task1_sandwich_openface/0502_sandwich_cut.csv,6370.0,392.8,397.9,408.9,428.6,453.2,482.0,508.5,536.3,...,825.8,814.3,788.4,794.4,796.3,792.7,778.8,795.7,798.2,795.2
2,task2_bart_openface/Sona0102_bart_cut.csv,11546.0,981.1,980.9,984.5,991.8,1001.7,1018.0,1035.9,1055.4,...,824.6,812.3,795.4,806.2,810.9,810.4,803.6,813.8,814.1,809.1
3,task1_sandwich_openface/0301_sandwich_cut.csv,1252.0,741.3,743.4,753.1,764.0,764.3,753.8,735.9,718.4,...,998.0,989.4,975.6,979.1,983.3,984.0,988.6,987.5,986.5,982.1
4,task2_bart_openface/0601_bart_cut.csv,5148.0,220.2,217.0,226.3,245.8,266.0,289.6,314.0,346.5,...,987.0,976.3,953.8,949.7,952.0,951.8,955.2,955.1,955.2,952.2


In [13]:
Valid.head()

Unnamed: 0,filename,frame,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,...,y_58,y_59,y_60,y_61,y_62,y_63,y_64,y_65,y_66,y_67
0,task2_bart_openface/0701_bart_cut.csv,4421.0,757.0,760.9,769.5,784.9,804.4,834.8,880.4,938.3,...,856.7,853.5,830.1,823.3,821.1,816.0,806.4,822.1,827.2,828.6
1,task1_sandwich_openface/0101_sandwich_cut.csv,5052.0,656.3,653.2,655.9,667.3,679.9,688.4,693.6,701.3,...,427.7,425.5,417.9,416.0,415.5,414.3,407.9,416.2,416.5,416.2
2,task2_bart_openface/0702_bart_cut.csv,12203.0,827.9,827.7,834.0,849.6,864.7,885.6,906.0,928.5,...,634.3,609.4,576.8,592.7,600.1,603.3,595.9,625.8,622.9,614.3
3,task2_bart_openface/0102_bart_cut.csv,10214.0,787.3,793.0,800.9,810.4,811.9,817.3,819.8,827.6,...,879.4,884.4,880.3,862.2,857.0,850.4,844.9,855.7,861.9,866.8
4,task2_bart_openface/1702_Bart_Cut.csv,14858.0,670.1,675.6,689.4,709.2,732.6,764.1,798.5,836.8,...,666.6,664.3,652.6,641.1,639.2,632.9,623.6,633.3,638.1,639.3


In [14]:
save_path = './data/tmp_analysis/{0}'.format(task_name)
save_train_path = save_path + '_train.csv'
save_test_path = save_path + '_test.csv'
save_valid_path = save_path + '_valid.csv'


Train.to_csv(save_train_path)
Test.to_csv(save_test_path)
Valid.to_csv(save_valid_path)

# Task 1 Only

In [None]:
task_name = 'task1_sandwich_openface'

In [None]:
model_path = './models/{0}/'.format(task_name)

In [None]:
import os
import numpy as np
import pandas as pd

import warnings
from tqdm import tqdm_notebook as tqdm

path = './data/openface/{0}'.format(task_name)

files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path,f))]
print('Num Files:',len(files))

In [None]:
from sklearn.model_selection import train_test_split

indicies = ['frame'] + ['x_{0}'.format(i) for i in range(0,68)] + ['y_{0}'.format(i) for i in range(0,68)]

flag = True
Train = None
Test = None
Valid = None

for i in tqdm(range(0,len(files))):
    
    f = files[i]
    
    # Load CSV
    print(f)
    df = pd.read_csv(
            os.path.join(path,f), 
            error_bad_lines=False, 
            warn_bad_lines=True)
    
    face_lmk = df[indicies]
    
    # Filter NaN rows
    nans = df[df[indicies].isnull().any(axis=1)]
    if not nans.empty:
        dropList = nans.index.tolist()
        for r in dropList:
            warnings.warn('Dropping row due to NaN, {0}'.format(r))
        face_lmk.drop(dropList, inplace=True)
        
    face_lmk.insert(0,'filename',[f for i in range(0,face_lmk.shape[0])],True)
        
    print('Full:',face_lmk.shape)
    
    # Create Training, Test, Validation subsets
    train, tmp = train_test_split(face_lmk, test_size=0.33)
    test, val = train_test_split(tmp, test_size=0.5)
    print('Train:',train.shape,'Test:',test.shape,'Validation',val.shape)
    
    if flag:
        flag = False
        Train = train
        Test = test
        Valid = val
    else:
        Train = Train.append(train)
        Test = Test.append(test)
        Valid = Valid.append(val)

# Final Dataset (shuffled)
Train = Train.sample(frac=1).reset_index(drop=True)
Test = Test.sample(frac=1).reset_index(drop=True)
Valid = Valid.sample(frac=1).reset_index(drop=True)

print('\n\nFINAL')
print('Train:',Train.shape,'Test:',Test.shape,'Validation',Valid.shape)

In [None]:
Train.head()

In [None]:
Test.head()

In [None]:
Valid.head()

In [None]:
save_path = './data/tmp_analysis/{0}'.format(task_name)
save_train_path = save_path + '_train.csv'
save_test_path = save_path + '_test.csv'
save_valid_path = save_path + '_valid.csv'


Train.to_csv(save_train_path)
Test.to_csv(save_test_path)
Valid.to_csv(save_valid_path)

# Task 2 Only

In [None]:
task_name = 'task2_bart_openface'

In [None]:
model_path = './models/{0}/'.format(task_name)

In [None]:
import os
import numpy as np
import pandas as pd

import warnings
from tqdm import tqdm_notebook as tqdm

path = './data/openface/{0}'.format(task_name)

files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path,f))]
print('Num Files:',len(files))

In [None]:
from sklearn.model_selection import train_test_split

indicies = ['frame'] + ['x_{0}'.format(i) for i in range(0,68)] + ['y_{0}'.format(i) for i in range(0,68)]

flag = True
Train = None
Test = None
Valid = None

for i in tqdm(range(0,len(files))):
    
    f = files[i]
    
    # Load CSV
    print(f)
    df = pd.read_csv(
            os.path.join(path,f), 
            error_bad_lines=False, 
            warn_bad_lines=True)
    
    face_lmk = df[indicies]
    
    # Filter NaN rows
    nans = df[df[indicies].isnull().any(axis=1)]
    if not nans.empty:
        dropList = nans.index.tolist()
        for r in dropList:
            warnings.warn('Dropping row due to NaN, {0}'.format(r))
        face_lmk.drop(dropList, inplace=True)
        
    face_lmk.insert(0,'filename',[f for i in range(0,face_lmk.shape[0])],True)
        
    print('Full:',face_lmk.shape)
    
    # Create Training, Test, Validation subsets
    train, tmp = train_test_split(face_lmk, test_size=0.33)
    test, val = train_test_split(tmp, test_size=0.5)
    print('Train:',train.shape,'Test:',test.shape,'Validation',val.shape)
    
    if flag:
        flag = False
        Train = train
        Test = test
        Valid = val
    else:
        Train = Train.append(train)
        Test = Test.append(test)
        Valid = Valid.append(val)

# Final Dataset (shuffled)
Train = Train.sample(frac=1).reset_index(drop=True)
Test = Test.sample(frac=1).reset_index(drop=True)
Valid = Valid.sample(frac=1).reset_index(drop=True)

print('\n\nFINAL')
print('Train:',Train.shape,'Test:',Test.shape,'Validation',Valid.shape)

In [None]:
Train.head()

In [None]:
Test.head()

In [None]:
Valid.head()

In [None]:
save_path = './data/tmp_analysis/{0}'.format(task_name)
save_train_path = save_path + '_train.csv'
save_test_path = save_path + '_test.csv'
save_valid_path = save_path + '_valid.csv'


Train.to_csv(save_train_path)
Test.to_csv(save_test_path)
Valid.to_csv(save_valid_path)

# Task 3 Only

In [None]:
task_name = 'task3_jenga_openface'

In [None]:
model_path = './models/{0}/'.format(task_name)

In [None]:
import os
import numpy as np
import pandas as pd

import warnings
from tqdm import tqdm_notebook as tqdm

path = './data/openface/{0}'.format(task_name)

files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path,f))]
print('Num Files:',len(files))

In [None]:
from sklearn.model_selection import train_test_split

indicies = ['frame'] + ['x_{0}'.format(i) for i in range(0,68)] + ['y_{0}'.format(i) for i in range(0,68)]

flag = True
Train = None
Test = None
Valid = None

for i in tqdm(range(0,len(files))):
    
    f = files[i]
    
    # Load CSV
    print(f)
    df = pd.read_csv(
            os.path.join(path,f), 
            error_bad_lines=False, 
            warn_bad_lines=True)
    
    face_lmk = df[indicies]
    
    # Filter NaN rows
    nans = df[df[indicies].isnull().any(axis=1)]
    if not nans.empty:
        dropList = nans.index.tolist()
        for r in dropList:
            warnings.warn('Dropping row due to NaN, {0}'.format(r))
        face_lmk.drop(dropList, inplace=True)
        
    face_lmk.insert(0,'filename',[f for i in range(0,face_lmk.shape[0])],True)
        
    print('Full:',face_lmk.shape)
    
    # Create Training, Test, Validation subsets
    train, tmp = train_test_split(face_lmk, test_size=0.33)
    test, val = train_test_split(tmp, test_size=0.5)
    print('Train:',train.shape,'Test:',test.shape,'Validation',val.shape)
    
    if flag:
        flag = False
        Train = train
        Test = test
        Valid = val
    else:
        Train = Train.append(train)
        Test = Test.append(test)
        Valid = Valid.append(val)

# Final Dataset (shuffled)
Train = Train.sample(frac=1).reset_index(drop=True)
Test = Test.sample(frac=1).reset_index(drop=True)
Valid = Valid.sample(frac=1).reset_index(drop=True)

print('\n\nFINAL')
print('Train:',Train.shape,'Test:',Test.shape,'Validation',Valid.shape)

In [None]:
Train.head()

In [None]:
Test.head()

In [None]:
Valid.head()

In [None]:
save_path = './data/tmp_analysis/{0}'.format(task_name)
save_train_path = save_path + '_train.csv'
save_test_path = save_path + '_test.csv'
save_valid_path = save_path + '_valid.csv'


Train.to_csv(save_train_path)
Test.to_csv(save_test_path)
Valid.to_csv(save_valid_path)