In [1]:
import pandas as pd
import numpy as np
import os
import glob
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from preprocess_data import filter_col, get_table
from sklearn.neighbors import KNeighborsClassifier
import pickle

In [32]:
# Spot check on one recording: count the rows whose px_1 value differs from -1.
df = pd.read_csv(os.path.join('output', 'csv', '43_5.csv'))
print(df['px_1'].ne(-1).sum())

1


In [34]:
def check_csv(file):
    """Tell whether a CSV holds more than one row whose ``px_1`` differs from -1.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file; it must contain a ``px_1`` column.

    Returns
    -------
    bool
        ``True`` when at least two rows satisfy ``px_1 != -1``, else ``False``.
    """
    table = pd.read_csv(file)
    usable_rows = (table['px_1'] != -1).sum()
    # bool() keeps the return a plain Python bool rather than a numpy bool.
    return bool(usable_rows > 1)
    
# Collect every exported CSV, then keep only the paths that check_csv accepts.
files = glob.glob(os.path.join('output', 'csv', '*.csv'))
valid_file = [path for path in files if check_csv(path)]

In [35]:
def extract_participants(files):
    """Return, for each path, its file name cut off at the first dot.

    Parameters
    ----------
    files : iterable of str
        File paths, e.g. ``output/csv/03_10.csv``.

    Returns
    -------
    list of str
        One identifier per input path, in the same order (``'03_10'`` for the
        example above).
    """
    stems = []
    for path in files:
        file_name = os.path.basename(path)
        # Everything before the first '.' — same result as split('.')[0].
        stems.append(file_name.partition('.')[0])
    return stems

# Identifiers (file name up to the first '.') of the CSVs kept in valid_file.
valid_index = extract_participants(valid_file)

In [36]:
# Report how many recordings were kept, then display their identifiers.
print(len(valid_index))
valid_index

65


['03_10',
 '03_5',
 '05_10',
 '08_0',
 '08_5',
 '10_10',
 '10_5',
 '17_0',
 '17_5',
 '18_0',
 '18_10',
 '18_5',
 '23_5',
 '26_0',
 '26_10',
 '27_0',
 '27_10',
 '28_10',
 '28_5',
 '31_0',
 '31_10',
 '31_5',
 '32_0',
 '32_10_1',
 '32_10_2',
 '32_5',
 '34_0',
 '34_10',
 '34_5',
 '35_0',
 '35_10',
 '35_5',
 '36_0',
 '36_10',
 '36_5',
 '37_0',
 '37_10',
 '37_5',
 '39_0',
 '39_10',
 '39_5',
 '40_0',
 '40_10',
 '40_5',
 '41_0',
 '41_5',
 '44_0',
 '44_10',
 '45_0',
 '45_10',
 '45_5',
 '46_10',
 '46_5',
 '48_10',
 '48_5',
 '49_0',
 '49_10_1',
 '50_10',
 '50_5',
 '51_0',
 '51_10',
 '51_5',
 '60_0',
 '60_10',
 '60_5']

In [37]:
def create_data(index, frame_to_keep=150, start=150, 
         train_percentage=17/22, base_path=None):
    """Build sliding-window train/test arrays for a single recording.

    The table returned by ``get_table(index)`` is passed through
    ``filter_col`` and its first column is dropped. Each sample is the
    ``frame_to_keep`` rows that immediately precede row ``end`` (for every
    ``end`` in ``start .. len(df) - 1``), flattened into one vector. With the
    defaults and four feature columns this is 150 consecutive frames (15 s)
    of 4 features, i.e. 600 values per sample.

    Every sample of the recording gets the same binary label, derived from the
    second ``_``-separated field of ``index``: 0 when that field is 0,
    otherwise 1.

    Parameters
    ----------
    index : str
        Recording identifier such as ``'03_10'`` or ``'32_10_1'``.
    frame_to_keep : int
        Number of consecutive rows flattened into one sample.
    start : int
        First window end position; must be at least ``frame_to_keep`` so that
        every window is complete.
    train_percentage : float
        Fraction of the samples (taken in row order) used for training.
    base_path : optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        Feature matrices are always 2-D, shaped
        ``(n_samples, frame_to_keep * n_features)``; label vectors are always
        1-D, even when a split is empty or holds a single sample.

    Raises
    ------
    ValueError
        If ``start < frame_to_keep`` (the first windows would be incomplete).
    """
    if start < frame_to_keep:
        raise ValueError(
            f'start ({start}) must be >= frame_to_keep ({frame_to_keep})')

    df = get_table(index)
    df = filter_col(df)
    # Drop the first column without mutating the frame filter_col returned.
    df = df.drop(columns=df.columns[0])
    print(f'{index}: {df.isnull().sum().sum()}')

    # Convert once; slicing the array is far cheaper than slicing the frame
    # for every window.
    values = df.to_numpy()
    n_samples = max(len(values) - start, 0)
    n_features = values.shape[1]

    # NOTE(review): consecutive windows overlap in all but one row, so samples
    # on either side of the train/test boundary share frames.
    X = np.empty((n_samples, frame_to_keep * n_features), dtype=values.dtype)
    for row, end in enumerate(range(start, len(values))):
        # reshape(-1) instead of a hard-coded feature count of 4.
        X[row] = values[end - frame_to_keep:end].reshape(-1)

    mood = int(index.split('_')[1])
    label = 0 if mood == 0 else 1
    # Built directly as 1-D: squeezing an (n, 1) array gives a 0-d array when
    # n == 1, which cannot be sliced below.
    y = np.full(n_samples, label, dtype=int)

    train_index = int(n_samples * train_percentage)
    X_train = X[:train_index]
    X_test = X[train_index:]

    y_train = y[:train_index]
    y_test = y[train_index:]
    return X_train, y_train, X_test, y_test

In [5]:
def train(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its test-set metrics.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for evaluation.

    Prints the accuracy as a percentage followed by the confusion matrix.
    Returns nothing; ``model`` is fitted in place.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print('Accuracy:', f'{accuracy*100}%')

    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

In [39]:
def pipeline(model, val=valid_index):
    """Pool the per-recording splits for ``val``, fit ``model`` and report metrics.

    Parameters
    ----------
    model : estimator handed to ``train``.
    val : iterable of recording identifiers; each one is passed to
        ``create_data``. Defaults to the module-level ``valid_index``.

    Returns
    -------
    (model, X_test, y_test) : the fitted model and the pooled test split.
    """
    # One (X_train, y_train, X_test, y_test) tuple per recording, in order.
    per_recording = [create_data(recording) for recording in val]

    X_train = np.concatenate([parts[0] for parts in per_recording], axis=0)
    y_train = np.concatenate([parts[1] for parts in per_recording], axis=0)
    X_test = np.concatenate([parts[2] for parts in per_recording], axis=0)
    y_test = np.concatenate([parts[3] for parts in per_recording], axis=0)

    print(f'Train data: {X_train.shape}')
    print(f'Test data: {X_test.shape}')

    train(model, X_train, y_train, X_test, y_test)
    return model, X_test, y_test

In [10]:
def pipeline(model, index=valid_index):
    """Pool the per-recording splits for ``index``, fit ``model`` and report metrics.

    This definition comes after another ``pipeline`` in the notebook, so it is
    the one in effect when the cells run top to bottom. The previous body
    referenced an undefined ``participant`` name, passed 0/5/10 positionally as
    ``frame_to_keep`` to ``create_data`` and ignored its ``index`` argument; it
    now follows the current ``create_data(recording_id)`` contract.

    Parameters
    ----------
    model : estimator handed to ``train``.
    index : iterable of recording identifiers (e.g. ``'03_10'``); each one is
        passed to ``create_data``. Defaults to the module-level ``valid_index``.

    Returns
    -------
    (model, X_test, y_test) : the fitted model and the pooled test split.
    """
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    # Iterate the argument (not the global) under a name that does not shadow it.
    for recording in index:
        X_train1, y_train1, X_test1, y_test1 = create_data(recording)

        X_train.append(X_train1)
        y_train.append(y_train1)
        X_test.append(X_test1)
        y_test.append(y_test1)

    X_train = np.concatenate(X_train, axis = 0)
    y_train = np.concatenate(y_train, axis = 0)
    X_test = np.concatenate(X_test, axis = 0)
    y_test = np.concatenate(y_test, axis = 0)

    print(f'Train data: {X_train.shape}')
    print(f'Test data: {X_test.shape}')

    train(model, X_train, y_train, X_test, y_test)
    return model, X_test, y_test

In [40]:
from sklearn.svm import SVC
# Support vector classifier with scikit-learn's default parameters.
svm = SVC()

# Build the pooled train/test split, fit the SVM and print its metrics;
# keep the fitted model and the test split for later cells.
model, X_test, y_test = pipeline(svm)

03_10: 0
03_5: 0
05_10: 0
08_0: 0
08_5: 0
10_10: 0
10_5: 0
17_0: 0
17_5: 0
18_0: 0
18_10: 0
18_5: 0
23_5: 0
26_0: 0
26_10: 0
27_0: 0
27_10: 0
28_10: 0
28_5: 0
31_0: 0
31_10: 0
31_5: 0
32_0: 0
32_10_1: 0
32_10_2: 0
32_5: 0
34_0: 0
34_10: 0
34_5: 0
35_0: 0
35_10: 0
35_5: 0
36_0: 0
36_10: 0
36_5: 0
37_0: 0
37_10: 0
37_5: 0
39_0: 0
39_10: 0
39_5: 0
40_0: 0
40_10: 0
40_5: 0
41_0: 0
41_5: 0
44_0: 0
44_10: 0
45_0: 0
45_10: 0
45_5: 0
46_10: 0
46_5: 0
48_10: 0
48_5: 0
49_0: 0
49_10_1: 0
50_10: 0
50_5: 0
51_0: 0
51_10: 0
51_5: 0
60_0: 0
60_10: 0
60_5: 0
Train data: (138902, 600)
Test data: (40877, 600)
Accuracy: 67.37529662157203%
[[ 1575 10137]
 [ 3199 25966]]


In [None]:
# Persist the fitted model to disk.
filename = 'final_model.sav'
# The context manager closes the file even if pickling fails, so the handle
# is not leaked and the data is flushed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)