In [1]:
import pandas as pd
import numpy as np
import math
import os
import matplotlib.pyplot as plt
import glob
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from preprocess_data import filter_col, get_table
from sklearn import metrics
import pickle

In [3]:
def check_csv(file):
    """Tell whether the CSV at `file` has enough usable `px_1` values.

    Returns True when more than one row has a `px_1` value different
    from -1, otherwise False.
    """
    px_1 = pd.read_csv(file)['px_1']
    usable_rows = (px_1 != -1).sum()
    return bool(usable_rows > 1)
    
# Every CSV under CSV_DATA, narrowed to the ones check_csv accepts.
files = glob.glob(os.path.join('CSV_DATA', '*.csv'))
valid_file = [path for path in files if check_csv(path)]

In [2]:
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the
# row order of the extracted dataset (and so the seeded train/test split)
# reproducible across machines.
# NOTE(review): this rebinds valid_file to every CSV in CSV_DATA, so the
# check_csv filtering done in the earlier cell is not applied — confirm
# that is intended.
valid_file = sorted(glob.glob(os.path.join('CSV_DATA', '*.csv')))

In [3]:
def extract_participants(files):
    """Turn file paths into recording ids.

    The id is the base file name truncated at its first '.', e.g.
    'CSV_DATA/10_5.csv' -> '10_5'. Order follows `files`.
    """
    ids = []
    for path in files:
        base_name = os.path.basename(path)
        ids.append(base_name.split('.')[0])
    return ids

# Recording ids: base file name up to the first '.', one per entry of valid_file.
valid_index = extract_participants(valid_file)

In [4]:
# Number of recording ids collected.
print(len(valid_index))

180


In [5]:
def create_data(index, drowsy_ratio, frame_to_keep=100,
         train_percentage=17/22, step=10):
    """Build sliding-window samples and binary labels for one recording.

    The table for `index` is loaded with `get_table`, passed through
    `filter_col`, and its first column is dropped. Each sample is
    `frame_to_keep` consecutive rows flattened row by row into one vector
    of length frame_to_keep * n_columns.

    Parameters
    ----------
    index : str
        Recording id; the text after the first '_' is parsed as the
        integer mood level (e.g. '10_5' -> 5).
    drowsy_ratio : str
        'half' labels mood 10 as 1 and everything else as 0; any other
        value labels mood 0 as 0 and everything else as 1.
    frame_to_keep : int
        Number of rows per window.
    train_percentage : float
        Unused; kept so existing callers keep working.
    step : int
        Distance in rows between the ends of consecutive windows.

    Returns
    -------
    X : list of 1-D numpy arrays, one per window (empty if the table has
        no more than `frame_to_keep` rows).
    y : 1-D int numpy array with len(X) entries, all equal to the label.
    """
    df = get_table(index)
    df = filter_col(df)
    # Non-mutating drop: the frame comes from a helper, so editing it in
    # place could alter an object owned elsewhere.
    df = df.drop(df.columns[0], axis=1)
    print(f'{index}: {df.isnull().sum().sum()}')

    # Convert once and slice the array instead of slicing the frame and
    # converting per window. reshape(-1) flattens whatever number of
    # feature columns the table has rather than assuming exactly 8.
    values = df.to_numpy()
    # Window ends run frame_to_keep, frame_to_keep + step, ... and stop
    # before len(values), so a window ending exactly on the last row is
    # not produced (same as before).
    X = [values[end - frame_to_keep:end].reshape(-1)
         for end in range(frame_to_keep, len(values), step)]

    mood = int(index.split('_')[1])
    if drowsy_ratio == 'half':
        label = int(mood == 10)
    else:
        label = int(mood != 0)
    # Built directly as 1-D: the former (n, 1) + squeeze() collapsed to a
    # 0-d array when there was exactly one window, which np.concatenate
    # cannot join.
    y = np.full(len(X), label, dtype=int)

    return X, y

In [3]:
# Rows per window: extract_full forwards this to create_data as frame_to_keep,
# and time//10 is embedded in the cached CSV and saved model file names.
# NOTE(review): this name would shadow the stdlib `time` module if it is ever imported.
time = 50
# Labelling mode read by create_data: 'half' marks only mood 10 as class 1;
# any other value marks every non-zero mood as class 1.
drowsy_ratio = 'half'

In [6]:
def extract_full(time, drowsy_ratio, val=None):
    """Stack the windows and labels of several recordings into two arrays.

    Parameters
    ----------
    time : int
        Rows per window, forwarded to create_data as frame_to_keep.
    drowsy_ratio : str
        Labelling mode, forwarded to create_data.
    val : iterable of str, optional
        Recording ids to process. Defaults to the module-level
        `valid_index`, looked up when the function is called so that
        re-running an earlier cell is picked up.

    Returns
    -------
    X : 2-D numpy array, one row per window.
    y : 1-D numpy array with one label per row of X.

    Raises
    ------
    ValueError
        From np.concatenate when no recording produced any window.
    """
    if val is None:
        val = valid_index

    feature_parts = []
    label_parts = []
    for index in val:
        X_, y_ = create_data(index, drowsy_ratio, time)
        # A recording shorter than one window yields no samples; an empty
        # list would become a 1-D array and break the 2-D concatenate.
        if len(X_) == 0:
            continue
        feature_parts.append(np.asarray(X_))
        # Guard against a 0-d label array (single-window recording).
        label_parts.append(np.atleast_1d(y_))

    X = np.concatenate(feature_parts, axis=0)
    y = np.concatenate(label_parts, axis=0)

    # These are features and labels of the whole dataset; the previous
    # "Train data"/"Test data" captions were misleading.
    print(f'Features: {X.shape}')
    print(f'Labels: {y.shape}')

    return X, y

# Build the feature matrix and label vector for the configured window length
# and labelling mode (create_data prints one null-count line per recording).
X, y = extract_full(time, drowsy_ratio)

10_0: 0
10_10: 0
10_5: 0
11_0: 0
11_10: 0
11_5: 0
12_0: 0
12_10: 0
12_5: 0
13_0: 0
13_10: 0
13_5: 0
14_0: 0
14_10: 0
14_5: 0
15_0: 0
15_10: 0
15_5: 0
16_0: 0
16_10: 0
16_5: 0
17_0: 0
17_10: 0
17_5: 0
18_0: 0
18_10: 0
18_5: 0
19_0: 0
19_10: 0
19_5: 0
1_0: 0
1_10: 0
1_5: 0
20_0: 0
20_10: 0
20_5: 0
21_0: 0
21_10: 0
21_5: 0
22_0: 0
22_10: 0
22_5: 0
23_0: 0
23_10: 0
23_5: 0
24_0: 0
24_10: 0
24_5: 0
25_0: 0
25_10: 0
25_5: 0
26_0: 0
26_10: 0
26_5: 0
27_0: 0
27_10: 0
27_5: 0
28_0: 0
28_10: 0
28_5: 0
29_0: 0
29_10: 0
29_5: 0
2_0: 0
2_10: 0
2_5: 0
30_0: 0
30_10: 0
30_5: 0
31_0: 0
31_10: 0
31_5: 0
32_0: 0
32_10: 0
32_5: 0
33_0: 0
33_10: 0
33_5: 0
34_0: 0
34_10: 0
34_5: 0
35_0: 0
35_10: 0
35_5: 0
36_0: 0
36_10: 0
36_5: 0
37_0: 0
37_10: 0
37_5: 0
38_0: 0
38_10: 0
38_5: 0
39_0: 0
39_10: 0
39_5: 0
3_0: 0
3_10: 0
3_5: 0
40_0: 0
40_10: 0
40_5: 0
41_0: 0
41_10: 0
41_5: 0
42_0: 0
42_10: 0
42_5: 0
43_0: 0
43_10: 0
43_5: 0
44_0: 0
44_10: 0
44_5: 0
45_0: 0
45_10: 0
45_5: 0
46_0: 0
46_10: 0
46_5: 0
47_0: 0
4

In [7]:
# Cache features and labels as headerless, index-less CSVs named after
# time//10 and the labelling mode.
pd.DataFrame(X).to_csv(f'fulldata{time//10}{drowsy_ratio}.csv', header=False, index=False)
pd.DataFrame(y).to_csv(f'fulllabel{time//10}{drowsy_ratio}.csv', header=False, index=False)

In [4]:
# Reload the cached features and labels; header=None because the files
# contain data rows only.
X = pd.read_csv(f'fulldata{time//10}{drowsy_ratio}.csv', header=None)
y = pd.read_csv(f'fulllabel{time//10}{drowsy_ratio}.csv', header=None)

In [5]:
# Switch to plain arrays; the labels are flattened to one dimension.
X = np.array(X)
y = np.array(y).ravel()
print(X.shape, y.shape)

(99656, 400) (99656,)


In [6]:
# Class balance: how many samples carry each distinct label value.
labels, counts = np.unique(y, return_counts=True)
for label, count in zip(labels, counts):
    print(f'{label}: {count}')

0: 66074
1: 33582


In [7]:
# Shuffled row-level split with a fixed seed; 17/22 of the rows go to training.
# NOTE(review): the rows are overlapping sliding windows (create_data advances
# 10 rows per window while each window spans `time` rows), so shuffling rows
# places near-duplicate windows of the same recording in both train and test.
# The accuracies below are therefore likely optimistic; splitting by recording
# would avoid this but needs the recording id kept alongside X/y.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=17/22,
                                                       shuffle=True, random_state=5)
print(X_train.shape, X_test.shape)

(77006, 400) (22650, 400)


In [8]:
def train(model, X_train, y_train, X_test, y_test):
    """Fit `model`, report its test accuracy and confusion matrix, return it.

    The model is fitted in place on (X_train, y_train) and evaluated on
    (X_test, y_test); accuracy is printed as a percentage followed by the
    confusion matrix of true vs. predicted labels.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print('Accuracy:', f'{accuracy*100}%')

    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

    return model

In [12]:
# Show the window length and labelling mode currently in effect.
time, drowsy_ratio

(50, 'half')

In [14]:
from sklearn.neighbors import KNeighborsClassifier
# Test accuracy for each k in 1..29, in that order (position 0 holds k=1).
# NOTE(review): k is chosen by its score on X_test, the same data later used
# to report accuracy, so that figure is biased upward. A validation split or
# cross-validation on the training data would avoid this.
acc_list = []

for i in range(1,30):
    neigh = KNeighborsClassifier(n_neighbors=i)
    neigh.fit(X_train, y_train) 
    y_pred = neigh.predict(X_test)
    acc_list.append(accuracy_score(y_test, y_pred))

# +1 converts the list position of the best score back to its k value.
acc_list.index(max(acc_list))+1

28

In [17]:
# Highest test accuracy among the k values tried in the k-search cell.
max(acc_list)

0.6765121412803532

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors=28 is the value reported by the k-search cell.
knn = KNeighborsClassifier(n_neighbors = 28, weights = 'uniform')
model = train(knn, X_train, y_train, X_test, y_test)

# Persist the fitted model; the context manager closes the file handle,
# which the bare open() call used before never did.
filename = f'knn_{time//10}s{drowsy_ratio}.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

Accuracy: 67.65121412803532%
[[12819  2055]
 [ 5272  2504]]


In [14]:
from sklearn.svm import SVC

# SVC with its default hyperparameters, fitted and evaluated by train().
svm = SVC()
model = train(svm, X_train, y_train, X_test, y_test)

# Persist the fitted model; the context manager closes the file handle,
# which the bare open() call used before never did.
filename = f'svm_{time//10}s{drowsy_ratio}.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

Accuracy: 74.21633554083886%
[[14333   541]
 [ 5299  2477]]
