In [4]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [5]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
%cd /content/drive/MyDrive/ML-Stage2

/content/drive/MyDrive/ML-Stage2


In [7]:
data_folder = 'original_output'
label_file = 'KSS.txt'
dataframes = {}

labels = pd.read_csv(label_file, header=None, delimiter=' ', dtype=int)


# Set the frame rate and interval size
fps = 30
interval_size = 1

# Compute the number of frames per interval
frames_per_interval = fps * interval_size

# Read the label file
labels = pd.read_csv(label_file, header=None, delimiter=' ', dtype=int)

# Loop over the data files
for filename in os.listdir(data_folder):
    if filename.endswith('.csv'):
        # Extract the row and column indices from the filename
        row, col = map(int, filename[:-4].split('-'))
        label = labels.iloc[row - 1, col - 1]

        # Read the data file
        file_path = os.path.join(data_folder, filename)
        df = pd.read_csv(file_path)

        # Add a new column with the label
        df['label'] = label

        # Truncate the dataframe to remove any extra rows
        n_rows = (df.shape[0] // frames_per_interval) * frames_per_interval
        df = df.iloc[:n_rows]

        # Reshape the dataframe into a 3D array
        data = df.values.reshape(-1, frames_per_interval, df.shape[1])

        # Compute the mean along the second axis
        data = np.mean(data, axis=1)

        # Create a new dataframe from the resampled data
        columns = ['frame', 'leftEye', 'rightEye', 'lip_distance', 'label']
        df = pd.DataFrame(data, columns=columns)

        print(df)

        # Store the dataframe in the dictionary
        dataframes[filename] = df

      frame   leftEye  rightEye  lip_distance  label
0      14.5  0.380603  0.362096      1.260948    6.0
1      44.5  0.378687  0.355564      2.288343    6.0
2      74.5  0.359324  0.347770      2.374536    6.0
3     104.5  0.381799  0.376432      2.170770    6.0
4     134.5  0.385096  0.350641      1.407869    6.0
..      ...       ...       ...           ...    ...
309  9355.5  0.347620  0.346602      0.841421    6.0
310  9385.5  0.304315  0.289676      0.827614    6.0
311  9415.5  0.313063  0.304822      0.874755    6.0
312  9445.5  0.312918  0.297892      0.880474    6.0
313  9475.5  0.352305  0.326767      0.533333    6.0

[314 rows x 5 columns]
           frame   leftEye  rightEye  lip_distance  label
0      14.500000  0.370665  0.361489      0.594281    7.0
1      44.500000  0.377559  0.339604      1.418107    7.0
2      74.500000  0.292781  0.289808      0.680474    7.0
3     104.500000  0.266832  0.275194      1.108088    7.0
4     134.500000  0.272250  0.279805      1.141421

In [8]:
# Convert into binary

threshold = 6

for df in dataframes.values():
    df['label'] = (df['label'] >= threshold).astype(int)
    print(df)


      frame   leftEye  rightEye  lip_distance  label
0      14.5  0.380603  0.362096      1.260948      1
1      44.5  0.378687  0.355564      2.288343      1
2      74.5  0.359324  0.347770      2.374536      1
3     104.5  0.381799  0.376432      2.170770      1
4     134.5  0.385096  0.350641      1.407869      1
..      ...       ...       ...           ...    ...
309  9355.5  0.347620  0.346602      0.841421      1
310  9385.5  0.304315  0.289676      0.827614      1
311  9415.5  0.313063  0.304822      0.874755      1
312  9445.5  0.312918  0.297892      0.880474      1
313  9475.5  0.352305  0.326767      0.533333      1

[314 rows x 5 columns]
           frame   leftEye  rightEye  lip_distance  label
0      14.500000  0.370665  0.361489      0.594281      1
1      44.500000  0.377559  0.339604      1.418107      1
2      74.500000  0.292781  0.289808      0.680474      1
3     104.500000  0.266832  0.275194      1.108088      1
4     134.500000  0.272250  0.279805      1.141421

In [9]:
# Set the number of dataframes to use for training and validation
n_train_val = 27

# Split the dataframes into training+validation and testing sets
train_dfs = list(dataframes.values())[:n_train_val]
test_dfs = list(dataframes.values())[n_train_val:]

# Set the proportion of the training+validation data to use for validation
#val_size = 0.2

# Split the training+validation dataframes into training and validation sets
#train_dfs, val_dfs = train_test_split(train_val_dfs, test_size=val_size)

In [10]:
# Set the number of dataframes to use for training and validation
#n_train = 27
#n_validation = 2

# Split the dataframes into training, validation, and testing sets
#all_dfs = list(dataframes.values())
#train_dfs, remaining_dfs = train_test_split(all_dfs, test_size=n_validation, random_state=42)
#validation_dfs, test_dfs = train_test_split(remaining_dfs, test_size=n_validation, random_state=42)

#print("Number of training dataframes:", len(train_dfs))
#print("Number of validation dataframes:", len(validation_dfs))
#print("Number of testing dataframes:", len(test_dfs))

In [11]:
def extract_features_and_labels(dfs, window_size):
    X = []
    y = []
    for df in dfs:
        for i in range(len(df) - window_size + 1):
            window = df.iloc[i:i+window_size]
            features = window[['leftEye', 'rightEye', 'lip_distance']].values.flatten()
            label = window['label'].iloc[-1]
            X.append(features)
            y.append(label)
    return X, y

# Set the window size
window_size = 50

# Extract the features and labels from the training and testing dataframes
X_train, y_train = extract_features_and_labels(train_dfs, window_size)
X_test, y_test = extract_features_and_labels(test_dfs, window_size)


#X_train, y_train = extract_features_and_labels(train_dfs, window_size)
#X_val, y_val = extract_features_and_labels(val_dfs, window_size)
#X_test, y_test = extract_features_and_labels(test_dfs, window_size)

# Convert the features and labels to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
#X_val = np.array(X_val)
#y_val = np.array(y_val)
X_test = np.array(X_test)
y_test = np.array(y_test)

print(X_train.shape)
print(y_train.shape)
#print(X_val.shape)
#print(y_val.shape)
print(X_test.shape)
print(y_test.shape)


(9707, 150)
(9707,)
(4893, 150)
(4893,)


In [13]:
!pip install scikeras
!pip install keras

Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


In [None]:
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier, KerasRegressor
from sklearn.model_selection import GridSearchCV
from keras.metrics import Precision, Accuracy

# Function to create model, required for KerasClassifier
def create_model(optimizer='adam'):
    # Initialize the model
    model = Sequential()
    # Add layers to the model
    model.add(Dense(64, activation='relu', input_shape=(150,)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(optimizer=optimizer, loss='huber', metrics=['accuracy'])
    return model

# Create the KerasClassifier wrapper
model = KerasClassifier(build_fn=create_model, verbose=0)

# Define the parameter grid
param_grid = {
    'epochs': [5, 10, 100],
    'optimizer': ['adam', 'rmsprop'],
}

# Initialize the grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, return_train_score=True, scoring='accuracy')

# Fit the grid search object on the training data (it will train several models)
grid_search.fit(X_train, y_train)

# Print the best parameters found by grid search
print(f'Best parameters: {grid_search.best_params_}')
print(f'Validation Accuracy: {grid_search.best_score_:.2f}')
print(f'Training Accuracy: {grid_search.cv_results_["mean_train_score"][grid_search.best_index_]:.2f}')
# Use the best estimator to make predictions on the testing data
y_pred = grid_search.predict(X_test)

# Convert continuous outputs to binary labels
y_pred = (y_pred > 0.5).astype(int)

# Compute the accuracy of the predictions
accuracy = (y_pred == y_test).mean()

# Print the result
print(f'Neural Network: {accuracy:.2f}')


  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Initialize the logistic regression model
clf = LogisticRegression()

# Define the parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [50, 100, 200],
    'fit_intercept': [True, False]
}

# Initialize the grid search with cross-validation
grid_search = GridSearchCV(clf, param_grid, cv=5, return_train_score=True, scoring='accuracy')

# Fit the grid search object on the training data (it will train several models)
grid_search.fit(X_train, y_train)

# Print the best parameters found by grid search
print(f'Best parameters: {grid_search.best_params_}')

print(f'Training Accuracy: {grid_search.cv_results_["mean_train_score"][grid_search.best_index_]:.2f}')

# Print the validation accuracy of the best estimator
print(f'Validation Accuracy: {grid_search.best_score_:.2f}')

# Use the best estimator to make predictions on the testing data
y_test_pred = grid_search.predict(X_test)

# Compute the accuracy of the predictions on the testing data
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')





'from sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.model_selection import GridSearchCV\n\n# Initialize the logistic regression model\nclf = LogisticRegression()\n\n# Define the parameter grid\nparam_grid = {\n    \'C\': [0.001, 0.01, 0.1, 1, 10, 100],\n    \'penalty\': [\'l2\'],\n    \'solver\': [\'newton-cg\', \'lbfgs\', \'liblinear\', \'sag\', \'saga\'],\n    \'max_iter\': [50, 100, 200],\n    \'fit_intercept\': [True, False]\n}\n\n# Initialize the grid search with cross-validation\ngrid_search = GridSearchCV(clf, param_grid, cv=5, return_train_score=True, scoring=\'accuracy\')\n\n# Fit the grid search object on the training data (it will train several models)\ngrid_search.fit(X_train, y_train)\n\n# Print the best parameters found by grid search\nprint(f\'Best parameters: {grid_search.best_params_}\')\n\n# Use the best estimator to make predictions on the validation data\n# y_val_pred = grid_search.predict(X_val)\n\n# Comput

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_test_pred)

# Print the confusion matrix
print('Confusion Matrix:')
print(cm)

# Visualize the confusion matrix using seaborn
plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.show()