# test_group_models

In this notebook it'll show the models I decided to use to predict the the test group model.

In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.model_selection import cross_validate, train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import keras 
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from sklearn.model_selection import KFold, GroupKFold

In [26]:
# Load features from control  
data = pd.read_csv('test_final_features.csv')

# Scale data for better perfomance
scaler = StandardScaler()

# Extract name of columns in df as a list, remove the participant name and empathy total
columns = list(data.columns[1:-1])

# Scale data by columns 
data[columns] = scaler.fit_transform(data[columns])

In [27]:
data.head()

Unnamed: 0,Participant name,Pupil diameter left,Pupil diameter right,Gaze point X (MCSnorm),Gaze point Y (MCSnorm),Gaze event duration,Total Score extended
0,1,0.047357,0.019464,2.488702,-1.930285,-0.730805,2
1,1,-0.124646,-0.080318,1.362306,-1.191304,-0.75186,2
2,1,0.054948,0.133298,1.098244,-0.462658,-0.782363,2
3,1,-0.036997,0.042475,1.11139,-0.437289,-0.727208,2
4,1,-0.071218,0.040078,1.587277,-0.32309,-0.655639,2


In [36]:
# Code from: https://machinelearningmastery.com/training-validation-test-split-and-cross-validation-done-right/

# Train-test split, intentionally use shuffle=False
X = data.iloc[:,1:-1]
Y = data.iloc[:,-1]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, shuffle=False)

In [37]:
# Create two models: Polynomial and linear regression
degree = 2
polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression(fit_intercept=False))
linreg = LinearRegression()

In [38]:
# Cross-validation
scoring = "neg_root_mean_squared_error"
polyscores = cross_validate(polyreg, X_train, Y_train, scoring=scoring, return_estimator=True)
linscores = cross_validate(linreg, X_train, Y_train, scoring=scoring, return_estimator=True)

In [40]:
# Which one is better? Linear and polynomial
print(linscores["test_score"].mean())
print(polyscores["test_score"].mean())
print(linscores["test_score"].mean() - polyscores["test_score"].mean())

-0.9544616824760507
-1.4839683137116728
0.5295066312356221


In [41]:
# Retrain the model and evaluate
linreg = sklearn.base.clone(linreg)
linreg.fit(X_train, Y_train)
print("Test set RMSE:", mean_squared_error(Y_test, linreg.predict(X_test), squared=False))
print("Mean validation RMSE:", -linscores["test_score"].mean())

Test set RMSE: 0.5714827379583589
Mean validation RMSE: 0.9544616824760507


## Logistic Regression

In [44]:
# create a linear regression model
logreg_model = LogisticRegression()

# perform cross-validation with 5 folds
scores = cross_val_score(logreg_model, X, Y, cv=5)

# print the mean and standard deviation of the scores
print("Logistic Regression CV scores: ", scores)
print("Mean CV score: ", scores.mean())
print("Standard deviation of CV scores: ", scores.std())

Linear Regression CV scores:  [0.08333333 0.58333333 0.4375     0.33333333 0.41666667]
Mean CV score:  0.37083333333333335
Standard deviation of CV scores:  0.16478099944404326


## Support Vector Regression

In [45]:
# create a support vector regression model with a radial basis function kernel
svr_model = SVR(kernel='rbf')

# perform cross-validation with 5 folds
scores = cross_val_score(svr_model, X, Y, cv=5)

# print the mean and standard deviation of the scores
print("Support Vector Regression CV scores: ", scores)
print("Mean CV score: ", scores.mean())
print("Standard deviation of CV scores: ", scores.std())

Support Vector Regression CV scores:  [-0.4554346  -1.1001104  -2.33294648 -1.58259894 -0.88732524]
Mean CV score:  -1.2716831296313538
Standard deviation of CV scores:  0.6428652095225007


## Neural Network

In [47]:
# create a sequential model
model = keras.Sequential([
    keras.layers.Dense(64, input_dim=X_train.shape[1], activation='relu'),
    keras.layers.Dense(1, activation='linear')
])

# compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# fit the model on training data
model.fit(X_train, Y_train, epochs=5, batch_size=32)

# predict on test data
Y_pred = model.predict(X_test)

# Print MSE, MAE, and R-squared score
print("MSE: ", mean_squared_error(Y_test, Y_pred))
print("MAE: ", mean_absolute_error(Y_test, Y_pred))
print("R-squared: ", r2_score(Y_test, Y_pred))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
MSE:  1.9699799094223973
MAE:  1.2708776437211782
R-squared:  -6.879919637689589
