# control_group_models

This notebook shows the models I decided to use to predict the empathy score of the control group.

In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.model_selection import cross_validate, train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import keras 
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from sklearn.model_selection import KFold, GroupKFold

In [41]:
# Read the control-group feature table from disk
data = pd.read_csv('control_final_features.csv')

# Names of the columns to standardise: every column except the first
# (participant name) and the last (empathy total)
columns = data.columns[1:-1].tolist()

# Standardise each selected column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split made later in the notebook, so test-set statistics leak into the
# scaling -- consider fitting it on the training rows only.
scaler = StandardScaler()
scaler.fit(data[columns])
data[columns] = scaler.transform(data[columns])

In [42]:
# Preview the first rows of the frame to sanity-check the scaled columns
data.head()

Unnamed: 0,Participant name,Pupil diameter left,Pupil diameter right,Gaze point X (MCSnorm),Gaze point Y (MCSnorm),Total Score extended
0,2,-0.818307,-0.771083,-0.680542,-0.576123,3
1,2,-0.920335,-1.064625,0.593515,0.172209,3
2,2,-0.604592,-0.631356,0.572094,0.016587,3
3,2,-0.576383,-0.453767,-0.528207,0.032114,3
4,4,-1.175007,-1.012498,0.880017,0.009025,2


In [43]:
# Split adapted from:
# https://machinelearningmastery.com/training-validation-test-split-and-cross-validation-done-right/

# Positional layout: the first column is skipped, the last column is the
# target, and everything in between is used as a feature.
X, Y = data.iloc[:, 1:-1], data.iloc[:, -1]

# shuffle=False is intentional: row order is preserved, so the leading half of
# the rows becomes the training set and the trailing half the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    shuffle=False,
    test_size=0.50,
)

In [44]:
# Two candidate regressors: a plain linear fit and a polynomial fit
linreg = LinearRegression()

degree = 2
# PolynomialFeatures emits a constant (bias) column by default, so the
# regression stacked on top of it is fitted without its own intercept.
polyreg = make_pipeline(
    PolynomialFeatures(degree=degree),
    LinearRegression(fit_intercept=False),
)

In [45]:
# Cross-validate both candidates on the training split (sklearn's default CV).
# The scorer is the *negated* RMSE, so values closer to zero are better.
# return_estimator=True keeps the model fitted on each fold in the result dict.
# NOTE(review): `polyscores` is not read by any cell visible in this notebook.
scoring = "neg_root_mean_squared_error"
polyscores = cross_validate(
    estimator=polyreg,
    X=X_train,
    y=Y_train,
    scoring=scoring,
    return_estimator=True,
)
linscores = cross_validate(
    estimator=linreg,
    X=X_train,
    y=Y_train,
    scoring=scoring,
    return_estimator=True,
)

In [46]:
# Refit a fresh, unfitted copy of the linear model on the whole training split
# and compare its held-out error with the cross-validation estimate.
linreg = sklearn.base.clone(linreg)
linreg.fit(X_train, Y_train)

# RMSE is computed as sqrt(MSE) rather than with
# `mean_squared_error(..., squared=False)`: the `squared` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on
# every version.
test_rmse = np.sqrt(mean_squared_error(Y_test, linreg.predict(X_test)))
print("Test set RMSE:", test_rmse)

# The CV scorer is the negated RMSE, so flip the sign back before reporting
print("Mean validation RMSE:", -linscores["test_score"].mean())

Test set RMSE: 1.4294543428318127
Mean validation RMSE: 0.6262526293083622


## Logistic Regression

In [47]:
# Logistic regression classifier with sklearn's default settings; it treats
# each distinct target value as a class label.
logreg_model = LogisticRegression()

# 5-fold cross-validation on the full data set. No `scoring` is given, so the
# estimator's own default score is reported for each fold.
# NOTE(review): folds are not grouped by participant (GroupKFold is imported
# but unused) -- confirm that rows of one participant landing in both the
# training and validation folds is acceptable.
scores = cross_val_score(estimator=logreg_model, X=X, y=Y, cv=5)

# Report the per-fold scores together with their mean and spread
for label, value in (
    ("Logistic Regression CV scores: ", scores),
    ("Mean CV score: ", scores.mean()),
    ("Standard deviation of CV scores: ", scores.std()),
):
    print(label, value)

Linear Regression CV scores:  [0.45833333 0.58333333 0.375      0.17391304 0.34782609]
Mean CV score:  0.3876811594202899
Standard deviation of CV scores:  0.13469784618491754


## Support Vector Regression

In [48]:
# Support vector regressor using a radial basis function (RBF) kernel
svr_model = SVR(kernel='rbf')

# 5-fold cross-validation on the full data set. No `scoring` is given, so the
# estimator's own default score is reported for each fold.
# NOTE(review): folds are not grouped by participant (GroupKFold is imported
# but unused) -- confirm that rows of one participant landing in both the
# training and validation folds is acceptable.
scores = cross_val_score(estimator=svr_model, X=X, y=Y, cv=5)

# Report the per-fold scores together with their mean and spread
for label, value in (
    ("Support Vector Regression CV scores: ", scores),
    ("Mean CV score: ", scores.mean()),
    ("Standard deviation of CV scores: ", scores.std()),
):
    print(label, value)

Support Vector Regression CV scores:  [-0.28255721 -5.77269188 -0.29291182 -5.25200644 -0.45120256]
Mean CV score:  -2.410273982025548
Standard deviation of CV scores:  2.538883986284045


## Neural Network

In [39]:
# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling -- and therefore the metrics printed below -- are
# reproducible from one run to the next.
keras.utils.set_random_seed(42)

# Small feed-forward regressor: one hidden ReLU layer of 64 units followed by
# a single linear output unit. The input width is declared with an explicit
# Input layer because passing `input_dim` to Dense is deprecated in Keras 3.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='linear')
])

# compile the model: mean squared error loss, Adam optimiser
model.compile(loss='mean_squared_error', optimizer='adam')

# fit the model on the training split
model.fit(X_train, Y_train, epochs=10, batch_size=32)

# predict on the held-out test split
Y_pred = model.predict(X_test)

# Print MSE, MAE, and R-squared score on the test split
print("MSE: ", mean_squared_error(Y_test, Y_pred))
print("MAE: ", mean_absolute_error(Y_test, Y_pred))
print("R-squared: ", r2_score(Y_test, Y_pred))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
MSE:  0.6052273646846217
MAE:  0.6584987087901366
R-squared:  -1.033587313192247
