In [12]:
# Data manipulation
import os
import pandas as pd
import glob
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
#pd.set_option('display.max_columns', None)
#pd.set_option("display.max_rows", None)

# Machine learning
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from hyperopt import space_eval
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

In [18]:
# Print the installed hyperopt package metadata (version, location, dependencies)
pip show hyperopt

Name: hyperopt
Version: 0.2.7
Summary: Distributed Asynchronous Hyperparameter Optimization
Home-page: https://hyperopt.github.io/hyperopt
Author: James Bergstra
Author-email: james.bergstra@gmail.com
License: BSD
Location: c:\users\ena\anaconda3\lib\site-packages
Requires: cloudpickle, future, networkx, numpy, py4j, scipy, six, tqdm
Required-by: 
Note: you may need to restart the kernel to use updated packages.


# Data loading


In [None]:
# Load the eye-tracking export that already carries the empathy score.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs where this exact file location exists.
df2 = pd.read_csv(
    "C:/Users/Ena/Desktop/THESIS/eyetzip_data_with_score.csv",
    low_memory=False,
)
df2

In [None]:
# Inspect the available column labels before choosing the modelling subset
df2.columns

In [None]:
# Split the recordings into the two experiments using the 'Project name' label
is_control_row = df2['Project name'] == 'Control group experiment'
is_test_row = df2['Project name'] == 'Test group experiment'

control_group_data = df2[is_control_row]
test_group_data = df2[is_test_row]

In [None]:
# This cell was adapted from PRIYANK RAVAL's Kaggle notebook: https://www.kaggle.com/code/priyankraval/eyet-empathyscore-ipynb#Step-3-:-#Load-eyetzip_data_with_score.csv-with-empathy-score-for-data-analysis
# Columns kept for modelling. Every label appears exactly once: listing a label
# twice in a column selection yields two identically named columns, which would
# feed the same signal into the models twice and make per-feature output ambiguous.
control_selected_columns = ['Participant name', 'Recording duration',
                             'Pupil diameter left', 'Pupil diameter right',
                             'Eye position left X (DACSmm)', 'Eye position left Y (DACSmm)', 'Eye position left Z (DACSmm)',
                             'Eye position right X (DACSmm)', 'Eye position right Y (DACSmm)', 'Eye position right Z (DACSmm)',
                             'Gaze event duration', 'Fixation point X', 'Fixation point Y', 'Total Score extended', 'Gaze point X', 'Gaze point Y']

# The test group uses the same subset; copy the list so the two names stay independent
test_selected_columns = list(control_selected_columns)

# Create a DataFrame with selected columns for each group
control_group_selected = control_group_data[control_selected_columns]
test_group_selected = test_group_data[test_selected_columns]

In [None]:
# Report which raw columns were left out of the modelling subset
not_in_list = sorted(set(df2.columns).difference(control_selected_columns))

print("Columns not in the given list:", not_in_list)

In [None]:
# Print the selected column labels of each group, alphabetically, for comparison
for selected_frame in (control_group_selected, test_group_selected):
    print(sorted(selected_frame.columns))

In [None]:
# Fill every missing value with 0 in both group frames.
# NOTE(review): after this step a 0 cannot be told apart from a real
# measurement of 0 — confirm that zero-imputation is intended.
control_group_selected = control_group_selected.fillna(value=0)
test_group_selected = test_group_selected.fillna(value=0)

In [None]:
# This cell was adapted from PRIYANK RAVAL's Kaggle notebook: https://www.kaggle.com/code/priyankraval/eyet-empathyscore-ipynb#Step-3-:-#Load-eyetzip_data_with_score.csv-with-empathy-score-for-data-analysis
# Work on explicit copies so the column assignments below do not trigger
# pandas' SettingWithCopyWarning
control_group_selected = control_group_selected.copy()
test_group_selected = test_group_selected.copy()

# (new column, numerator column, denominator column) for each axis
ratio_inputs = [
    ('Eye_Position_Ratio_X', 'Eye position left X (DACSmm)', 'Eye position right X (DACSmm)'),
    ('Eye_Position_Ratio_Y', 'Eye position left Y (DACSmm)', 'Eye position right Y (DACSmm)'),
    ('Eye_Position_Ratio_Z', 'Eye position left Z (DACSmm)', 'Eye position right Z (DACSmm)'),
]

# Left/right ratio per axis; the 1e-6 offset keeps the denominator away from an exact zero
for ratio_col, left_col, right_col in ratio_inputs:
    for frame in (control_group_selected, test_group_selected):
        frame[ratio_col] = frame[left_col] / (frame[right_col] + 1e-6)


In [None]:
# Drop the raw eye-position columns now that the left/right ratios replace them.
columns_to_drop = ['Eye position left X (DACSmm)', 'Eye position right X (DACSmm)',
                   'Eye position left Y (DACSmm)', 'Eye position right Y (DACSmm)',
                   'Eye position left Z (DACSmm)', 'Eye position right Z (DACSmm)',
                   'Gaze point left Y', 'Gaze point right X', 'Gaze point right Y',
                   ]

# errors="ignore": the 'Gaze point left/right ...' labels are not among the
# columns selected for these frames, and DataFrame.drop raises KeyError on
# missing labels by default. Reassigning (instead of inplace=True) keeps the
# cell safe to re-run.
control_group_selected = control_group_selected.drop(columns=columns_to_drop, errors="ignore")
test_group_selected = test_group_selected.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# This cell was adapted from PRIYANK RAVAL's Kaggle notebook: https://www.kaggle.com/code/priyankraval/eyet-empathyscore-ipynb#Step-3-:-#Load-eyetzip_data_with_score.csv-with-empathy-score-for-data-analysis
# Columns excluded from the feature matrix: the target itself, the participant
# identifier, the recording duration and both pupil-diameter measurements
non_feature_columns = ['Total Score extended', 'Participant name', 'Recording duration', 'Pupil diameter left', 'Pupil diameter right']

# Control group: features (X) and target (y)
y_control_group = control_group_selected['Total Score extended']
X_control_group = control_group_selected.drop(columns=non_feature_columns)

# Test group: features (X) and target (y)
y_t_group = test_group_selected['Total Score extended']
X_t_group = test_group_selected.drop(columns=non_feature_columns)

In [None]:
# Check which feature columns remain in each design matrix
print(X_t_group.columns)
print(X_control_group.columns)

In [None]:
# Target proportions of each split (train / validation / test)
train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

# Fixed seed so the splits — and every metric and tuned hyperparameter derived
# from them — are identical after Restart Kernel -> Run All
SPLIT_SEED = 42

# NOTE(review): rows are split individually. If one participant contributes
# many rows, the same participant can end up in train, validation and test;
# consider a grouped split if that applies — confirm against the data.

# Control group: first hold out 20 %, then halve the hold-out into validation and test
X_control_train, X_control_test, y_control_train, y_control_test = train_test_split(
    X_control_group, y_control_group, test_size=1 - train_ratio, random_state=SPLIT_SEED)

x_control_val, x_control_test, y_control_val, y_control_test = train_test_split(
    X_control_test, y_control_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=SPLIT_SEED)

print(X_control_train.shape, x_control_val.shape, x_control_test.shape)
print(y_control_train.shape, y_control_val.shape, y_control_test.shape)

# Test group: same two-stage procedure
X_t_train, X_t_test, y_t_train, y_t_test = train_test_split(
    X_t_group, y_t_group, test_size=1 - train_ratio, random_state=SPLIT_SEED)

x_t_val, x_t_test, y_t_val, y_t_test = train_test_split(
    X_t_test, y_t_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=SPLIT_SEED)

print(X_t_train.shape, x_t_val.shape, x_t_test.shape)
print(y_t_train.shape, y_t_val.shape, y_t_test.shape)



## Linear Regression

### Control

In [None]:
# Fit a linear-regression baseline on the control-group training split
linear_model_control = LinearRegression().fit(X_control_train, y_control_train)

# Predict the held-out control-group test split
y_control_pred = linear_model_control.predict(x_control_test)

# Error metrics on the test split
mse = mean_squared_error(y_true=y_control_test, y_pred=y_control_pred)
r2 = r2_score(y_true=y_control_test, y_pred=y_control_pred)
mae = mean_absolute_error(y_true=y_control_test, y_pred=y_control_pred)
print("control group")
print("Mean Squared Error: {:.3f}".format(mse))
print("R-squared: {:.3f}".format(r2))
print("Mean Absolute Error: {:.3f}".format(mae))

In [None]:
# Absolute coefficient size per feature, used here as an importance proxy.
# NOTE(review): the features are not standardised in this notebook, so the
# magnitudes depend on each feature's units — interpret with care.
feature_names = list(X_control_train.columns)
feature_importance = np.abs(linear_model_control.coef_)

# One line per feature: name and coefficient magnitude to 3 decimals
for name, importance in zip(feature_names, feature_importance):
    print(f'{name}: {importance:.3f}')


In [None]:
# Horizontal bar chart of the control-group coefficient magnitudes, saved as JPG
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feature_names, feature_importance)
ax.set_xlabel('Coefficient Magnitude')
ax.set_title('Linear Regression Feature Importance Control Group without Pupil dilation')
fig.savefig("lr_FI_control_wp.jpg", bbox_inches='tight')
plt.show()

### Test 

In [None]:
# Fit a linear-regression baseline on the test-group training split
linear_model_test = LinearRegression().fit(X_t_train, y_t_train)

# Predict the held-out test-group test split
y_pred = linear_model_test.predict(x_t_test)

# Error metrics on the test split
mse = mean_squared_error(y_true=y_t_test, y_pred=y_pred)
r2 = r2_score(y_true=y_t_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_t_test, y_pred=y_pred)
print("test group")
print("Mean Squared Error: {:.3f}".format(mse))
print("R-squared: {:.3f}".format(r2))
print("Mean Absolute Error: {:.3f}".format(mae))


In [None]:
# Absolute coefficient size per feature for the test-group model.
# NOTE(review): the features are not standardised in this notebook, so the
# magnitudes depend on each feature's units — interpret with care.
feature_names = list(X_t_train.columns)
feature_importance = np.abs(linear_model_test.coef_)

# One line per feature: name and coefficient magnitude to 3 decimals
for name, importance in zip(feature_names, feature_importance):
    print(f'{name}: {importance:.3f}')


In [None]:
# Horizontal bar chart of the test-group coefficient magnitudes, saved as JPG.
# The title names the Test Group: this figure is built from the test-group
# model and written to lr_FI_test_wp.jpg, so a "Control Group" title would
# mislabel it.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feature_names, feature_importance)
ax.set_xlabel('Coefficient Magnitude')
ax.set_title('Linear Regression Feature Importance Test Group without Pupil dilation')
fig.savefig("lr_FI_test_wp.jpg", bbox_inches='tight')
plt.show()

In [None]:
# This cell was adapted from PRIYANK RAVAL's Kaggle notebook: https://www.kaggle.com/code/priyankraval/eyet-empathyscore-ipynb#Step-3-:-#Load-eyetzip_data_with_score.csv-with-empathy-score-for-data-analysis
fig, ax = plt.subplots(figsize=(12, 6))

# Test (treatment) group: predictions, plus the actual values plotted against themselves
ax.scatter(y_t_test, y_pred, color='blue', label='Test Predictions')
ax.scatter(y_t_test, y_t_test, color="red", label='Test Actual')

# Control group: same two layers
ax.scatter(y_control_test, y_control_pred, color='purple', label='Control Predictions')
ax.scatter(y_control_test, y_control_test, color="green", label='Control Actual')

# Dashed identity line across the combined range of both groups' actual scores
score_min = min(y_t_test.min(), y_control_test.min())
score_max = max(y_t_test.max(), y_control_test.max())
ax.plot([score_min, score_max], [score_min, score_max], color='black', linestyle='--')

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted Linear Regression without pupil dilation')
ax.legend()
fig.savefig("score_best_lr_WP.jpg", bbox_inches='tight')
plt.show()


# Decision Tree regressor

### Base model & tuning


In [None]:
# --- Control group: untuned baseline tree (max_depth=3 is a starting value) ---
dt_model_control = DecisionTreeRegressor(max_depth=3, random_state=42)
dt_model_control.fit(X_control_train, y_control_train)

# Predict and score the control-group test split
y_control_pred = dt_model_control.predict(x_control_test)

mse = mean_squared_error(y_true=y_control_test, y_pred=y_control_pred)
r2 = r2_score(y_true=y_control_test, y_pred=y_control_pred)
mae = mean_absolute_error(y_true=y_control_test, y_pred=y_control_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)


# --- Test group: same baseline configuration ---
dt_model_test = DecisionTreeRegressor(max_depth=3, random_state=42).fit(X_t_train, y_t_train)

# Predict and score the test-group test split
y_pred = dt_model_test.predict(x_t_test)

mse = mean_squared_error(y_true=y_t_test, y_pred=y_pred)
r2 = r2_score(y_true=y_t_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_t_test, y_pred=y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

### Tuning control

In [None]:
# Define Objective Function
def objective(params):
    """Hyperopt objective: validation MSE of a decision tree built from ``params``.

    Args:
        params: dict of DecisionTreeRegressor keyword arguments sampled from ``space``.

    Returns:
        dict with 'loss' (MSE on the control-group validation split) and 'status'.
    """
    dt = DecisionTreeRegressor(**params, random_state=42)
    dt.fit(X_control_train, y_control_train)  # Fit on the training split
    y_pred = dt.predict(x_control_val)  # Score on the validation split
    mse = mean_squared_error(y_control_val, y_pred)
    return {'loss': mse, 'status': STATUS_OK}

# Define Search Space
space = {
    'max_depth': hp.choice('max_depth', range(1, 11)),  # Vary max_depth from 1 to 10
    'min_samples_split': hp.choice('min_samples_split', range(2, 11)),  # Vary min_samples_split from 2 to 10
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 11)),  # Vary min_samples_leaf from 1 to 10
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None])  # Vary max_features
}

# Run Hyperparameter Optimization
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=20, trials=trials)

# fmin reports hp.choice parameters as the INDEX of the chosen option, not the
# option itself (e.g. max_depth index 8 means max_depth=9 here). space_eval
# maps the indices back to the actual hyperparameter values.
best_params = space_eval(space, best)

# Print the best hyperparameters (decoded values)
print("Best hyperparameters control group :", best_params)
# Output recorded from an earlier run that printed the raw, index-encoded `best` dict:
#Best hyperparameters control group : {'max_depth': 8, 'max_features': 2, 'min_samples_leaf': 7, 'min_samples_split': 5}

### Tuning Test

In [None]:
def objective(params):
    """Hyperopt objective: validation MSE of a decision tree built from ``params``.

    Args:
        params: dict of DecisionTreeRegressor keyword arguments sampled from ``space``.

    Returns:
        dict with 'loss' (MSE on the test-group validation split) and 'status'.
    """
    dt = DecisionTreeRegressor(**params, random_state=42)
    dt.fit(X_t_train, y_t_train)  # Fit on the training split
    y_pred = dt.predict(x_t_val)  # Score on the validation split
    mse = mean_squared_error(y_t_val, y_pred)
    return {'loss': mse, 'status': STATUS_OK}

# Define Search Space
space = {
    'max_depth': hp.choice('max_depth', range(1, 11)),  # Vary max_depth from 1 to 10
    'min_samples_split': hp.choice('min_samples_split', range(2, 11)),  # Vary min_samples_split from 2 to 10
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 11)),  # Vary min_samples_leaf from 1 to 10
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None])  # Vary max_features
}

# Run Hyperparameter Optimization
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=20, trials=trials)

# fmin reports hp.choice parameters as the INDEX of the chosen option, not the
# option itself (e.g. max_depth index 9 means max_depth=10 here). space_eval
# maps the indices back to the actual hyperparameter values.
best_params = space_eval(space, best)

# Print the best hyperparameters (decoded values)
print("Best hyperparameters test group :", best_params)
# Output recorded from an earlier run that printed the raw, index-encoded `best` dict:
#Best hyperparameters test group : {'max_depth': 9, 'max_features': 2, 'min_samples_leaf': 9, 'min_samples_split': 4}

### Best model control

In [None]:
# Hyperparameters copied by hand from the control-group tuning run.
# NOTE(review): hyperopt's fmin() reports hp.choice parameters as indices into
# the option list, not as the chosen values — confirm these integers were
# decoded before being copied here.
control_dt_params = {
    'max_depth': 8,
    'max_features': None,
    'min_samples_leaf': 7,
    'min_samples_split': 5,
    'random_state': 42,
}
best_control_dt = DecisionTreeRegressor(**control_dt_params).fit(X_control_train, y_control_train)

# Predict the control-group test split
y_control_pred = best_control_dt.predict(x_control_test)

# Error metrics on the test split
mse = mean_squared_error(y_true=y_control_test, y_pred=y_control_pred)
r2 = r2_score(y_true=y_control_test, y_pred=y_control_pred)
mae = mean_absolute_error(y_true=y_control_test, y_pred=y_control_pred)
print("control group")
print("Mean Squared Error: {:.3f}".format(mse))
print("R-squared: {:.3f}".format(r2))
print("Mean Absolute Error: {:.3f}".format(mae))

#### feature importance 

In [None]:
# Importances of the tuned control-group tree, paired with the training column names
feature_names = list(X_control_train.columns)
feature_importance = best_control_dt.feature_importances_

# One line per feature, importance to 3 decimals
for name, importance in zip(feature_names, feature_importance):
    print(f'{name}: {importance:.3f}')

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the control-group tree importances, saved as JPG
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feature_names, feature_importance)
ax.set_xlabel('Feature Importance')
ax.set_title('Decision Tree RegressorFeature Importance Control Group without pupil dilation')
fig.savefig("dt_FI_control_wp.jpg", bbox_inches='tight')

plt.show()

### Best model Test 

In [None]:
# Hyperparameters copied by hand from the test-group tuning run.
# NOTE(review): hyperopt's fmin() reports hp.choice parameters as indices into
# the option list, not as the chosen values — confirm these integers were
# decoded before being copied here.
test_dt_params = {
    'max_depth': 9,
    'max_features': None,
    'min_samples_leaf': 9,
    'min_samples_split': 4,
    'random_state': 42,
}
best_t_dt = DecisionTreeRegressor(**test_dt_params).fit(X_t_train, y_t_train)

# Predict the test-group test split
y_pred = best_t_dt.predict(x_t_test)

# Error metrics on the test split
mse = mean_squared_error(y_true=y_t_test, y_pred=y_pred)
r2 = r2_score(y_true=y_t_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_t_test, y_pred=y_pred)
print("test group")
print("Mean Squared Error: {:.3f}".format(mse))
print("R-squared: {:.3f}".format(r2))
print("Mean Absolute Error: {:.3f}".format(mae))

#### Feature importance

In [None]:
# Importances of the tuned test-group tree, paired with the training column names
feature_names = list(X_t_train.columns)
feature_importance = best_t_dt.feature_importances_

# One line per feature, importance to 3 decimals
for name, importance in zip(feature_names, feature_importance):
    print(f'{name}: {importance:.3f}')

In [None]:
# Horizontal bar chart of the test-group tree importances, saved as JPG
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feature_names, feature_importance)
ax.set_xlabel('Feature Importance')
ax.set_title('Decision Tree RegressorFeature Importance Test Group without pupil dilation')
fig.savefig("dt_FI_test_wp.jpg", bbox_inches='tight')

plt.show()

#### visual 

In [None]:
# This cell was adapted from PRIYANK RAVAL's Kaggle notebook: https://www.kaggle.com/code/priyankraval/eyet-empathyscore-ipynb#Step-3-:-#Load-eyetzip_data_with_score.csv-with-empathy-score-for-data-analysis
fig, ax = plt.subplots(figsize=(12, 6))

# Test (treatment) group: predictions, plus the actual values plotted against themselves
ax.scatter(y_t_test, y_pred, color='blue', label='Test Predictions')
ax.scatter(y_t_test, y_t_test, color="red", label='Test Actual')

# Control group: same two layers
ax.scatter(y_control_test, y_control_pred, color='purple', label='Control Predictions')
ax.scatter(y_control_test, y_control_test, color="green", label='Control Actual')

# Dashed identity line across the combined range of both groups' actual scores
score_min = min(y_t_test.min(), y_control_test.min())
score_max = max(y_t_test.max(), y_control_test.max())
ax.plot([score_min, score_max], [score_min, score_max], color='black', linestyle='--')

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted Decision Tree Regressor without pupil dilation')
ax.legend()
fig.savefig("score_best_dt_WP.jpg", bbox_inches='tight')
plt.show()

# Gradient Boosting Regressor

#### base model 

In [None]:
# --- Control group: gradient boosting with the stated starting settings ---
print("Gradient Boosting Regressor - Control Group:")
gb_model_control = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model_control.fit(X_control_train, y_control_train)

# Predict and score the control-group test split
y_control_pred = gb_model_control.predict(x_control_test)

mse = mean_squared_error(y_true=y_control_test, y_pred=y_control_pred)
r2 = r2_score(y_true=y_control_test, y_pred=y_control_pred)
mae = mean_absolute_error(y_true=y_control_test, y_pred=y_control_pred)
print("control group")
print("Mean Squared Error: {:.3f}".format(mse))
print("R-squared: {:.3f}".format(r2))
print("Mean Absolute Error: {:.3f}".format(mae))


# --- Test group: same starting settings ---
print("Gradient Boosting Regressor - test Group:")
gb_model_test = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model_test.fit(X_t_train, y_t_train)

# Predict and score the test-group test split
y_pred = gb_model_test.predict(x_t_test)

mse = mean_squared_error(y_true=y_t_test, y_pred=y_pred)
r2 = r2_score(y_true=y_t_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_t_test, y_pred=y_pred)
print("test group")
print("Mean Squared Error: {:.3f}".format(mse))
print("R-squared: {:.3f}".format(r2))
print("Mean Absolute Error: {:.3f}".format(mae))

### Tuning

In [None]:
# Define Objective Function
def objective(params):
    """Hyperopt objective: validation MSE of a gradient-boosting model built from ``params``.

    Args:
        params: dict of GradientBoostingRegressor keyword arguments sampled from ``space``.

    Returns:
        dict with 'loss' (MSE on the control-group validation split) and 'status'.
    """
    gb = GradientBoostingRegressor(**params, random_state=42)
    gb.fit(X_control_train, y_control_train)  # Fit on the training split
    y_pred = gb.predict(x_control_val)  # Score on the validation split
    mse = mean_squared_error(y_control_val, y_pred)
    return {'loss': mse, 'status': STATUS_OK}

# Define Search Space
space = {
    'n_estimators': hp.choice('n_estimators', range(50, 201, 20)),  # Vary number of trees: 50, 70, ..., 190
    'learning_rate': hp.loguniform('learning_rate', -3, 0),  # Vary learning rate log-uniformly between e^-3 and 1
    'max_depth': hp.choice('max_depth', range(1, 11)),  # Vary max_depth from 1 to 10
    'min_samples_split': hp.choice('min_samples_split', range(2, 11)),  # Vary min_samples_split from 2 to 10
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 11)),  # Vary min_samples_leaf from 1 to 10
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None])  # Vary max_features
}
# Run Hyperparameter Optimization
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=20, trials=trials)

# fmin reports hp.choice parameters as the INDEX of the chosen option, not the
# option itself (e.g. n_estimators index 5 means 150 trees here). space_eval
# maps the indices back to the actual hyperparameter values.
best_params = space_eval(space, best)

# Print the best hyperparameters (decoded values)
print("Best hyperparameters control group :", best_params)
# Output recorded from an earlier run that printed the raw, index-encoded `best` dict:
#Best hyperparameters control group : {'learning_rate': 0.1930943989046583, 'max_depth': 8, 'max_features': 1, 'min_samples_leaf': 0, 'min_samples_split': 2, 'n_estimators': 5}

In [None]:
def objective(params):
    """Hyperopt objective: validation MSE of a gradient-boosting model built from ``params``.

    Args:
        params: dict of GradientBoostingRegressor keyword arguments sampled from ``space``.

    Returns:
        dict with 'loss' (MSE on the test-group validation split) and 'status'.
    """
    gbt = GradientBoostingRegressor(**params, random_state=42)
    gbt.fit(X_t_train, y_t_train)  # Fit on the training split
    y_pred = gbt.predict(x_t_val)  # Score on the validation split
    mse = mean_squared_error(y_t_val, y_pred)
    return {'loss': mse, 'status': STATUS_OK}

# Define Search Space
space = {
    'n_estimators': hp.choice('n_estimators', range(50, 201, 20)),  # Vary number of trees: 50, 70, ..., 190
    'learning_rate': hp.loguniform('learning_rate', -3, 0),  # Vary learning rate log-uniformly between e^-3 and 1
    'max_depth': hp.choice('max_depth', range(1, 11)),  # Vary max_depth from 1 to 10
    'min_samples_split': hp.choice('min_samples_split', range(2, 11)),  # Vary min_samples_split from 2 to 10
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 11)),  # Vary min_samples_leaf from 1 to 10
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None])  # Vary max_features
}
# Run Hyperparameter Optimization
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=20, trials=trials)

# fmin reports hp.choice parameters as the INDEX of the chosen option, not the
# option itself (e.g. n_estimators index 4 means 130 trees here). space_eval
# maps the indices back to the actual hyperparameter values.
best_params = space_eval(space, best)

# Print the best hyperparameters (decoded values)
print("Best hyperparameters test group :", best_params)
# Output recorded from an earlier run that printed the raw, index-encoded `best` dict:
#Best hyperparameters test group : {'learning_rate': 0.38566707194790334, 'max_depth': 9, 'max_features': 2, 'min_samples_leaf': 3, 'min_samples_split': 0, 'n_estimators': 4}

### Best model

#### Control

In [None]:
# Hyperparameters copied by hand from a control-group tuning run.
# NOTE(review): hyperopt's fmin() reports hp.choice parameters as indices into
# the option list, not as the chosen values — confirm the integer settings
# below were decoded before being copied here.
control_gb_params = {
    'learning_rate': 0.3256474619914467,
    'max_depth': 9,
    'max_features': 'log2',
    'min_samples_leaf': 9,
    'min_samples_split': 6,
    'n_estimators': 90,
    'random_state': 42,
}

print("Gradient Boosting Regressor - Control Group:")
gb_model_control = GradientBoostingRegressor(**control_gb_params)
gb_model_control.fit(X_control_train, y_control_train)

# Predict the control-group test split
y_control_pred = gb_model_control.predict(x_control_test)

# Error metrics on the test split
mse = mean_squared_error(y_true=y_control_test, y_pred=y_control_pred)
r2 = r2_score(y_true=y_control_test, y_pred=y_control_pred)
mae = mean_absolute_error(y_true=y_control_test, y_pred=y_control_pred)
print("control group")
print("Mean Squared Error: {:.3f}".format(mse))
print("R-squared: {:.3f}".format(r2))
print("Mean Absolute Error: {:.3f}".format(mae))

#### feature importance

In [None]:
# Importances of the control-group gradient-boosting model, paired with the training column names
feature_names = list(X_control_train.columns)
feature_importance = gb_model_control.feature_importances_

# One line per feature, importance to 3 decimals
for name, importance in zip(feature_names, feature_importance):
    print(f'{name}: {importance:.3f}')

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the control-group gradient-boosting importances, saved as JPG
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feature_names, feature_importance)
ax.set_xlabel('Feature Importance')
ax.set_title('Gradient Boosting Regressor Feature Importance for control group without pupil dilation ')
fig.savefig("gb_FI_control_wp.jpg", bbox_inches='tight')

plt.show()

### Test

In [None]:
# Hyperparameters copied by hand from a test-group tuning run.
# NOTE(review): hyperopt's fmin() reports hp.choice parameters as indices into
# the option list, not as the chosen values — confirm the integer settings
# below were decoded before being copied here.
test_gb_params = {
    'learning_rate': 0.38566707194790334,
    'max_depth': 9,
    'max_features': None,
    'min_samples_leaf': 3,
    'min_samples_split': 2,
    'n_estimators': 110,
    'random_state': 42,
}

print("Gradient Boosting Regressor - test Group:")
gb_model_test = GradientBoostingRegressor(**test_gb_params)
gb_model_test.fit(X_t_train, y_t_train)

# Predict the test-group test split
y_pred = gb_model_test.predict(x_t_test)

# Error metrics on the test split
mse = mean_squared_error(y_true=y_t_test, y_pred=y_pred)
r2 = r2_score(y_true=y_t_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_t_test, y_pred=y_pred)
print("test group")
print("Mean Squared Error: {:.3f}".format(mse))
print("R-squared: {:.3f}".format(r2))
print("Mean Absolute Error: {:.3f}".format(mae))

#### feature importance 

In [None]:
# Importances of the test-group gradient-boosting model, paired with the training column names
feature_names = list(X_t_train.columns)
feature_importance = gb_model_test.feature_importances_

# One line per feature, importance to 3 decimals
for name, importance in zip(feature_names, feature_importance):
    print(f'{name}: {importance:.3f}')

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the test-group gradient-boosting importances, saved as JPG
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feature_names, feature_importance)
ax.set_xlabel('Feature Importance')
ax.set_title('Gradient Boosting Regressor Feature Importance Test Group without pupil dilation ')
fig.savefig("gb_FI_test_wp.jpg", bbox_inches='tight')

plt.show()

### Visual 

In [None]:
# This cell was adapted from PRIYANK RAVAL's Kaggle notebook: https://www.kaggle.com/code/priyankraval/eyet-empathyscore-ipynb#Step-3-:-#Load-eyetzip_data_with_score.csv-with-empathy-score-for-data-analysis
fig, ax = plt.subplots(figsize=(12, 6))

# Test (treatment) group: predictions, plus the actual values plotted against themselves
ax.scatter(y_t_test, y_pred, color='blue', label='Test Predictions')
ax.scatter(y_t_test, y_t_test, color="red", label='Test Actual')

# Control group: same two layers
ax.scatter(y_control_test, y_control_pred, color='purple', label='Control Predictions')
ax.scatter(y_control_test, y_control_test, color="green", label='Control Actual')

# Dashed identity line across the combined range of both groups' actual scores
score_min = min(y_t_test.min(), y_control_test.min())
score_max = max(y_t_test.max(), y_control_test.max())
ax.plot([score_min, score_max], [score_min, score_max], color='black', linestyle='--')

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
# The title names the model and feature set so the saved figure
# (score_best_gb_WP.jpg) is identifiable on its own.
ax.set_title('Actual vs Predicted Gradient Boosting Regressor without pupil dilation')
ax.legend()
fig.savefig("score_best_gb_WP.jpg", bbox_inches='tight')
plt.show()

## ADA BOOSTING

### base model 

In [None]:
# --- Control group: AdaBoost baseline with 50 estimators ---
ab_model_control = AdaBoostRegressor(n_estimators=50, random_state=42).fit(X_control_train, y_control_train)

# Make predictions on the control-group testing set
y_control_pred = ab_model_control.predict(x_control_test)

# Evaluate the model
mse = mean_squared_error(y_control_test, y_control_pred)
r2 = r2_score(y_control_test, y_control_pred)
mae = mean_absolute_error(y_control_test, y_control_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)


# --- Test group: AdaBoost baseline with 50 estimators ---
ab_model_test = AdaBoostRegressor(n_estimators=50, random_state=42)
ab_model_test.fit(X_t_train, y_t_train)

# Predict with the AdaBoost model fitted just above. Predicting with the
# decision-tree baseline (dt_model_test) here would report that tree's
# metrics under the AdaBoost heading.
y_pred = ab_model_test.predict(x_t_test)

# Evaluate the model
mse = mean_squared_error(y_t_test, y_pred)
r2 = r2_score(y_t_test, y_pred)
mae = mean_absolute_error(y_t_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

### Tuning

In [None]:
# Define Objective Function
def objective(params):
    """Hyperopt objective: validation MSE of an AdaBoost regressor built from ``params``.

    Args:
        params: dict of AdaBoostRegressor keyword arguments sampled from ``space``.

    Returns:
        dict with 'loss' (MSE on the control-group validation split) and 'status'.
    """
    # Initialize AdaBoost regressor with the sampled parameters
    ab_control = AdaBoostRegressor(**params, random_state=42)

    # Train the model on the training set
    ab_control.fit(X_control_train, y_control_train)

    # Make predictions on the validation set
    y_pred = ab_control.predict(x_control_val)

    # Calculate mean squared error
    mse = mean_squared_error(y_control_val, y_pred)

    return {'loss': mse, 'status': STATUS_OK}

# Define Search Space
space = {
    'n_estimators': hp.choice('n_estimators', range(50, 500, 1)),
    'learning_rate': hp.loguniform('learning_rate', -4, 0),
    'loss': hp.choice('loss', ['linear', 'square', 'exponential']),
    'estimator': hp.choice('estimator', [None, DecisionTreeRegressor(max_depth=5),best_control_dt])
}


# Run Hyperparameter Optimization
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=20, trials=trials)

# fmin reports hp.choice parameters as the INDEX of the chosen option, not the
# option itself (e.g. n_estimators index 233 means 283 estimators here).
# space_eval maps the indices back to the actual hyperparameter values.
best_params = space_eval(space, best)

# Print the best hyperparameters (decoded values)
print("Best hyperparameters control group :", best_params)
# Output recorded from an earlier run that printed the raw, index-encoded `best` dict:
#Best hyperparameters control group : {'estimator': 2, 'learning_rate': 0.0368989361218225, 'loss': 0, 'n_estimators': 233}


In [None]:
# Define Objective Function
def objective(params):
    """Hyperopt objective: validation MSE of an AdaBoost regressor built from ``params``.

    Args:
        params: dict of AdaBoostRegressor keyword arguments sampled from ``space``.

    Returns:
        dict with 'loss' (MSE on the test-group validation split) and 'status'.
    """
    # Initialize AdaBoost regressor with the sampled parameters
    ab_t = AdaBoostRegressor(**params, random_state=42)

    # Train the model on the training set
    ab_t.fit(X_t_train, y_t_train)

    # Make predictions on the validation set
    y_pred = ab_t.predict(x_t_val)

    # Calculate mean squared error
    mse = mean_squared_error(y_t_val, y_pred)

    return {'loss': mse, 'status': STATUS_OK}

# Define Search Space
space = {
    'n_estimators': hp.choice('n_estimators', range(50, 500, 1)),
    'learning_rate': hp.loguniform('learning_rate', -4, 0),
    'loss': hp.choice('loss', ['linear', 'square', 'exponential']),
    'estimator': hp.choice('estimator', [None, DecisionTreeRegressor(max_depth=5),best_t_dt])
}


# Run Hyperparameter Optimization
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=20, trials=trials)

# fmin reports hp.choice parameters as the INDEX of the chosen option, not the
# option itself (e.g. n_estimators index 61 means 111 estimators here).
# space_eval maps the indices back to the actual hyperparameter values.
best_params = space_eval(space, best)

# Print the best hyperparameters (decoded values)
print("Best hyperparameters test group :", best_params)
# Output recorded from an earlier run that printed the raw, index-encoded `best` dict:
#Best hyperparameters test group : {'estimator': 2, 'learning_rate': 0.02848590919273431, 'loss': 2, 'n_estimators': 61}

### Best models

#### control

In [None]:
# Settings copied by hand from the control-group tuning output recorded below.
# NOTE(review): hyperopt's fmin() reports hp.choice parameters as indices into
# the option list, not as the chosen values — confirm n_estimators was decoded
# before being copied here.
#Best hyperparameters control group : {'estimator': 2, 'learning_rate': 0.0368989361218225, 'loss': 0, 'n_estimators': 233}
control_ab_params = {
    'estimator': best_control_dt,
    'learning_rate': 0.0368989361218225,
    'n_estimators': 233,
    'loss': 'linear',
    'random_state': 42,
}
ab_model_control = AdaBoostRegressor(**control_ab_params).fit(X_control_train, y_control_train)

# Predict the control-group test split
y_control_pred = ab_model_control.predict(x_control_test)

# Error metrics on the test split
mse = mean_squared_error(y_true=y_control_test, y_pred=y_control_pred)
r2 = r2_score(y_true=y_control_test, y_pred=y_control_pred)
mae = mean_absolute_error(y_true=y_control_test, y_pred=y_control_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

#### feature importance

In [None]:
# Importances of the control-group AdaBoost model, paired with the training column names
feature_names = list(X_control_train.columns)
feature_importance = ab_model_control.feature_importances_

# One line per feature, importance to 3 decimals
for name, importance in zip(feature_names, feature_importance):
    print(f'{name}: {importance:.3f}')

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the control-group AdaBoost importances, saved as JPG
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feature_names, feature_importance)
ax.set_xlabel('Feature Importance')
ax.set_title('ADA Boosting Regressor Feature Importance for control group without pupil dilation ')
fig.savefig("ab_FI_control_wp.jpg", bbox_inches='tight')

plt.show()

### Test

In [None]:
# Settings copied by hand from the test-group tuning output recorded below.
# NOTE(review): hyperopt's fmin() reports hp.choice parameters as indices into
# the option list, not as the chosen values — confirm n_estimators was decoded
# before being copied here.
#Best hyperparameters test group : {'estimator': 2, 'learning_rate': 0.02848590919273431, 'loss': 2, 'n_estimators': 61}
test_ab_params = {
    'estimator': best_t_dt,
    'learning_rate': 0.02848590919273431,
    'loss': 'exponential',
    'n_estimators': 61,
    'random_state': 42,
}
ab_model_test = AdaBoostRegressor(**test_ab_params)
ab_model_test.fit(X_t_train, y_t_train)

# Predict the test-group test split
y_pred = ab_model_test.predict(x_t_test)

# Error metrics on the test split
mse = mean_squared_error(y_true=y_t_test, y_pred=y_pred)
r2 = r2_score(y_true=y_t_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_t_test, y_pred=y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

#### feature importance 

In [None]:
# Importances of the test-group AdaBoost model, paired with the training column names
feature_names = list(X_t_train.columns)
feature_importance = ab_model_test.feature_importances_

# One line per feature, importance to 3 decimals
for name, importance in zip(feature_names, feature_importance):
    print(f'{name}: {importance:.3f}')

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the test-group AdaBoost importances, saved as JPG
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feature_names, feature_importance)
ax.set_xlabel('Feature Importance')
ax.set_title('ADA Boosting Regressor Feature Importance Test Group without pupil dilation ')
fig.savefig("ab_FI_test_wp.jpg", bbox_inches='tight')

plt.show()

### Visual 

In [None]:
# This cell was adapted from PRIYANK RAVAL's Kaggle notebook: https://www.kaggle.com/code/priyankraval/eyet-empathyscore-ipynb#Step-3-:-#Load-eyetzip_data_with_score.csv-with-empathy-score-for-data-analysis
fig, ax = plt.subplots(figsize=(12, 6))

# Test (treatment) group: predictions, plus the actual values plotted against themselves
ax.scatter(y_t_test, y_pred, color='blue', label='Test Predictions')
ax.scatter(y_t_test, y_t_test, color="red", label='Test Actual')

# Control group: same two layers
ax.scatter(y_control_test, y_control_pred, color='purple', label='Control Predictions')
ax.scatter(y_control_test, y_control_test, color="green", label='Control Actual')

# Dashed identity line across the combined range of both groups' actual scores
score_min = min(y_t_test.min(), y_control_test.min())
score_max = max(y_t_test.max(), y_control_test.max())
ax.plot([score_min, score_max], [score_min, score_max], color='black', linestyle='--')

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted of ADA Boosting Regressor without pupil dilation')
ax.legend()
fig.savefig("score_best_ab_WP.jpg", bbox_inches='tight')
plt.show()

# references


Raval, P. (2024). Eyetempathyscore.ipynb [Kaggle notebook]. Retrieved 11/01/2024,
from https://www.kaggle.com/code/priyankraval/eyet-empathyscore-ipynb