### Packages &#x1F4DA;

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_theme()
import pickle
import shap
from prettytable import PrettyTable
import matplotlib.pyplot as plt


In [None]:
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import  DecisionTreeRegressor, plot_tree, export_text
from sklearn.ensemble import RandomForestRegressor as RF

In [8]:
# Load the dataset CSV; fields in this file are separated by semicolons.
dataset_path = r'C:\Users\karol\OneDrive\Área de Trabalho\Mestrado\Códigos\base_completa.csv'
df = pd.read_csv(dataset_path, sep=';')

In [11]:
import random

# Split by `file` rather than by row: 70% of the distinct `file` values are
# sampled for training and every row follows its file, so no file contributes
# rows to both sides of the split.
random.seed(42)
unique_files = list(np.unique(df['file']))
ind_train_split = int(0.7*len(unique_files))
ind_train_split_list = random.sample(unique_files, ind_train_split)
train_files = set(ind_train_split_list)
# Remaining files, kept in the (sorted) order returned by np.unique.
ind_test_split_list = [f for f in unique_files if f not in train_files]

# One vectorised membership test instead of a per-row Python loop that rebuilt
# the lookup set on every iteration.
train_mask = df['file'].isin(train_files)
bool_train_split_list = train_mask.tolist()
bool_test_split_list = (~train_mask).tolist()

# NOTE(review): only these three columns are removed, so `file` itself stays in
# the feature matrix — confirm that is intended (identifier used as a feature).
non_feature_columns = ['frameTime', 'Arousal', 'Valence']
train_rows = df.loc[train_mask]
test_rows = df.loc[~train_mask]

X_train = train_rows.drop(columns=non_feature_columns)
X_test = test_rows.drop(columns=non_feature_columns)

y_train_arousal = train_rows['Arousal']
y_test_arousal = test_rows['Arousal']

y_train_valence = train_rows['Valence']
y_test_valence = test_rows['Valence']

### Models and metrics &#x1F52E;	

#### Linear Regression &#x1F4CF;	

In [None]:
# Ordinary least squares on standardised features, one pipeline per target.
LR_a = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('LinearRegression', LinearRegression()),
])
LR_v = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('LinearRegression', LinearRegression()),
])

LR_a.fit(X_train, y_train_arousal)
LR_v.fit(X_train, y_train_valence)

# Test-set predictions, computed once and shared by all three metrics.
lr_pred_arousal = LR_a.predict(X_test)
lr_pred_valence = LR_v.predict(X_test)

tabela_LR = PrettyTable()
tabela_LR.field_names = ["Métrica", "Arousal", "Valence"]

# One table row per metric, rounded to four decimals.
for metric_label, metric_fn in (("MSE", mean_squared_error),
                                ("MAE", mean_absolute_error),
                                ("R²", r2_score)):
    tabela_LR.add_row([
        metric_label,
        round(metric_fn(y_test_arousal, lr_pred_arousal), 4),
        round(metric_fn(y_test_valence, lr_pred_valence), 4),
    ])

print(tabela_LR)

#### Ridge Regression &#x1F535;	

In [None]:
# Candidate regularisation strengths: 100 values, log-spaced from 1e-4 to 1e4.
alphas = np.logspace(-4, 4, 100)

# Ridge regression on standardised features; RidgeCV selects alpha from the
# grid by its built-in cross-validation. `store_cv_values` is intentionally not
# passed: the stored per-sample CV values were never read in this notebook, and
# the keyword has been renamed in newer scikit-learn releases.
Ridge_a = Pipeline([
    ('Scaler', StandardScaler()),
    ('RidgeCV', RidgeCV(alphas=alphas))
])

Ridge_v = Pipeline([
    ('Scaler', StandardScaler()),
    ('RidgeCV', RidgeCV(alphas=alphas))
])

Ridge_a.fit(X_train, y_train_arousal)
Ridge_v.fit(X_train, y_train_valence)

# Test-set predictions, computed once and shared by all three metrics.
ridge_pred_arousal = Ridge_a.predict(X_test)
ridge_pred_valence = Ridge_v.predict(X_test)

tabela_Ridge = PrettyTable()
tabela_Ridge.field_names = ["Métrica", "Arousal", "Valence"]

for metric_label, metric_fn in (("MSE", mean_squared_error),
                                ("MAE", mean_absolute_error),
                                ("R²", r2_score)):
    tabela_Ridge.add_row([
        metric_label,
        round(metric_fn(y_test_arousal, ridge_pred_arousal), 4),
        round(metric_fn(y_test_valence, ridge_pred_valence), 4),
    ])

print(tabela_Ridge)
print("Melhor alpha para Arousal:", Ridge_a.named_steps['RidgeCV'].alpha_)
print("Melhor alpha para Valence:", Ridge_v.named_steps['RidgeCV'].alpha_)

#### Lasso Regression &#x1F537;	

In [None]:
# Candidate L1 penalties. The previous grid (1e-100 .. 1e-75) is numerically
# indistinguishable from alpha = 0, so all 100 candidates fitted the same
# unpenalised model and the cross-validated choice carried no information.
# NOTE(review): the 1e-4 .. 1e1 range assumes standardised features and targets
# of roughly unit scale — widen it if the selected alpha lands on a grid edge.
alphas = np.logspace(-4, 1, 100)

# Lasso regression on standardised features; alpha is chosen by 5-fold CV.
# NOTE(review): the CV folds are formed over rows, whereas the hold-out split
# above is by `file` — confirm row-wise folds are acceptable here.
Lasso_a = Pipeline([
    ('Scaler', StandardScaler()),
    ('LassoCV', LassoCV(alphas=alphas, cv=5, max_iter=10000))
])

Lasso_v = Pipeline([
    ('Scaler', StandardScaler()),
    ('LassoCV', LassoCV(alphas=alphas, cv=5, max_iter=10000))
])

Lasso_a.fit(X_train, y_train_arousal)
Lasso_v.fit(X_train, y_train_valence)

# Test-set predictions, computed once and shared by all three metrics.
lasso_pred_arousal = Lasso_a.predict(X_test)
lasso_pred_valence = Lasso_v.predict(X_test)

tabela_Lasso = PrettyTable()
tabela_Lasso.field_names = ["Métrica", "Arousal", "Valence"]

for metric_label, metric_fn in (("MSE", mean_squared_error),
                                ("MAE", mean_absolute_error),
                                ("R²", r2_score)):
    tabela_Lasso.add_row([
        metric_label,
        round(metric_fn(y_test_arousal, lasso_pred_arousal), 4),
        round(metric_fn(y_test_valence, lasso_pred_valence), 4),
    ])

# Show results
print(tabela_Lasso)
print("Melhor alpha para Arousal:", Lasso_a.named_steps['LassoCV'].alpha_)
print("Melhor alpha para Valence:", Lasso_v.named_steps['LassoCV'].alpha_)

#### Tree Regression &#x1F332;	

In [None]:
# Depth-limited regression trees (max_depth=10, fixed seed) on standardised
# features, one pipeline per target.
Tree_a = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('Tree', DecisionTreeRegressor(max_depth=10, random_state=42)),
])
Tree_v = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('Tree', DecisionTreeRegressor(max_depth=10, random_state=42)),
])

Tree_a.fit(X_train, y_train_arousal)
Tree_v.fit(X_train, y_train_valence)

# Test-set predictions, computed once and shared by all three metrics.
tree_pred_arousal = Tree_a.predict(X_test)
tree_pred_valence = Tree_v.predict(X_test)

tabela_Tree = PrettyTable()
tabela_Tree.field_names = ["Métrica", "Arousal", "Valence"]

# One table row per metric, rounded to four decimals.
for metric_label, metric_fn in (("MSE", mean_squared_error),
                                ("MAE", mean_absolute_error),
                                ("R²", r2_score)):
    tabela_Tree.add_row([
        metric_label,
        round(metric_fn(y_test_arousal, tree_pred_arousal), 4),
        round(metric_fn(y_test_valence, tree_pred_valence), 4),
    ])

print(tabela_Tree)

#### Random Forest &#x1F332;&#x1F332;&#x1F332;	

##### Arousal

In [None]:
# Random forest for the arousal target, trained on every column of X_train
# after standardisation.
rf_a_steps = [
    ('Scaler', StandardScaler()),
    ('rf_a', RF(
        n_estimators=1000,
        min_samples_leaf=5,
        max_features=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=5,
    )),
]
RF_a_full = Pipeline(steps=rf_a_steps)

RF_a_full.fit(X_train, y_train_arousal)

In [None]:
# RF_a_full is a plain Pipeline, not a GridSearchCV, so it has no
# `best_estimator_`; the fitted forest is reached through its step name.
importances_a = RF_a_full.named_steps['rf_a'].feature_importances_
# Positions of the 10 largest importances (argsort is ascending, so take the
# tail). StandardScaler keeps column order, so these index X_train's columns.
top10_idx_a = np.argsort(importances_a)[-10:]
X_train_a_top10 = X_train.iloc[:, top10_idx_a]
X_test_a_top10 = X_test.iloc[:, top10_idx_a]

# Same forest configuration as the full model, refitted on the 10 selected columns.
RF_a_top10 = Pipeline(steps = [('Scaler', StandardScaler()),
    ('rf_a10', RF(n_estimators = 1000, min_samples_leaf = 5, n_jobs = -1, verbose = 5, max_features=0.8, random_state=42))])


RF_a_top10.fit(X_train_a_top10, y_train_arousal)

In [None]:
# Test-set predictions of both arousal forests, computed once and reused for
# every metric. The top-10 model must go through `.predict`: a Pipeline object
# is not callable.
rf_a_pred_full = RF_a_full.predict(X_test)
rf_a_pred_top10 = RF_a_top10.predict(X_test_a_top10)

tabela_rf_a = PrettyTable()
tabela_rf_a.field_names = ["Métrica", "Arousal (full)", "Arousal (top10)"]

# One table row per metric, rounded to four decimals.
for metric_label, metric_fn in (("MSE", mean_squared_error),
                                ("MAE", mean_absolute_error),
                                ("R²", r2_score)):
    tabela_rf_a.add_row([
        metric_label,
        round(metric_fn(y_test_arousal, rf_a_pred_full), 4),
        round(metric_fn(y_test_arousal, rf_a_pred_top10), 4),
    ])

print(tabela_rf_a)

##### Valence

In [None]:
# Random forest for the valence target, trained on every column of X_train
# after standardisation.
rf_v_steps = [
    ('Scaler', StandardScaler()),
    ('rf_v', RF(
        n_estimators=1000,
        min_samples_leaf=5,
        max_features=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=5,
    )),
]
RF_v_full = Pipeline(steps=rf_v_steps)

RF_v_full.fit(X_train, y_train_valence)

In [None]:
# RF_v_full is a plain Pipeline, not a GridSearchCV, so it has no
# `best_estimator_`; the fitted forest is reached through its step name.
importances_v = RF_v_full.named_steps['rf_v'].feature_importances_
# Positions of the 10 largest importances (argsort is ascending, so take the
# tail). StandardScaler keeps column order, so these index X_train's columns.
top10_idx_v = np.argsort(importances_v)[-10:]
X_train_v_top10 = X_train.iloc[:, top10_idx_v]
X_test_v_top10 = X_test.iloc[:, top10_idx_v]

# Same forest configuration as the full model, refitted on the 10 selected columns.
RF_v_top10 = Pipeline(steps = [('Scaler', StandardScaler()),
    ('rf_v10', RF(n_estimators = 1000, min_samples_leaf = 5, n_jobs = -1, verbose = 5, max_features=0.8, random_state=42))])


RF_v_top10.fit(X_train_v_top10, y_train_valence)

In [None]:
# Test-set predictions of both valence forests, computed once and reused for
# every metric. The top-10 model must go through `.predict`: a Pipeline object
# is not callable.
rf_v_pred_full = RF_v_full.predict(X_test)
rf_v_pred_top10 = RF_v_top10.predict(X_test_v_top10)

tabela_rf_v = PrettyTable()
tabela_rf_v.field_names = ["Métrica", "Valence (full)", "Valence (top10)"]

# One table row per metric, rounded to four decimals.
for metric_label, metric_fn in (("MSE", mean_squared_error),
                                ("MAE", mean_absolute_error),
                                ("R²", r2_score)):
    tabela_rf_v.add_row([
        metric_label,
        round(metric_fn(y_test_valence, rf_v_pred_full), 4),
        round(metric_fn(y_test_valence, rf_v_pred_top10), 4),
    ])

print(tabela_rf_v)

##### Shapley - Arousal

In [None]:
# Initialise SHAP's JavaScript support in the notebook before plotting.
# NOTE(review): presumably only needed for SHAP's interactive (JS) plots —
# confirm whether the summary plots below require it.
shap.initjs()

In [None]:
# ====== FULL MODEL - AROUSAL ======
# RF_a_full is a plain Pipeline (no `best_estimator_`); the fitted forest is
# fetched by its step name.
explainer_a_full = shap.TreeExplainer(RF_a_full.named_steps['rf_a'])
# The forest was trained on StandardScaler output, so the rows being explained
# must be mapped into that same scaled space before computing SHAP values.
X_test_a_full_scaled = RF_a_full.named_steps['Scaler'].transform(X_test)
shap_values_a_full = explainer_a_full.shap_values(X_test_a_full_scaled)
# The unscaled frame is passed to the plot to supply feature names and values.
shap.summary_plot(shap_values_a_full, X_test)

In [None]:
# ====== TOP 10 - AROUSAL ======
# RF_a_top10 is a plain Pipeline (no `best_estimator_`); the fitted forest is
# fetched by its step name.
explainer_a_top10 = shap.TreeExplainer(RF_a_top10.named_steps['rf_a10'])
# The forest was trained on StandardScaler output, so the rows being explained
# must be mapped into that same scaled space before computing SHAP values.
X_test_a_top10_scaled = RF_a_top10.named_steps['Scaler'].transform(X_test_a_top10)
shap_values_a_top10 = explainer_a_top10.shap_values(X_test_a_top10_scaled)
# The unscaled frame is passed to the plot to supply feature names and values.
shap.summary_plot(shap_values_a_top10, X_test_a_top10)

##### Shapley - Valence

In [None]:
# Initialise SHAP's JavaScript support in the notebook before plotting.
# NOTE(review): presumably only needed for SHAP's interactive (JS) plots —
# confirm whether the summary plots below require it.
shap.initjs()

In [None]:
# ====== FULL MODEL - VALENCE ======
# RF_v_full is a plain Pipeline (no `best_estimator_`); the fitted forest is
# fetched by its step name.
explainer_v_full = shap.TreeExplainer(RF_v_full.named_steps['rf_v'])
# The forest was trained on StandardScaler output, so the rows being explained
# must be mapped into that same scaled space before computing SHAP values.
X_test_v_full_scaled = RF_v_full.named_steps['Scaler'].transform(X_test)
shap_values_v_full = explainer_v_full.shap_values(X_test_v_full_scaled)
# The unscaled frame is passed to the plot to supply feature names and values.
shap.summary_plot(shap_values_v_full, X_test)

In [None]:
# ====== TOP 10 - VALENCE ======
# RF_v_top10 is a plain Pipeline (no `best_estimator_`); the fitted forest is
# fetched by its step name.
explainer_v_top10 = shap.TreeExplainer(RF_v_top10.named_steps['rf_v10'])
# The forest was trained on StandardScaler output, so the rows being explained
# must be mapped into that same scaled space before computing SHAP values.
X_test_v_top10_scaled = RF_v_top10.named_steps['Scaler'].transform(X_test_v_top10)
shap_values_v_top10 = explainer_v_top10.shap_values(X_test_v_top10_scaled)
# The unscaled frame is passed to the plot to supply feature names and values.
shap.summary_plot(shap_values_v_top10, X_test_v_top10)