In [5]:
# setup: standard-library / third-party imports first, then project-local modules
import sys
import os
import pandas as pd

# NOTE(review): absolute, machine-specific path. It is presumably what makes the
# project-local imports below (config, regression, feature_selection) resolvable,
# which means this notebook only runs on a machine with this exact directory
# layout. Consider an editable install of the project or a configurable path.
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")
from config.constants import GIT_DIRECTORY
from regression.multiple_linear_regression import run_multiple_regression
from feature_selection.feature_selection_functions import (
    compute_correlation_matrix,
    calculate_vif,
    forward_selection,
    evaluate_on_test_set
)

# parameters: which task's feature file to load and which score column to predict
task_name = "cookieTheft"
target = "PhonemicFluencyScore"
# Single source of truth for the output location: every later cell passes this
# same variable, so the load step below must use it too rather than rebuilding
# the path from a second copy of the literal.
output_dir = os.path.join(GIT_DIRECTORY, "results/feature_selection")

# load standardized features and target
# Returns the fitted model, the full feature matrix / target (X_scaled, y) and the
# train/test split that all following steps reuse.
# save_outputs=False: presumably keeps this call from writing result files — verify
# against run_multiple_regression.
model, X_scaled, y, X_train, X_test, y_train, y_test = run_multiple_regression(
    features_path=os.path.join(GIT_DIRECTORY, f"results/features/filtered/{task_name}_filtered.csv"),
    scores_path=os.path.join(GIT_DIRECTORY, "resources/language_scores_all_subjects.csv"),
    target=target,
    output_dir=output_dir,
    task_name=task_name,
    save_outputs=False
)

In [2]:
# Step 1 — correlation matrix over the scaled features and the target.
# In the recorded run the helper reports saving a CSV and a PNG plot under output_dir.
corr_matrix = compute_correlation_matrix(
    X_scaled,
    y,
    task_name,
    target,
    output_dir,
)


Saved correlation matrix CSV to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_selection/correlation_matrix_cookieTheft_PhonemicFluencyScore.csv
Saved correlation matrix plot to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_selection/full_correlation_matrix_cookieTheft_PhonemicFluencyScore.png


In [3]:
# Step 2 — variance inflation factors, computed on the training split.
vif_df = calculate_vif(X_train)
# Optional follow-up: drop the high-VIF features by hand, or log them.
# NOTE(review): the recorded output shows VIF = inf for several POS features
# (VERB, SCONJ, PRON, ...) and NaN for PUNCT — presumably exact collinearity
# and/or a constant column; confirm before relying on the categories.
# Last expression of the cell: table shown with the largest VIF values first.
vif_df.sort_values(by="VIF", ascending=False)


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss


Unnamed: 0,feature,VIF,VIF_Category
21,VERB,inf,High
19,SCONJ,inf,High
16,PRON,inf,High
15,PART,inf,High
14,NUM,inf,High
...,...,...,...
76,eGeMAPS_logRelF0-H1-H2_sma3nz_stddevNorm,1.228662,Low
64,eGeMAPS_mfcc2_sma3_stddevNorm,1.215338,Low
68,eGeMAPS_mfcc4_sma3_stddevNorm,1.197857,Low
18,PUNCT,,High


In [6]:
# Step 3 — forward feature selection, using the training split only.
# Yields the chosen feature names, a per-step summary table and the final model.
selected_features, summary_df, final_model = forward_selection(
    X_train,
    y_train,
    task_name,
    target,
    output_dir,
)


Added: speech_rate | R²_adj: 0.0421
Added: NOUN/VERB | R²_adj: 0.0488
Added: aoa_average | R²_adj: 0.0564
Added: DET | R²_adj: 0.0614
Added: NUM | R²_adj: 0.0661
Added: hesitation_ratio | R²_adj: 0.0690
Added: filler_word_ratio | R²_adj: 0.0716
Added: eGeMAPS_shimmerLocaldB_sma3nz_amean | R²_adj: 0.0729
Added: pause_ratio | R²_adj: 0.0739
Added: INTJ | R²_adj: 0.0745
No improvement. Stopping.


In [7]:
# 4. view summary
# A bare `summary_df` is rendered only when it is the LAST expression of a cell.
# With the print() below following it, the selection table was silently dropped
# (the recorded output contains just the OLS summary). display() shows it
# explicitly; the IPython/Jupyter kernel provides `display` as a builtin.
display(summary_df)
# print() keeps the plain-text rendering of the OLS regression summary.
print(final_model.summary())

                             OLS Regression Results                             
Dep. Variable:     PhonemicFluencyScore   R-squared:                       0.087
Model:                              OLS   Adj. R-squared:                  0.075
Method:                   Least Squares   F-statistic:                     7.154
Date:                  Fri, 02 May 2025   Prob (F-statistic):           7.88e-11
Time:                          22:01:00   Log-Likelihood:                -2228.6
No. Observations:                   765   AIC:                             4479.
Df Residuals:                       754   BIC:                             4530.
Df Model:                            10                                         
Covariance Type:              nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------

In [8]:
# Step 5 — evaluate the selected feature subset on the held-out test split.
# NOTE(review): in the recorded run the test R² is about -0.002 while the training
# adj. R² is 0.075, i.e. the selected model does not beat a mean predictor on
# unseen data — worth stating next to the result.
model_final, y_pred_final, test_metrics = evaluate_on_test_set(
    X_train,
    y_train,
    X_test,
    y_test,
    selected_features,
    task_name,
    target,
    output_dir,
)


final test evaluation:
       task               target       R2     RMSE     MAE
cookieTheft PhonemicFluencyScore -0.00223 4.527362 3.62035
saved to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_selection/model_test_evaluation_cookieTheft_PhonemicFluencyScore.csv
prediction plot saved to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_selection/prediction_plot_cookieTheft_PhonemicFluencyScore.png
combined train+test prediction plot saved to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_selection/prediction_plot_train_test_cookieTheft_PhonemicFluencyScore.png
