In [1]:
# setup
import sys
import os
import pandas as pd

sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")
from config.constants import GIT_DIRECTORY
from regression.train_regression_models import train_and_evaluate_regression_model
from feature_selection.feature_selection_functions import (
    compute_correlation_matrix,
    calculate_vif,
    forward_selection,
    evaluate_on_test_set
)

# parameters
task_name = "cookieTheft"
target = "PhonemicFluencyScore"
output_dir = os.path.join(GIT_DIRECTORY, "results/feature_selection")

# load standardized features and target
model, X_scaled, y, X_train, X_test, y_train, y_test = run_multiple_regression(
    features_path=os.path.join(GIT_DIRECTORY, f"results/features/{task_name}.csv"),
    scores_path=os.path.join(GIT_DIRECTORY, "resources/language_scores_all_subjects.csv"),
    target=target,
    output_dir=os.path.join(GIT_DIRECTORY, "results/feature_selection"),
    task_name=task_name,
    save_outputs=False
)

In [2]:
# 1. correlation matrix
corr_matrix = compute_correlation_matrix(X_scaled, y, task_name, target, output_dir)


Saved correlation matrix CSV to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_selection/correlation_matrix_cookieTheft_PhonemicFluencyScore.csv
Saved correlation matrix plot to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_selection/full_correlation_matrix_cookieTheft_PhonemicFluencyScore.png


In [3]:
# 2. VIF
vif_df = calculate_vif(X_train)
vif_df.sort_values("VIF", ascending=False)
# Optional: remove high-VIF features manually or log them


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss


Unnamed: 0,feature,VIF,VIF_Category
21,VERB,inf,High
19,SCONJ,inf,High
16,PRON,inf,High
15,PART,inf,High
14,NUM,inf,High
...,...,...,...
76,eGeMAPS_logRelF0-H1-H2_sma3nz_stddevNorm,1.228662,Low
64,eGeMAPS_mfcc2_sma3_stddevNorm,1.215338,Low
68,eGeMAPS_mfcc4_sma3_stddevNorm,1.197857,Low
18,PUNCT,,High


In [4]:
# 3. forward selection
selected_features, summary_df, final_model = forward_selection(
    X_train, y_train, task_name, target, output_dir
)


Added: speech_rate | R²_adj: 0.0320
Added: filler_word_ratio | R²_adj: 0.0441
Added: VERB | R²_adj: 0.0560
Added: eGeMAPS_StddevUnvoicedSegmentLength | R²_adj: 0.0643
Added: eGeMAPS_loudness_sma3_pctlrange0-2 | R²_adj: 0.0745
Added: eGeMAPS_loudness_sma3_meanRisingSlope | R²_adj: 0.0833
Added: eGeMAPS_F2bandwidth_sma3nz_stddevNorm | R²_adj: 0.0919
Added: eGeMAPS_HNRdBACF_sma3nz_stddevNorm | R²_adj: 0.1010
Added: eGeMAPS_F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope | R²_adj: 0.1059
Added: eGeMAPS_loudness_sma3_meanFallingSlope | R²_adj: 0.1121
Added: eGeMAPS_F1frequency_sma3nz_stddevNorm | R²_adj: 0.1191
Added: OPEN/CLOSED | R²_adj: 0.1216
Added: eGeMAPS_slopeV500-1500_sma3nz_amean | R²_adj: 0.1236
Added: eGeMAPS_mfcc2V_sma3nz_amean | R²_adj: 0.1276
Added: eGeMAPS_shimmerLocaldB_sma3nz_stddevNorm | R²_adj: 0.1295
Added: eGeMAPS_mfcc3_sma3_amean | R²_adj: 0.1310
Added: eGeMAPS_slopeV0-500_sma3nz_amean | R²_adj: 0.1347
Added: eGeMAPS_slopeUV0-500_sma3nz_amean | R²_adj: 0.1382
Added: eGe

In [5]:
# 4. view summary
summary_df
print(final_model.summary())

                             OLS Regression Results                             
Dep. Variable:     PhonemicFluencyScore   R-squared:                       0.220
Model:                              OLS   Adj. R-squared:                  0.166
Method:                   Least Squares   F-statistic:                     4.090
Date:                  Thu, 24 Apr 2025   Prob (F-statistic):           4.08e-14
Time:                          16:50:49   Log-Likelihood:                -1701.8
No. Observations:                   607   AIC:                             3484.
Df Residuals:                       567   BIC:                             3660.
Df Model:                            39                                         
Covariance Type:              nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------

In [6]:
# 5. evaluate model

model_final, y_pred_final, test_metrics = evaluate_on_test_set(
    X_train, y_train, X_test, y_test,
    selected_features,
    task_name, target,
    output_dir
)


final test evaluation:
       task               target        R2    RMSE      MAE
cookieTheft PhonemicFluencyScore -0.151444 4.75913 3.799522
saved to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_selection/model_test_evaluation_cookieTheft_PhonemicFluencyScore.csv
prediction plot saved to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_selection/prediction_plot_cookieTheft_PhonemicFluencyScore.png
combined train+test prediction plot saved to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_selection/prediction_plot_train_test_cookieTheft_PhonemicFluencyScore.png
