In [8]:
# setup
import sys
import os
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Add project root to path
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

from config.constants import GIT_DIRECTORY
from regression.train_regression_models import train_and_evaluate_regression_model

# Set task name
task_name = "cookieTheft"

# Run regression and get all relevant variables
model, X_scaled, y, X_train, X_test, y_train, y_test = run_multiple_regression(
    features_path=os.path.join(GIT_DIRECTORY, f"results/features/{task_name}.csv"),
    scores_path=os.path.join(GIT_DIRECTORY, "resources/language_scores_all_subjects.csv"),
    target="PhonemicFluencyScore",
    output_dir=os.path.join(GIT_DIRECTORY, "results/regression"),
    task_name=task_name,
    save_outputs=False
)


def calculate_vif(X_train):
    """Calculates Variance Inflation Factor (VIF) for standardized features."""
    vif_data = pd.DataFrame()
    vif_data["feature"] = X_train.columns
    vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
    return vif_data


In [9]:
calculate_vif(X_train)

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss


Unnamed: 0,feature,VIF
0,n_words,82.616738
1,ttr,6.390907
2,mattr,3.447529
3,filler_word_ratio,11.098140
4,concreteness_score,3.752399
...,...,...
122,eGeMAPS_MeanVoicedSegmentLengthSec,35.235714
123,eGeMAPS_StddevVoicedSegmentLengthSec,38.766628
124,eGeMAPS_MeanUnvoicedSegmentLength,28.838103
125,eGeMAPS_StddevUnvoicedSegmentLength,10.474527


In [10]:
vif_df = calculate_vif(X_train)

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss


In [11]:
# low multicollinearity
vif_df[vif_df["VIF"] < 5]

Unnamed: 0,feature,VIF
2,mattr,3.447529
4,concreteness_score,3.752399
5,aoa_average,2.655671
28,CCONJ/SCONJ,3.812062
36,avg_pause_duration,2.909618
62,eGeMAPS_mfcc1_sma3_stddevNorm,1.585749
64,eGeMAPS_mfcc2_sma3_stddevNorm,1.215338
66,eGeMAPS_mfcc3_sma3_stddevNorm,1.39522
68,eGeMAPS_mfcc4_sma3_stddevNorm,1.197857
70,eGeMAPS_jitterLocal_sma3nz_stddevNorm,3.600845


In [12]:
# high multicollinearity
vif_df[vif_df["VIF"] > 10]


Unnamed: 0,feature,VIF
0,n_words,82.616738
3,filler_word_ratio,11.098140
6,ADJ,inf
7,ADP,inf
8,ADV,inf
...,...,...
121,eGeMAPS_VoicedSegmentsPerSec,14.700245
122,eGeMAPS_MeanVoicedSegmentLengthSec,35.235714
123,eGeMAPS_StddevVoicedSegmentLengthSec,38.766628
124,eGeMAPS_MeanUnvoicedSegmentLength,28.838103


In [13]:
# group VIF into categories

def categorize_vif(vif_value):
    if vif_value < 5:
        return "Low"
    elif vif_value < 10:
        return "Moderate"
    else:
        return "High"

vif_df["VIF_Category"] = vif_df["VIF"].apply(categorize_vif)
vif_df.sort_values("VIF", ascending=False)



Unnamed: 0,feature,VIF,VIF_Category
21,VERB,inf,High
19,SCONJ,inf,High
16,PRON,inf,High
15,PART,inf,High
14,NUM,inf,High
...,...,...,...
76,eGeMAPS_logRelF0-H1-H2_sma3nz_stddevNorm,1.228662,Low
64,eGeMAPS_mfcc2_sma3_stddevNorm,1.215338,Low
68,eGeMAPS_mfcc4_sma3_stddevNorm,1.197857,Low
18,PUNCT,,High
