In [None]:
# Inspect the regression coefficients and print the full OLS regression summary

# setup
import sys
import os
import pandas as pd
import statsmodels.api as sm

# Add project root to path
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

from config.constants import GIT_DIRECTORY
from regression.multiple_regression import run_multiple_regression

# Task whose feature file is loaded below
task_name = "cookieTheft"

# Keyword arguments for the project regression helper, gathered in one place.
# save_outputs=False presumably keeps the helper from writing results to
# output_dir -- confirm against run_multiple_regression.
regression_kwargs = {
    "features_path": os.path.join(GIT_DIRECTORY, f"results/features/{task_name}.csv"),
    "scores_path": os.path.join(GIT_DIRECTORY, "resources/language_scores_all_subjects.csv"),
    "target": "PhonemicFluencyScore",
    "output_dir": os.path.join(GIT_DIRECTORY, "results/regression"),
    "task_name": task_name,
    "save_outputs": False,
}

# Unpack the seven values the helper returns so later cells can use them
(
    model,
    X_scaled,
    y,
    X_train,
    X_test,
    y_train,
    y_test,
) = run_multiple_regression(**regression_kwargs)


def run_ols_summary(X_train, y_train):
    """Fit a statsmodels OLS model and tabulate its per-term statistics.

    A constant column is added to ``X_train`` via ``sm.add_constant`` before
    fitting, so the table normally contains an intercept row in addition to
    one row per predictor.

    Args:
        X_train: Predictor data. Expected to be a pandas DataFrame, because
            the fitted parameters are read through ``.index`` / ``.values``.
        y_train: Response values aligned with ``X_train``.

    Returns:
        tuple: ``(model_ols, results_df)`` where ``model_ols`` is the fitted
        statsmodels results object and ``results_df`` is a DataFrame with one
        row per model term and the columns ``feature``,
        ``standardized_coefficient``, ``p_value``, ``std_err``, ``t_value``,
        ``conf_low`` and ``conf_high``.

    Note:
        The ``standardized_coefficient`` column holds the raw OLS parameters;
        they are standardized coefficients only if the caller passes
        standardized predictors (presumably the case here -- confirm against
        the upstream scaling step).
    """
    X_train_const = sm.add_constant(X_train)
    model_ols = sm.OLS(y_train, X_train_const).fit()

    # Confidence bounds at statsmodels' default level; rows follow the same
    # order as model_ols.params, so they can be attached positionally.
    conf_int = model_ols.conf_int()

    # Build the table in a single step. Joining the bounds afterwards with
    # pd.concat(axis=1) + reset_index() can lose the "feature" label: the two
    # indexes carry different names ("feature" vs. unnamed), and recent pandas
    # versions then name the restored column "index" instead.
    results_df = pd.DataFrame({
        "feature": model_ols.params.index,
        "standardized_coefficient": model_ols.params.values,
        "p_value": model_ols.pvalues.values,
        "std_err": model_ols.bse.values,
        "t_value": model_ols.tvalues.values,
        "conf_low": conf_int.iloc[:, 0].values,
        "conf_high": conf_int.iloc[:, 1].values,
    })
    return model_ols, results_df


# Fit the statsmodels OLS on X_train / y_train; the helper hands back a
# (fitted results, per-term table) pair
ols_outputs = run_ols_summary(X_train, y_train)
model_ols, results_df = ols_outputs

# Print the full statsmodels regression report
print(model_ols.summary())

# Trailing expression: first rows of the results table (rendered when this
# runs as the last expression of a notebook cell)
results_df.head()
