## Stroke Work
<br>Author: Daniel Maina Nderitu<br>
Project: MADIVA<br>
Purpose: Diagnostics
<br>Notes:   

#### Bootstrap cell

In [3]:
# =================== BOOTSTRAP CELL ===================
# Standard setup for all notebooks
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parents[0]  # assumes notebooks are in a subfolder
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# ========================================================
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

from src.config.variables import COVARIATES

# ========================================================
# Optional for warnings and nicer plots
import warnings
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")

import sys
from pathlib import Path

# ========================================================
# 1️⃣ Ensure project root is in Python path
# Adjust this if your notebooks are nested deeper
PROJECT_ROOT = Path.cwd().parents[0]  # assumes notebooks are in a subfolder
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# ========================================================
# 2️⃣ Import helper to load paths
from src.utils.helpers import load_paths

# ========================================================
# 3️⃣ Load paths from config.yaml (works regardless of notebook location)
paths = load_paths()

# ========================================================
# 4️⃣ Optionally, print paths to confirm
for key, value in paths.items():
    print(f"{key}: {value}")

# ========================================================
# 5️⃣ Now you can use these paths in your notebook:
# Example:
DATA_DIR = paths['DATA_DIR']
OUT_DIR = paths['OUT_DIR']
FIG_DIR = paths['FIG_DIR']

# ========================================================

BASE_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work
DATA_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\data
OUT_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\model_output
FIG_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\visualization


### Import data - from previous step

In [None]:
# -----------------------------------------------------------------------------
# Loading saved data as pickle:
# -----------------------------------------------------------------------------
df = pd.read_pickle(OUT_DIR / "df_step06_processed.pkl")
X = pd.read_pickle(OUT_DIR / "X_step06_model_matrix.pkl")
y = pd.read_pickle(OUT_DIR / "y_step06_event.pkl")

#### Correlation Matrix
Pearson correlation - gives pairwise collinearity

In [58]:
# -----------------------------------------------------------------------------
# Checking correlation matrix
# -----------------------------------------------------------------------------
correlation_matrix = X.corr()

# -----------------------------------------------------------------------------
# Calculating Variance Inflation Factor (VIF) for multicollinearity
# -----------------------------------------------------------------------------
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# -----------------------------------------------------------------------------
# Checking diabetes-hypertension interaction
# -----------------------------------------------------------------------------
# Add interaction term
X['hpt_diab_interaction'] = X['hpt_status_derived'] * X['diab_status_derived']

#### VIF
VIF - gives multivariate collinearity<br>
VIF = 1 → no multicollinearity<br>
VIF     2–5 → mild correlation<br>
VIF > 5 (or 10) → serious multicollinearity

In [59]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ============================================================
# Add constant for VIF calculation (temporary)
# -----------------------------------------------------------------------------
X_for_vif = X.copy() if 'const' not in X.columns else X.drop('const', axis=1)
X_for_vif = sm.add_constant(X_for_vif)

# ============================================================
# Calculate Variance Inflation Factor (VIF)
# -----------------------------------------------------------------------------
vif_data = pd.DataFrame()                  # Creating an empty dataframe for results
vif_data["Variable"] = X_for_vif.columns   # 

# ============================================================
# The core loop
# -----------------------------------------------------------------------------
vif_data["VIF"] = [variance_inflation_factor(X_for_vif.values, i) 
                   for i in range(X_for_vif.shape[1])]

# ============================================================
# Sorting by highest VIF (Most problematic at the top)
# Which predictor is most collinear (removal or transformation)
# -----------------------------------------------------------------------------
vif_data = vif_data.sort_values("VIF", ascending=False)   

# ============================================================
# Pearson Correlation matrix
# -----------------------------------------------------------------------------
corr_matrix = X.drop('const', axis=1, errors='ignore').corr()

# ============================================================
print("="*60)
print("VIF ANALYSIS")
print("="*60)
print(vif_data.to_string())
print("\n")

# ============================================================
print("="*60)
print("CORRELATION MATRIX (Top Correlations > 0.3)")
print("="*60)

# ============================================================
# Display only strong correlations
# -----------------------------------------------------------------------------
corr_pairs = corr_matrix.unstack()
strong_corr = corr_pairs[(abs(corr_pairs) > 0.3) & (corr_pairs < 1)]
if not strong_corr.empty:
    print(strong_corr.sort_values(ascending=False))
else:
    print("No correlations > 0.3")

VIF ANALYSIS
                        Variable        VIF
9           hpt_diab_interaction 10431.9866
4             hpt_status_derived 10409.6072
0                          const     7.8254
5            diab_status_derived     3.1639
3                    tobacco_use     1.5055
7             hiv_status_derived     1.4873
2                    alcohol_use     1.1595
1                     sex_binary     1.1064
6  bmi_category_Overweight_Obese     1.0860
8                   site_Nairobi     1.0757


CORRELATION MATRIX (Top Correlations > 0.3)
hpt_status_derived    hpt_diab_interaction   0.9999
hpt_diab_interaction  hpt_status_derived     0.9999
diab_status_derived   hpt_diab_interaction   0.8244
hpt_diab_interaction  diab_status_derived    0.8244
hpt_status_derived    diab_status_derived    0.8240
diab_status_derived   hpt_status_derived     0.8240
tobacco_use           hiv_status_derived     0.5548
hiv_status_derived    tobacco_use            0.5548
dtype: float64


#### End

In [42]:
# Saved as pickle (faster for large data, preserves types)
df.to_pickle(OUT_DIR / "df_step07_processed.pkl")
X.to_pickle(OUT_DIR / "X_step07_model_matrix.pkl")
y.to_pickle(OUT_DIR / "y_step07_event.pkl")