## Stroke Work
Author: Daniel Maina Nderitu<br>
Project: MADIVA<br>
Purpose: Prevalence analysis<br>
Notes:   Prevalence ≠ Incidence

#### Bootstrap cell

In [3]:
# =================== BOOTSTRAP CELL ===================
# Standard setup for all notebooks
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parents[0]  # assumes notebooks are in a subfolder
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# ========================================================
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from src.config.variables import COVARIATES

# ========================================================
# Optional for warnings and nicer plots
import warnings
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")

import sys
from pathlib import Path

# ========================================================
# 1️⃣ Ensure project root is in Python path
# Adjust this if your notebooks are nested deeper
PROJECT_ROOT = Path.cwd().parents[0]  # assumes notebooks are in a subfolder
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# ========================================================
# 2️⃣ Import helper to load paths
from src.utils.helpers import load_paths

# ========================================================
# 3️⃣ Load paths from config.yaml (works regardless of notebook location)
paths = load_paths()

# ========================================================
# 4️⃣ Optionally, print paths to confirm
for key, value in paths.items():
    print(f"{key}: {value}")

# ========================================================
# 5️⃣ Now you can use these paths in your notebook:
# Example:
DATA_DIR = paths['DATA_DIR']
OUT_DIR = paths['OUT_DIR']
FIG_DIR = paths['FIG_DIR']

# ========================================================

BASE_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work
DATA_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\data
OUT_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\model_output
FIG_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\visualization


### Import data - from previous step

In [None]:
# data saved as pickle:
df = pd.read_pickle(OUT_DIR / "df_step05_processed.pkl")

#### Logistic Regression

In [27]:
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
X = df[COVARIATES]
y = df['stroke_status_derived']


# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
non_numeric = X.select_dtypes(exclude=['number']).columns
print("Non-numeric columns:", list(non_numeric))

# -----------------------------------------------------------------------------
# Convert boolean columns to integers
# -----------------------------------------------------------------------------
bool_cols = X.select_dtypes(include=['bool']).columns
X[bool_cols] = X[bool_cols].astype(int)

# -----------------------------------------------------------------------------
# Convert to numeric (if any column is still object type)
# -----------------------------------------------------------------------------
X = X.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')

# -----------------------------------------------------------------------------
# Drop rows with missing values that result from coercion
# -----------------------------------------------------------------------------
X = X.dropna()
y = y.loc[X.index]   # Using indices ensures X and y stay aligned

# -----------------------------------------------------------------------------
# add constant term for intercept
# -----------------------------------------------------------------------------
X = sm.add_constant(X) 

# -----------------------------------------------------------------------------
# Logistic Regression
# -----------------------------------------------------------------------------
model = sm.Logit(y, X).fit()  # model = sm.OLS(y, X).fit() # OLS - Ordinary Least Squares

# -----------------------------------------------------------------------------
# The standard summary
# -----------------------------------------------------------------------------
print(model.summary())

Non-numeric columns: []
Optimization terminated successfully.
         Current function value: 0.109847
         Iterations 12
                             Logit Regression Results                            
Dep. Variable:     stroke_status_derived   No. Observations:                28650
Model:                             Logit   Df Residuals:                    28641
Method:                              MLE   Df Model:                            8
Date:                   Tue, 27 Jan 2026   Pseudo R-squ.:                 0.01100
Time:                           11:48:04   Log-Likelihood:                -3147.1
converged:                          True   LL-Null:                       -3182.1
Covariance Type:               nonrobust   LLR p-value:                 4.954e-12
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                

#### ORs + CIs

In [28]:
# =================================================================================
print("\n" + "="*80)
print("ODDS RATIOS (exp(coef)) WITH 95% CONFIDENCE INTERVALS")
print("="*80)
# =================================================================================

# Calculating Odds Ratios (OR) and confidence intervals (CI)
import numpy as np

# Get coefficients and confidence intervals
coef = model.params
conf = model.conf_int()  # 95% CI by default
conf.columns = ['CI_lower', 'CI_upper']

# Calculating ORs
odds_ratios = np.exp(coef)
or_ci_lower = np.exp(conf['CI_lower'])
or_ci_upper = np.exp(conf['CI_upper'])

# Creating a clean DataFrame for presentation
results_df = pd.DataFrame({
    'Coef (log-odds)': coef,
    'Odds Ratio': odds_ratios,
    'OR 95% CI Lower': or_ci_lower,
    'OR 95% CI Upper': or_ci_upper,
    'p-value': model.pvalues
})

# Formatting for better readability
pd.set_option('display.float_format', lambda x: f'{x:.4f}')
print(results_df[['Odds Ratio', 'OR 95% CI Lower', 'OR 95% CI Upper', 'p-value']])


ODDS RATIOS (exp(coef)) WITH 95% CONFIDENCE INTERVALS
                               Odds Ratio  OR 95% CI Lower  OR 95% CI Upper  p-value
const                              0.0226           0.0185           0.0276   0.0000
sex_binary                         1.0206           0.8687           1.1991   0.8042
alcohol_use                        0.9998           0.9996           1.0000   0.0373
tobacco_use                        0.9986           0.9977           0.9995   0.0027
hpt_status_derived                 1.2204           1.0456           1.4244   0.0116
diab_status_derived                0.8183           0.7012           0.9551   0.0110
bmi_category_Overweight_Obese      1.1275           0.9623           1.3212   0.1378
hiv_status_derived                 0.9997           0.9994           1.0001   0.1074
site_Nairobi                       1.3022           1.0804           1.5697   0.0056


#### Marginal effects

In [29]:
# =================================================================================
# Getting change in probability
# =================================================================================  
print("\n" + "="*80)
print("Average Marginal Effects (AME) - change in probability")
print("="*80)
# Average Marginal Effects (AME) - change in probability
margeff = model.get_margeff()
print(margeff.summary())

# This gives us the average change in probability
# e.g., hypertension increases stroke probability by X%


Average Marginal Effects (AME) - change in probability
         Logit Marginal Effects         
Dep. Variable:     stroke_status_derived
Method:                             dydx
At:                              overall
                                   dy/dx    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
sex_binary                        0.0005      0.002      0.248      0.804      -0.003       0.004
alcohol_use                   -4.702e-06   2.26e-06     -2.077      0.038   -9.14e-06   -2.66e-07
tobacco_use                   -3.246e-05   1.09e-05     -2.977      0.003   -5.38e-05   -1.11e-05
hpt_status_derived                0.0045      0.002      2.515      0.012       0.001       0.008
diab_status_derived              -0.0046      0.002     -2.532      0.011      -0.008      -0.001
bmi_category_Overweight_Obese     0.0027      0.002      1.482      0.138      -0.001       0.

#### Interpretation text

In [30]:
# =================================================================================
# Creating a more clinical presentation
# =================================================================================
print("\n" + "="*80)
print("CLINICALLY RELEVANT INTERPRETATION")
print("="*80)

# Filter for variables of interest (excluding constant)
vars_of_interest = [col for col in results_df.index if col != 'const']

for var in vars_of_interest:
    or_val = results_df.loc[var, 'Odds Ratio']
    ci_low = results_df.loc[var, 'OR 95% CI Lower']
    ci_high = results_df.loc[var, 'OR 95% CI Upper']
    p_val = results_df.loc[var, 'p-value']
    
    direction = "increases" if or_val > 1 else "decreases"
    sig_star = " **" if p_val < 0.05 else ""
    
    print(f"{var:{30}} OR = {or_val:.3f} ({ci_low:.3f}-{ci_high:.3f}){sig_star}")
    if p_val < 0.05:
        print(f"  → {direction} odds of stroke by {abs(or_val-1)*100:.1f}%")


CLINICALLY RELEVANT INTERPRETATION
sex_binary                     OR = 1.021 (0.869-1.199)
alcohol_use                    OR = 1.000 (1.000-1.000) **
  → decreases odds of stroke by 0.0%
tobacco_use                    OR = 0.999 (0.998-1.000) **
  → decreases odds of stroke by 0.1%
hpt_status_derived             OR = 1.220 (1.046-1.424) **
  → increases odds of stroke by 22.0%
diab_status_derived            OR = 0.818 (0.701-0.955) **
  → decreases odds of stroke by 18.2%
bmi_category_Overweight_Obese  OR = 1.128 (0.962-1.321)
hiv_status_derived             OR = 1.000 (0.999-1.000)
site_Nairobi                   OR = 1.302 (1.080-1.570) **
  → increases odds of stroke by 30.2%


#### End

In [42]:
# Saved as pickle (faster for large data, preserves types)
df.to_pickle(OUT_DIR / "df_step06_processed.pkl")
X.to_pickle(OUT_DIR / "X_step06_model_matrix.pkl")
y.to_pickle(OUT_DIR / "y_step06_event.pkl")  # y = df.loc[X.index, "stroke_status_derived"]       # y = df.loc[X.index, "event"]