## Stroke Work
<br>Author: Daniel Maina Nderitu<br>
Project: MADIVA<br>
Purpose: Incidence modeling<br>
Notes:   We are comparing Poisson, robust Poisson, and NB models.

#### Bootstrap cell

In [26]:
# =================== BOOTSTRAP CELL ===================
# Standard setup for all notebooks
# ========================================================
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parents[0]  # assumes notebooks are in a subfolder
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# # ========================================================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.config.variables import COVARIATES
# Import helper to load paths
from src.utils.helpers import load_paths


# 3Ô∏è‚É£ Load paths from config.yaml (works regardless of notebook location)
paths = load_paths()



import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

# ========================================================
# Optional for warnings and nicer plots
import warnings
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")

# ========================================================
# 1Ô∏è‚É£ Ensure project root is in Python path
# Adjust this if your notebooks are nested deeper
# ========================================================

# ========================================================

# ========================================================
# 4Ô∏è‚É£ Optionally, print paths to confirm
for key, value in paths.items():
    print(f"{key}: {value}")

# ========================================================
# Using paths in your notebook:
DATA_DIR = paths['DATA_DIR']
OUT_DIR = paths['OUT_DIR']
FIG_DIR = paths['FIG_DIR']
MODEL_DIR = paths["MODEL_DIR"]

# ========================================================

BASE_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work
DATA_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\data
OUT_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\model_output
FIG_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\visualization
NOTEBOOKS_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\notebooks
NOTEBOOKS_EXECUTED_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\notebooks_executed


KeyError: 'MODEL_DIR'

### Import data - from previous step

In [None]:
# -----------------------------------------------------------------------------
# Loading saved data as pickle:
# -----------------------------------------------------------------------------
df = pd.read_pickle(OUT_DIR / "df_step06_processed.pkl")
X = pd.read_pickle(OUT_DIR / "X_step06_model_matrix.pkl")
y = pd.read_pickle(OUT_DIR / "y_step06_event.pkl")

#### prepare_pooled_data()

In [None]:
# =================================================================================  
# Prepare X, y, offset for pooled models
# =================================================================================  

def prepare_pooled_data(df, covariates, event_col='event', offset_col='offset'):
    """
    Prepare data for pooled regression models, handling 'Missing' string values.
    """
    
    # Select only covariates that exist in df (defensive)
    covariates_present = [c for c in covariates if c in df.columns]
    missing_covariates = set(covariates) - set(covariates_present)
    
    if missing_covariates:
        print(f"‚ö†Ô∏è  Warning: Missing covariates: {missing_covariates}")
    print("‚úÖ Covariates used:", covariates_present)
    
    # Create working copies
    X_pooled = df[covariates_present].copy()
    y = df[event_col].copy()
    offset = df[offset_col].copy()
    
    # --- First: Convert 'Missing' strings to NaN ---
    print("üîÑ Converting 'Missing' strings to NaN...")
    
    for col in X_pooled.columns:
        # Check if column contains string 'Missing'
        if X_pooled[col].dtype == 'object':
            missing_count = (X_pooled[col] == 'Missing').sum()
            if missing_count > 0:
                print(f"   {col}: {missing_count} 'Missing' values ‚Üí NaN")
                X_pooled[col] = X_pooled[col].replace('Missing', np.nan)
        
        # Also check for other missing representations
        other_missing_representations = ['missing', 'MISSING', 'Unknown', 'unknown', '']
        if X_pooled[col].dtype == 'object':
            for missing_val in other_missing_representations:
                if missing_val in X_pooled[col].values:
                    count = (X_pooled[col] == missing_val).sum()
                    if count > 0:
                        print(f"   {col}: {count} '{missing_val}' values ‚Üí NaN")
                        X_pooled[col] = X_pooled[col].replace(missing_val, np.nan)
    
    # --- Now handle data types properly ---
    print("üîÑ Converting data types...")
    
    for col in X_pooled.columns:
        if X_pooled[col].dtype == 'bool':
            # Convert pure boolean to int
            X_pooled[col] = X_pooled[col].astype(int)
            print(f"   {col}: bool ‚Üí int")
            
        elif X_pooled[col].dtype == 'object':
            # Try to convert to numeric, which will handle the NaN values properly
            original_dtype = X_pooled[col].dtype
            X_pooled[col] = pd.to_numeric(X_pooled[col], errors='coerce')
            converted_nans = X_pooled[col].isna().sum()
            print(f"   {col}: {original_dtype} ‚Üí numeric ({converted_nans} NaN values)")
            
        else:
            # Ensure numeric type
            original_dtype = X_pooled[col].dtype
            X_pooled[col] = pd.to_numeric(X_pooled[col], errors='coerce')
            if X_pooled[col].dtype != original_dtype:
                print(f"   {col}: {original_dtype} ‚Üí {X_pooled[col].dtype}")
    
    # --- Check missingness percentage after conversion ---
    check_columns = covariates_present + [event_col, offset_col]
    missing_pct = (
        pd.concat([X_pooled, y.rename('event'), offset.rename('offset')], axis=1)
        .isna()
        .mean()
        .sort_values(ascending=False) * 100
    )
    
    print("\nüîé Final percentage of missing values by variable:")
    for var, pct in missing_pct[missing_pct > 0].items():
        print(f"   {var}: {pct:.2f}%")
    
    if missing_pct.max() == 0:
        print("   No missing values found.")
    
    # --- Drop only rows where outcome or offset are missing or invalid ---
    valid_mask = (
        y.notna() & 
        np.isfinite(offset) & 
        (y >= 0)  # Assuming events should be non-negative
    )
    
    n_initial = len(df)
    n_final = valid_mask.sum()
    n_dropped = n_initial - n_final
    
    if n_dropped > 0:
        print(f"üìä Dropped {n_dropped} rows ({n_dropped/n_initial*100:.1f}%) due to missing/invalid outcomes or offsets")
    
    # Apply the mask
    X_pooled = X_pooled.loc[valid_mask]
    y_pooled = y.loc[valid_mask]
    offset_pooled = offset.loc[valid_mask]
    
    # --- Add constant term ---
    X_pooled_const = sm.add_constant(X_pooled, has_constant='add')
    
    print(f"\n‚úÖ Final pooled dataset: {X_pooled_const.shape[0]} rows, {X_pooled_const.shape[1]} columns")
    print(f"   Features: {X_pooled_const.shape[1]-1} covariates + constant")
    
    return X_pooled_const, y_pooled, offset_pooled, covariates_present

# =================================================================================  
# Usage
# =================================================================================  
covariates_present = [c for c in COVARIATES if c in df.columns]
print("Covariates used:", covariates_present)

X_pooled_const, y_pooled, offset_pooled, covariates_used = prepare_pooled_data(df, covariates_present)

# =================================================================================  
# Verify the result
# =================================================================================  
print("\nüîç Sample of processed X_pooled:")
print(X_pooled_const.head())
print(f"\nData types:\n{X_pooled_const.dtypes}")

#### Poisson

In [None]:
df[COVARIATES + ['event', 'offset']].isna().sum()
# np.isinf(df[covariates + ['offset']]).sum()   # Check for infinity

In [None]:
df_clean = df[COVARIATES + ['event', 'offset']].dropna() # dropping rows with missingness

In [None]:
import statsmodels.api as sm
import numpy as np

# =================================================================================  
X = sm.add_constant(df_clean[COVARIATES])
y = df_clean['event']

# =================================================================================  
# =================================================================================  
model_pois = sm.GLM(
    y,
    X,
    family=sm.families.Poisson(),
    offset=df_clean['offset']
).fit()

# =================================================================================  
# =================================================================================  
print(model_pois.summary())
print("\nIRR:")
print(np.exp(model_pois.params))

##### Poisson Model Results

In [None]:
import numpy as np
import pandas as pd

# Print full model summary
print(model_pois.summary())

# Create a tidy summary DataFrame
results_df = pd.DataFrame({
    "Variable": model_pois.params.index,
    "Coef": model_pois.params.values,
    "StdErr": model_pois.bse,
    "z": model_pois.tvalues,
    "P>|z|": model_pois.pvalues,
    "CI_lower": model_pois.conf_int()[0],
    "CI_upper": model_pois.conf_int()[1]
})

# Add Incidence Rate Ratios (IRR)
results_df["IRR"] = np.exp(results_df["Coef"])
results_df["IRR_CI_lower"] = np.exp(results_df["CI_lower"])
results_df["IRR_CI_upper"] = np.exp(results_df["CI_upper"])

# print(results_df)
results_df.to_csv(OUT_DIR / "poisson_model_results_main.csv", index=False)

#### Robust Poisson

In [None]:
# Fit Robust Poisson (same coefficients, larger SEs)
model_robust = sm.GLM(y, X, family=sm.families.Poisson(), offset=df_clean["offset"]).fit(cov_type='HC0')

#### Negative Binomial

In [None]:
model_nb = sm.GLM(y, X, family=sm.families.NegativeBinomial(), offset=df_clean["offset"]).fit()

#### Model comparison

In [None]:
summary = summary_col(
    results=[model_pois, model_robust, model_nb],
    model_names=['Poisson', 'Robust Poisson', 'NegBinomial'],
    stars=True,
    float_format='%0.3f',
    info_dict={'N':lambda x: f"{int(x.nobs)}"}
)
print(summary)

#### IRRs

In [None]:
def extract_irrs(model):
    df_irr = pd.DataFrame({
        "Variable": model.params.index,
        "Coef": model.params.values,
        "StdErr": model.bse,
        "z": model.tvalues,
        "P>|z|": model.pvalues,
        "CI_lower": model.conf_int()[0],
        "CI_upper": model.conf_int()[1]
    })
    df_irr["IRR"] = np.exp(df_irr["Coef"])
    df_irr["IRR_CI_lower"] = np.exp(df_irr["CI_lower"])
    df_irr["IRR_CI_upper"] = np.exp(df_irr["CI_upper"])
    # Add significance stars
    df_irr["sig"] = df_irr["P>|z|"].apply(lambda p: 
                                  "***" if p < 0.001 else 
                                  "**" if p < 0.01 else 
                                  "*" if p < 0.05 else "")
    return df_irr

# =================================================================================  
# =================================================================================  

results_pois = extract_irrs(model_pois)
results_robust = extract_irrs(model_robust)
results_nb = extract_irrs(model_nb)

# =================================================================================  
# --- 4Ô∏è‚É£ Combine results for export ---
# =================================================================================  
all_results = pd.concat([
    results_pois.assign(Model='Poisson'),
    results_robust.assign(Model='Robust Poisson'),
    results_nb.assign(Model='NegBinomial')
])

# =================================================================================  
# Save table to Excel
# =================================================================================  
all_results.to_excel(OUT_DIR / "stroke_model_results_comparison_main.xlsx", index=False)
print("‚úÖ Model comparison results saved to Excel.")

#### Overdispersion

In [None]:
dispersion = model_pois.deviance / model_pois.df_resid
print("Dispersion parameter:", dispersion)

if dispersion > 1.5:
    print("‚ö†Ô∏è Data likely overdispersed ‚Äî Negative Binomial model may be more appropriate.")
else:
    print("‚úÖ Poisson model dispersion acceptable.")

#### End - Saving Models and Data

In [None]:
# Saved as pickle (faster for large data, preserves types)
df.to_pickle(OUT_DIR / "df_step07_processed.pkl")
X.to_pickle(OUT_DIR / "X_step07_model_matrix.pkl")
y.to_pickle(OUT_DIR / "y_step07_event.pkl")

# =================================================================================  
# Saving models
# =================================================================================  
import pickle

with open(MODEL_DIR / "model_pois.pkl", "wb") as f:
    pickle.dump(model_pois, f)

with open(MODEL_DIR / "model_robust.pkl", "wb") as f:
    pickle.dump(model_robust, f)

with open(MODEL_DIR / "model_nb.pkl", "wb") as f:
    pickle.dump(model_nb, f)