## Stroke Work
<br>Author: Daniel Maina Nderitu<br>
Project: MADIVA
Purpose: Make analysis-ready covariates

#### Bootstrap cell

In [1]:
# =================== BOOTSTRAP CELL ===================
# Standard setup for all notebooks
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parents[0]  # assumes notebooks are in a subfolder
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# ========================================================
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.config.variables import COVARIATES
from src.config.variables import KEY_PREDICTORS

# ========================================================
# Optional for warnings and nicer plots
import warnings
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")

import sys
from pathlib import Path

# ========================================================
# 1️⃣ Ensure project root is in Python path
# Adjust this if your notebooks are nested deeper
PROJECT_ROOT = Path.cwd().parents[0]  # assumes notebooks are in a subfolder
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# ========================================================
# 2️⃣ Import helper to load paths
from src.utils.helpers import load_paths

# ========================================================
# 3️⃣ Load paths from config.yaml (works regardless of notebook location)
paths = load_paths()

# ========================================================
# 4️⃣ Optionally, print paths to confirm
for key, value in paths.items():
    print(f"{key}: {value}")

# ========================================================
# 5️⃣ Now you can use these paths in your notebook:
# Example:
DATA_DIR = paths['DATA_DIR']
OUT_DIR = paths['OUT_DIR']
FIG_DIR = paths['FIG_DIR']

# ========================================================

BASE_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work
DATA_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\data
OUT_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\model_output
FIG_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\visualization
MODEL_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\model_output\statsmodels
NOTEBOOKS_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\notebooks
NOTEBOOKS_EXECUTED_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\notebooks_executed


### Import data - from previous step

In [2]:
# data saved as pickle:
df = pd.read_pickle(OUT_DIR / "df_step02_processed.pkl")

### Data preprocessing 

#### Projects Name Map

In [3]:
name_map = {
    "SCALEUP Population Baseline": "SCALEUP_Pop_Baseline",
    "Assess Linkages Main": "Assess_Linkages_Main",
    "HAALSI_1": "HAALSI_1",
    "HIV_NCD": "HIV_NCD",
    "AWIGEN_1": "AWIGEN_1",
    "HAALSI_2": "HAALSI_2",
    "Nkateko_1": "Nkateko_1",
    "HAALSI_3": "HAALSI_3",
    "Nkateko_2": "Nkateko_2",
    "ARKStudyPhase_1": "ARK_1",
    "ARKStudyPhase_2": "ARK_2",
    "AWIGEN_2": "AWIGEN_2",
    "SCALEUP Clinic Baseline": "SCALEUP_Clinic_Baseline",
    "Diabetics Baseline": "Diabetics_Baseline",
    "Diabetics Followup": "Diabetics_Followup"
}

# Rename
df['source'] = df['source'].replace(name_map)

# Make it categorical
df['source'] = df['source'].astype('category')

In [4]:
df['source'].head()

0    Nkateko_1
1        ARK_1
2        ARK_2
3      HIV_NCD
4    Nkateko_1
Name: source, dtype: category
Categories (15, object): ['ARK_1', 'ARK_2', 'AWIGEN_1', 'AWIGEN_2', ..., 'Nkateko_1', 'Nkateko_2', 'SCALEUP_Clinic_Baseline', 'SCALEUP_Pop_Baseline']

#### Record type, Gender, BMI processing, and type conversion

In [5]:
# ------------------------------------------------------------------------------------
# Drop individuals with only one record
# ------------------------------------------------------------------------------------
df = df.loc[df['record_type']==2].copy()
print(df.shape)
print(df.sex.value_counts())
df['sex_binary'] = df['sex'].replace({1: 0, 2: 1})  # 0 = male, 1 = female
df['bmi_refined'] = pd.to_numeric(df['bmi_refined'], errors='coerce')
# ------------------------------------------------------------------------------------
# # BMI based on WHO categories
# ------------------------------------------------------------------------------------
# df['bmi_category'] = pd.cut(df['bmi_refined'], 
#                             bins=[0, 18.5, 24.9, 29.9, np.inf], 
#                             labels=['Underweight', 'Normal', 'Overweight', 'Obese'])
# Creating only two categories (making it dichotomous)
df['bmi_category'] = pd.cut(df['bmi_refined'], 
                            bins=[0, 24.9, np.inf], 
                            labels=['Normal_Underweight', 'Overweight_Obese'])

df = pd.get_dummies(df, columns=['bmi_category'], drop_first=True)

(30146, 339)
sex
2    16391
1    13755
Name: count, dtype: int64


#### Site & Source/study dummies (ONE-HOT ENCODING)

In [6]:
# ------------------------------------------------------------------------------------
# Making sure 'hdss_name' exists and is a string
# ------------------------------------------------------------------------------------
df['hdss_name'] = df['hdss_name'].astype(str)

# ------------------------------------------------------------------------------------
# # Study
# ------------------------------------------------------------------------------------
# df['source'] = df['source'].astype('category')
# df = pd.get_dummies(df, columns=['source'], drop_first=True)
 
# One-hot encode site (drop one to avoid multicollinearity) - Agincourt reference
site_dummies = pd.get_dummies(df['hdss_name'], prefix='site', drop_first=True)
print(site_dummies.columns.tolist())

# If both created, drop 'site_Agincourt' to make it reference
if 'site_Nairobi' in site_dummies.columns:
    if 'site_Agincourt' not in site_dummies.columns:
        # make sure Nairobi present; handle gracefully if not
        pass
    site_cols = [c for c in site_dummies.columns if c != 'site_Nairobi']
else:
    site_cols = [c for c in site_dummies.columns]  # in case naming differs

# Merge dummy columns into df
df = pd.concat([df, site_dummies], axis=1)
# df['site_Nairobi'] = df['site_Nairobi'].astype('category')
# df['site_Agincourt'] = df['site_Agincourt'].astype('category')


print("Site dummy variables created:", list(site_dummies.columns))

['site_Nairobi']
Site dummy variables created: ['site_Nairobi']


#### Boolean conversion

In [7]:
# ------------------------------------------------------------------------------------
# Convert all boolean columns to integers (0/1)
# ------------------------------------------------------------------------------------
df = df.astype({col: int for col in df.select_dtypes(bool).columns})
df = df.replace([np.inf, -np.inf], np.nan)
print(df.sex_binary.value_counts()) 

sex_binary
1    16391
0    13755
Name: count, dtype: int64


#### Integer conversion

In [8]:
# ------------------------------------------------------------------------------------
# Converts to integer type
# ------------------------------------------------------------------------------------
df.bmi_category_Overweight_Obese = df.bmi_category_Overweight_Obese.astype(int)
df['site_Nairobi'] = df['site_Nairobi'].astype(int)

#### Date conversion

In [9]:
# ------------------------------------------------------------------------------------
# Convert obs_date from string (e.g., '26jul2008') to datetime
# ------------------------------------------------------------------------------------
df['obs_date'] = pd.to_datetime(df['obs_date'], format='%d%b%Y', errors='coerce')

#### Study Periods—Start and End for Projects

In [10]:
# 1. Get study start and end per project (source)
study_periods = (
    df.groupby('source', observed=True)['obs_date']
      .agg(study_start='min', study_end='max')
      .reset_index()
)

# 2. Merge study periods back to original data
df = df.merge(study_periods, on='source', how='left')

# Create dummy variables (one-hot encode)
df = pd.get_dummies(df, columns=['source'], drop_first=True)

In [11]:
# df.head()

Unnamed: 0,individual_id,age,sex,hdss_name,alco_ever,alco_12m,alco_30d,alco_bing_y,tobac_ever,tobac_cur,...,source_Diabetics_Baseline,source_Diabetics_Followup,source_HAALSI_1,source_HAALSI_2,source_HAALSI_3,source_HIV_NCD,source_Nkateko_1,source_Nkateko_2,source_SCALEUP_Clinic_Baseline,source_SCALEUP_Pop_Baseline
0,BBBHY,33,2,Agincourt,,,,,0.0,888.0,...,False,False,False,False,False,False,False,False,False,False
1,BBBHY,34,2,Agincourt,,,,,999.0,,...,False,False,False,False,False,False,False,False,False,False
2,BBBNE,46,1,Agincourt,1.0,1.0,1.0,,1.0,0.0,...,False,False,False,False,False,True,False,False,False,False
3,BBBNE,49,1,Agincourt,,,,,1.0,0.0,...,False,False,False,False,False,False,True,False,False,False
4,BBBNE,50,1,Agincourt,1.0,,1.0,1.0,1.0,0.0,...,False,False,True,False,False,False,False,False,False,False


#### Covariates list

In [12]:
# covariates = ['sex_binary', 'alcohol_use', 'tobacco_use', 'hpt_status_derived', 'diab_status_derived','bmi_category_Overweight_Obese'
#                ,'hiv_status_derived', 'site_Nairobi'] #  + list(site_dummies.columns) 
# ,'obese_status_derived' # alot of missingness
# tb_status_derived # alot of missingness
# , 'res_hha_wealthtertile_2.0', 'res_hha_wealthtertile_3.0'  # (Will add these once we obtain these data points)
# ,'bmi_category_Normal', 'bmi_category_Overweight', 'bmi_category_Obese'
# print(df.offset.describe())
print(df.shape)

(30146, 357)


#### End

In [13]:
# Saved as pickle (faster for large data, preserves types)
df.to_pickle(OUT_DIR / "df_step03_processed.pkl")
study_periods.to_pickle(OUT_DIR / "study_periods.pkl")