## Stroke Work
<br>Author: Daniel Maina Nderitu<br>
Project: MADIVA<br>
Purpose: Survival logic (VERY IMPORTANT)<br>
Notes:   How stroke incidence was constructed

#### Bootstrap cell

In [44]:
# =================== BOOTSTRAP CELL ===================
# Standard setup for all notebooks:
#   1. imports (standard library, third-party, then project-local),
#   2. project root on sys.path so `src` is importable,
#   3. warning / plotting options,
#   4. project directories loaded through `load_paths()`.
# Each import and the sys.path bootstrap appear exactly once.

# --- Standard library -----------------------------------
import os
import sys
import warnings
from pathlib import Path

# --- Third-party ----------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ========================================================
# Ensure the project root is on the Python path BEFORE importing from `src`.
# Assumes the kernel's working directory is one level below the project root;
# adjust the `parents[...]` index if notebooks are nested deeper.
PROJECT_ROOT = Path.cwd().parents[0]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# --- Project-local (needs PROJECT_ROOT on sys.path) ------
from src.config.variables import COVARIATES
from src.utils.helpers import load_paths

# ========================================================
# Warnings and nicer plots.
# NOTE(review): this silences *every* warning, including pandas
# chained-assignment and deprecation warnings; consider narrowing the filter.
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")

# ========================================================
# Load the configured project paths and echo them for confirmation.
paths = load_paths()
for key, value in paths.items():
    print(f"{key}: {value}")

# ========================================================
# Directories used throughout the notebook. Later cells build file paths with
# the `/` operator, so each value is coerced to a Path (a no-op if
# `load_paths()` already returns Path objects).
DATA_DIR = Path(paths['DATA_DIR'])
OUT_DIR = Path(paths['OUT_DIR'])
FIG_DIR = Path(paths['FIG_DIR'])

# ========================================================

BASE_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work
DATA_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\data
OUT_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\model_output
FIG_DIR: D:\APHRC\GoogleDrive_ii\stata_do_files\madiva\stroke_work\visualization


### Import data - from previous step

In [None]:
# Read the frame written by the previous step; it was saved as a pickle.
step03_pickle = OUT_DIR / "df_step03_processed.pkl"
df = pd.read_pickle(step03_pickle)

### Event, time_at_risk (person-months), covariates

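Each individual's records are restricted to follow-up up to their first recorded stroke, so that repeat records of the same stroke are not counted as new events and incidence is not inflated.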

#### obs_date Parsing, Study Periods, Start date and study end

In [45]:
# Quick look at the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,individual_id,age,sex,hdss_name,alco_ever,alco_12m,alco_30d,alco_bing_y,tobac_ever,tobac_cur,tobac_smkls_ever,tobac_p_daily,tobac_freq,fd_fruitservings,fd_meals_outhome_ave,fd_diet_change,fd_fruit_days,fd_veg_days,fd_eat_drink_12h,fd_oil_type,fd_veg_servings,actv_walk_bicycle_days,actv_vigorous_sports_min,actv_mpa_minutes,actv_moderate_sports_hrs,actv_vigorous_sports,actv_mpa_days,actv_vpa_days,actv_sitting_or_standing,actv_walk_bicycle_min,actv_vpa_hrs,actv_walk_bicycle_hrs,actv_walk_bicycle,actv_moderate_sports_days,actv_mpa_hrs,actv_sitting_min,actv_moderate_sports_min,actv_sleeping_hrs,actv_vigorous_sports_days,actv_moderate_sports,actv_vpa_minutes,actv_vpa,actv_mpa,actv_vigorous_sports_hrs,actv_mvpa,actv_sitting_hrs,stroke_ever,stroke_numb,stroke_wkness,stroke_paralysis_ever,stroke_blind,stroke_trans_isc_ever,stroke_parents,stroke_sibling,stroke_other,stroke_children,stroke_fhx,hpt_ever,hpt_rx_ever,hpt_12m,bp_sys,bp_dia,bg_mmol_fst,bg_mmol_random,diab_hx,diab_12m,diab_rx_ever,diab_rx_current,diab_rx_other,diab_rx_2w,diab_rx_12m,diab_rx_trad_curr,diab_rx_other_2w,kidney_rx,pi_acr,stroke_undstn,stroke_verbal,vision_problem,stroke_hl_vis,sex_no_condom_hiv_person,hiv_tested_when_d,hiv_rx_trad_ever,hiv_counselled,hiv_tested,hiv_tested_when_c,hiv_tested_month,hiv_tested_yr,hiv_status_slf_rpt,hiv_test_result,stroke_status_derived,hiv_status_derived,hpt_status_derived,obese_status_derived,diab_status_derived,tb_status_derived,bmi,bmi_refined,is_outlier,alcohol_use,tobacco_use,stroke_status_derived_age,hpt_status_derived_age,diab_status_derived_age,hiv_status_derived_age,stroke_status_derived_incident,hpt_status_derived_incident,diab_status_derived_incident,hiv_status_derived_incident,obs_date,dod,year_stroke,n_records,record_type,site,res_round_intvw,res_round_event,res_datebeg,res_eventdate,res_calendar,res_event,residence,res_ind_ageday,res_ind_ageweek,res_ind_agemonth,res_ind_ageyear,res_ind_agegroup,res_ind_reltohhh,res_ind_gender,res_ind_datebirth,res_i
nd_dateintodsa,res_ind_dateoutofdsa,res_ind_firstevent,res_ind_lastevent,res_ind_birth_areatype,res_ind_agefirstbirth,res_ind_agefirstmarr,res_ind_agefirstsex,res_ind_ethnicity,res_agebeg,res_hhh_datebirth,res_hhh_gender,res_hhh_ethnicity,res_hha_hhadone,res_hha_sourceyear_wealth,res_hha_wealthindex,res_hha_wealthtertile,res_hha_wealthquintile,res_hha_sourceyear_foodsec,res_hha_hhdhungerscale,res_hha_sourceyear_povertyline,res_hha_isbelowpovertyline,edu_everschool,edu_everschool_level,edu_everschool_class,edu_currschool,edu_currschool_level,edu_currschool_class,edu_currschool_type,gbs_preg_dur,gbs_gravidity,gbs_parity,gbs_totalceb,gbs_multiplebirth,iga_hasiga,iga_noigareason,iga_typeofiga,iga_inc30days_cash,iga_inc30days_kind,los_chddth_allsex_anyage,los_chddth_allsex_under1yrs,los_chddth_allsex_under5yrs,los_chddth_allsex_5to9yrs,los_chddth_allsex_under10yrs,los_chddth_allsex_10to14yrs,los_chddth_allsex_15to19yrs,los_chddth_allsex_20to24yrs,los_chddth_allsex_25to29yrs,los_chddth_allsex_30to34yrs,los_chddth_allsex_35to39yrs,los_chddth_allsex_40to44yrs,los_chddth_allsex_45to49yrs,los_chddth_allsex_50to54yrs,los_chddth_allsex_55to59yrs,los_chddth_allsex_60plusyrs,los_chddth_femsex_anyage,los_chddth_femsex_under1yrs,los_chddth_femsex_under5yrs,los_chddth_femsex_5to9yrs,los_chddth_femsex_under10yrs,los_chddth_femsex_10to14yrs,los_chddth_femsex_15to19yrs,los_chddth_femsex_20to24yrs,los_chddth_femsex_25to29yrs,los_chddth_femsex_30to34yrs,los_chddth_femsex_35to39yrs,los_chddth_femsex_40to44yrs,los_chddth_femsex_45to49yrs,los_chddth_femsex_50to54yrs,los_chddth_femsex_55to59yrs,los_chddth_femsex_60plusyrs,los_chddth_malsex_anyage,los_chddth_malsex_under1yrs,los_chddth_malsex_under5yrs,los_chddth_malsex_5to9yrs,los_chddth_malsex_under10yrs,los_chddth_malsex_10to14yrs,los_chddth_malsex_15to19yrs,los_chddth_malsex_20to24yrs,los_chddth_malsex_25to29yrs,los_chddth_malsex_30to34yrs,los_chddth_malsex_35to39yrs,los_chddth_malsex_40to44yrs,los_chddth_malsex_45to49yrs,los_chddth_malse
x_50to54yrs,los_chddth_malsex_55to59yrs,los_chddth_malsex_60plusyrs,los_pardth_allsex_anyage,los_pardth_allsex_10to14yrs,los_pardth_allsex_15to19yrs,los_pardth_allsex_20to24yrs,los_pardth_allsex_25to29yrs,los_pardth_allsex_30to34yrs,los_pardth_allsex_35to39yrs,los_pardth_allsex_40to44yrs,los_pardth_allsex_45to49yrs,los_pardth_allsex_50to54yrs,los_pardth_allsex_55to59yrs,los_pardth_allsex_60plusyrs,los_pardth_femsex_anyage,los_pardth_femsex_10to14yrs,los_pardth_femsex_15to19yrs,los_pardth_femsex_20to24yrs,los_pardth_femsex_25to29yrs,los_pardth_femsex_30to34yrs,los_pardth_femsex_35to39yrs,los_pardth_femsex_40to44yrs,los_pardth_femsex_45to49yrs,los_pardth_femsex_50to54yrs,los_pardth_femsex_55to59yrs,los_pardth_femsex_60plusyrs,los_pardth_malsex_anyage,los_pardth_malsex_10to14yrs,los_pardth_malsex_15to19yrs,los_pardth_malsex_20to24yrs,los_pardth_malsex_25to29yrs,los_pardth_malsex_30to34yrs,los_pardth_malsex_35to39yrs,los_pardth_malsex_40to44yrs,los_pardth_malsex_45to49yrs,los_pardth_malsex_50to54yrs,los_pardth_malsex_55to59yrs,los_pardth_malsex_60plusyrs,los_parindividualid1,los_hhhdth_allsex_anyage,los_hhhdth_allsex_under10yrs,los_hhhdth_allsex_10to14yrs,los_hhhdth_allsex_15to19yrs,los_hhhdth_allsex_20to24yrs,los_hhhdth_allsex_25to29yrs,los_hhhdth_allsex_30to34yrs,los_hhhdth_allsex_35to39yrs,los_hhhdth_allsex_40to44yrs,los_hhhdth_allsex_45to49yrs,los_hhhdth_allsex_50to54yrs,los_hhhdth_allsex_55to59yrs,los_hhhdth_allsex_60plusyrs,los_hhhdth_femsex_anyage,los_hhhdth_femsex_under10yrs,los_hhhdth_femsex_10to14yrs,los_hhhdth_femsex_15to19yrs,los_hhhdth_femsex_20to24yrs,los_hhhdth_femsex_25to29yrs,los_hhhdth_femsex_30to34yrs,los_hhhdth_femsex_35to39yrs,los_hhhdth_femsex_40to44yrs,los_hhhdth_femsex_45to49yrs,los_hhhdth_femsex_50to54yrs,los_hhhdth_femsex_55to59yrs,los_hhhdth_femsex_60plusyrs,los_hhhdth_malsex_anyage,los_hhhdth_malsex_under10yrs,los_hhhdth_malsex_10to14yrs,los_hhhdth_malsex_15to19yrs,los_hhhdth_malsex_20to24yrs,los_hhhdth_malsex_25to29yrs,los_hhhdth_malsex_30to
34yrs,los_hhhdth_malsex_35to39yrs,los_hhhdth_malsex_40to44yrs,los_hhhdth_malsex_45to49yrs,los_hhhdth_malsex_50to54yrs,los_hhhdth_malsex_55to59yrs,los_hhhdth_malsex_60plusyrs,los_hhhindividualid1,los_hhhindividualid2,mar_maritalstatus,par_parentsalive,par_parentsstayinhhd,par_momareatype,par_popareatype,pge_pgo_bth_id,pge_pregid,pge_ispregnant,pge_pregstatus,pge_preg_dur,pge_pregoutcome,pge_adversepregoutcome,pge_multiplebirth,pge_gravidity,pge_parity,pge_totalceb,rel_religionstatus,res_bth_preg_dur,res_bth_delivplace,res_bth_delivassisted,res_bth_multiplebirth,res_pcod_under_final,res_pcodbroad_under_final,res_pcodgeneral_under_final,res_icod5_1,res_icod5broad_1,res_icod5general_1,res_everoutmigrated,censor_isoutmvmt,censor_isinmvmt,censor_birth,censor_death,censor_maternaldeath,censor_out,censor_in,censor_gavebirth,censor_impregnated,censor_conceived,education_level,education_years,marital_status,currently_working,_merge,dup,sex_binary,bmi_category_Overweight_Obese,site_Nairobi,study_start,study_end,source_ARK_2,source_AWIGEN_1,source_AWIGEN_2,source_Assess_Linkages_Main,source_Diabetics_Baseline,source_Diabetics_Followup,source_HAALSI_1,source_HAALSI_2,source_HAALSI_3,source_HIV_NCD,source_Nkateko_1,source_Nkateko_2,source_SCALEUP_Clinic_Baseline,source_SCALEUP_Pop_Baseline
0,BBBHY,33,2,Agincourt,,,,,0.0,888.0,,,888.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,888.0,888.0,888.0,888.0,0.0,0.0,0.0,,122.0,84.0,,5.7,0,,,888.0,,,,,,999.0,0.4,,,,,,,,,1.0,,,,0.0,999.0,0,0,0,0.0,0,999.0,30.0,30.0,0,888,0.0,33,33,33,33,0,0,0,0,2018-07-25,,2018,2,2,agincourt,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,1,0,2017-11-02,2019-01-11,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,BBBHY,34,2,Agincourt,,,,,999.0,,,,,,,,-999.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,110.667,77.666702,,,0,,,888.0,,,,,,888.0,0.7,,,,,,,,,1.0,,,,,888.0,0,888,0,0.0,0,,28.0,28.0,0,888,999.0,34,34,34,34,0,0,0,0,2019-02-26,,2019,2,2,agincourt,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,1,0,2018-10-25,2019-08-14,True,False,False,False,False,False,False,False,False,False,False,False,False,False
2,BBBNE,46,1,Agincourt,1.0,1.0,1.0,,1.0,0.0,,,,1.0,0.0,,2.0,7.0,,1.0,1.0,6.0,-999.0,-888.0,,0.0,-888.0,5.0,,0.0,4.0,3.0,1.0,,-888.0,4.0,,,-888.0,,0.0,1.0,0.0,-888.0,,4.0,0.0,888.0,,,,,,,,,,888.0,,999.0,112.667,73.666702,,5.2,888,999.0,,,,999.0,999.0,999.0,,,,,,,,0.0,,,,1.0,,11.0,2010.0,,1.0,0,1,0,0.0,0,,,,0,1,1.0,46,46,46,46,0,0,0,1,2011-02-08,,2011,5,2,agincourt,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,2010-08-13,2011-06-10,False,False,False,False,False,False,False,False,False,True,False,False,False,False
3,BBBNE,49,1,Agincourt,,,,,1.0,0.0,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,0.0,0.0,,,129.0,82.666702,,4.9,0,,,,0.0,888.0,,,888.0,,,,,,,,,,,,,,,,,0,1,0,,0,,,,0,888,1.0,49,49,49,46,0,0,0,0,2013-11-08,,2013,5,2,agincourt,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,2013-10-01,2013-12-08,False,False,False,False,False,False,False,False,False,False,True,False,False,False
4,BBBNE,50,1,Agincourt,1.0,,1.0,1.0,1.0,0.0,0.0,,888.0,1.0,-999.0,0.0,1.0,6.0,,,3.0,5.0,-999.0,-999.0,-999.0,1.0,-999.0,-999.0,888.0,-999.0,-999.0,-999.0,1.0,2.0,-999.0,-999.0,,,1.0,1.0,-999.0,888.0,999.0,-999.0,,-999.0,0.0,,,,,,,,,,,1.0,1.0,1.0,123.5,80.5,,8.6,1,1.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,888.0,0.0,,,1.0,1.0,3.0,,,1.0,1.0,0,1,1,0.0,1,1.0,18.937,18.9,0,1,1.0,50,50,50,46,0,1,1,0,2014-12-06,,2014,5,2,agincourt,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,2011-07-19,2015-11-30,False,False,False,False,False,False,True,False,False,False,False,False,False,False


#### time_at_risk, offset

In [46]:
# ---------------------------------------------------------------------------
# Person-time, offset and incident-stroke construction.
#
# Adds to `df`: start_date, end_date, time_at_risk (MONTHS), offset
# (log of time_at_risk), stroke_prev, event, cumulative_stroke; then keeps,
# per individual, only the rows up to and including the first stroke record.
# Also builds the summary objects `counts`, `counts_summary`, `freq_table`
# and `freq_df` describing how many rows each individual contributes.
# ---------------------------------------------------------------------------
DAYS_PER_MONTH = 30.4375    # 365.25 / 12 -> time_at_risk is in months, not years
MIN_TIME_AT_RISK = 0.01     # floor (months) so that log(time_at_risk) is finite

print("Before")
print(df.shape)

# NOTE: a restriction to specific `res_event` codes (entry/exit-type events)
# was considered here but is not applied; all rows are used.

# ---------------------------------------------------------------------------
# Sort by individual and observation date; the shift/cumsum logic below
# relies on this chronological order within each individual.
# ---------------------------------------------------------------------------
df = df.sort_values(['individual_id', 'obs_date'])

# ---------------------------------------------------------------------------
# Risk interval for each row: from that row's observation date to study_end.
# ---------------------------------------------------------------------------
df['start_date'] = df['obs_date']
df['end_date'] = df['study_end']

# ---------------------------------------------------------------------------
# Time at risk (person-time) in months.
# ---------------------------------------------------------------------------
df['time_at_risk'] = (df['end_date'] - df['start_date']).dt.days / DAYS_PER_MONTH

# ---------------------------------------------------------------------------
# Report missingness BEFORE imputing; measuring it after the fill would
# always print 0 and hide how many values were imputed.
# ---------------------------------------------------------------------------
missing_time_count = df['time_at_risk'].isna().sum()
total_count = len(df)
missing_percent = (missing_time_count / total_count) * 100 if total_count else 0.0

print(f"Missing time_at_risk before imputation: {missing_time_count:,} ({missing_percent:.2f}%)")

# Missingness by site. The guard checks the column that is actually grouped on.
if 'hdss_name' in df.columns:
    missing_by_site = (
        df['time_at_risk'].isna().groupby(df['hdss_name']).mean().mul(100).round(2)
    )
    print("\nMissing time_at_risk by site (%):")
    print(missing_by_site)

# ---------------------------------------------------------------------------
# Impute missing person-time with the overall median.
# ---------------------------------------------------------------------------
df['time_at_risk'] = df['time_at_risk'].fillna(df['time_at_risk'].median())

# ---------------------------------------------------------------------------
# Floor very small, zero or negative durations so the log offset is defined.
# Negative values arise when obs_date is later than study_end.
# ---------------------------------------------------------------------------
n_floored = int((df['time_at_risk'] < MIN_TIME_AT_RISK).sum())
print(f"Rows with time_at_risk below {MIN_TIME_AT_RISK} months (floored): {n_floored:,}")
df['time_at_risk'] = df['time_at_risk'].clip(lower=MIN_TIME_AT_RISK)

# ---------------------------------------------------------------------------
# Offset = log(person-time in months). It is negative whenever time_at_risk
# is below one month; that is expected. The floor above only guarantees the
# value is finite.
# ---------------------------------------------------------------------------
df['offset'] = np.log(df['time_at_risk'])

# ---------------------------------------------------------------------------
# Stroke event: status goes from 0 on the individual's previous row to 1 on
# this row.
# NOTE(review): stroke_prev is filled with 0 on each individual's first row,
# i.e. nobody is assumed to have had a stroke before their first record, so a
# stroke already present at the first observation is counted as an event.
# ---------------------------------------------------------------------------
df['stroke_prev'] = df.groupby('individual_id')['stroke_status_derived'].shift(1).fillna(0)
df['event'] = np.where((df['stroke_prev'] == 0) & (df['stroke_status_derived'] == 1), 1, 0)

# ---------------------------------------------------------------------------
# Keep only rows before or up to the first stroke (incidence, not prevalence).
# `cumulative_stroke <= 1` alone would also keep rows recorded AFTER the first
# stroke whenever the status reverts to 0, so additionally require that no
# stroke was recorded on any earlier row of the same individual.
# ---------------------------------------------------------------------------
df['cumulative_stroke'] = df.groupby('individual_id')['stroke_status_derived'].cumsum()
strokes_before_row = df['cumulative_stroke'] - df['stroke_status_derived']
up_to_first_stroke = (df['cumulative_stroke'] <= 1) & (strokes_before_row == 0)
df = df[up_to_first_stroke].copy()  # copy: later column assignments hit a real frame

print("After filtering to <= first stroke, rows:", df.shape[0])

# ---------------------------
# How many times is each person represented?
# ---------------------------
counts = df['individual_id'].value_counts()          # rows per person (index = individual_id)
counts_summary = counts.describe()                   # mean, min, max, median, etc.

print("\nRecords per individual (summary):")
print(counts_summary)

# Individuals contributing a single row (single visit).
n_single = (counts == 1).sum()
pct_single = (n_single / counts.shape[0]) * 100

print(f"\nIndividuals with a single record: {n_single} ({pct_single:.1f}%)")

# Frequency distribution: index = number of records, value = number of persons.
# The frame holds every site present in `df`, not only Agincourt.
print("\nTop frequency counts (number of persons with X records):")
freq_table = counts.value_counts().sort_index()
print(freq_table.head(20))  # first 20 rows; increase if needed

# Tidy two-column version of the distribution.
freq_df = freq_table.reset_index(name="n_persons")
freq_df.columns = ['n_records', 'n_persons']  # rename safely

# ---------------------------
# For modelling: `event` is the outcome and `offset` (log of months at risk)
# is the exposure offset.
# ---------------------------


Before
(30146, 357)
Missing time_at_risk before imputation: 0 (0.00%)

Missing time_at_risk by site (%):
hdss_name
Agincourt    0.0
Nairobi      0.0
Name: time_at_risk, dtype: float64
After filtering to <= first stroke, rows: 28822

Records per individual (summary):
count    9525.000000
mean        3.025932
std         1.383764
min         1.000000
25%         2.000000
50%         2.000000
75%         4.000000
max         9.000000
Name: count, dtype: float64

Individuals with a single record: 366 (3.8%)

Top frequency counts (number of Agincourt persons with X records):
count
1     366
2    4407
3    1877
4    1199
5     974
6     581
7     111
8       8
9       2
Name: count, dtype: int64


#### Stroke transition

In [47]:
# IDs of everyone with at least one row flagged as a stroke event (event == 1).
stroke_cases = df.loc[df['event'] == 1, 'individual_id'].unique()

# Every retained row belonging to those individuals.
df_stroke_transition = df[df['individual_id'].isin(stroke_cases)]

# Columns needed to eyeball the transition logic (res_eventdate left out).
selected = [
    'individual_id',
    'start_date',
    'end_date',
    'stroke_prev',
    'event',
    'time_at_risk',
    'cumulative_stroke',
    'offset',
]

# Summarise the first 20 rows in (individual, start date) order.
check_df = (
    df_stroke_transition[selected]
    .sort_values(['individual_id', 'start_date'])
    .head(20)
)
check_df.describe()

Unnamed: 0,start_date,end_date,stroke_prev,event,time_at_risk,cumulative_stroke,offset
count,20,20,20.0,20.0,20.0,20.0,20.0
mean,2012-08-14 12:00:00,2013-01-11 01:12:00,0.0,0.5,4.913347,0.5,1.433005
min,2008-07-22 00:00:00,2009-03-30 00:00:00,0.0,0.0,2.234086,0.0,0.803832
25%,2012-08-10 00:00:00,2012-12-19 00:00:00,0.0,0.0,2.61191,0.0,0.959533
50%,2012-09-29 00:00:00,2012-12-19 00:00:00,0.0,0.5,4.106776,0.5,1.412606
75%,2013-06-02 12:00:00,2014-10-09 18:00:00,0.0,1.0,4.854209,1.0,1.573802
max,2015-08-06 00:00:00,2015-11-30 00:00:00,0.0,1.0,19.778234,1.0,2.984582
std,,,0.0,0.512989,3.82627,0.512989,0.516849


#### End

In [42]:
# Write this step's result as a pickle (fast for large frames, keeps dtypes).
step04_pickle = OUT_DIR / "df_step04_processed.pkl"
df.to_pickle(step04_pickle)