## Load and Inspect the Dataset

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Build the path from components so the separator is right on every OS;
# the previous backslash literal only resolved on Windows.
# The path is relative, so it is resolved against the kernel's working directory.
DATA_PATH = Path("HealthAI") / "data" / "cleaned_data.csv"

df_clean = pd.read_csv(DATA_PATH)
print(df_clean.shape)  # print() because only the last expression is auto-displayed
df_clean.head()

(14622, 48)


Unnamed: 0,age,gender,residence_type,admission_type,length_of_stay_days,icu_stay_days,smoking_status,alcohol_use,diabetes_mellitus,hypertension,...,congenital_heart_disease,urinary_tract_infection,neurocardiogenic_syncope,orthostatic_hypotension,infective_endocarditis,deep_vein_thrombosis,cardiogenic_shock,shock,pulmonary_embolism,chest_infection
0,81,M,R,E,3,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,65,M,R,E,5,2,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,53,M,U,E,3,3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,67,F,U,E,8,6,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,60,F,U,E,23,9,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Select Chronic Disease Variables for Association Analysis

In [68]:
# Chronic-disease indicator columns; later cells use this list both as
# association-rule items and to decide which consequents to keep.
binary_cols = [
    'diabetes_mellitus',
    'hypertension',
    'coronary_artery_disease',
    'prior_cardiomyopathy',
    'chronic_kidney_disease',
]


In [69]:
# Sanity check: count the distinct values in each selected disease column
# (a binary indicator should report 2).
df_clean[binary_cols].nunique().sort_values()

diabetes_mellitus          2
hypertension               2
coronary_artery_disease    2
prior_cardiomyopathy       2
chronic_kidney_disease     2
dtype: int64

## Prevalence Analysis of Chronic Diseases

In [70]:
# Column means, highest first. For 0/1 indicators the mean is the share of
# rows flagged 1 (assumes the columns are coded 0/1 — the preview suggests so).
prevalence = df_clean[binary_cols].mean().sort_values(ascending=False)

# Turn the Series into a two-column table: the index becomes 'feature',
# the values become 'prevalence'.
prevalence_df = (
    prevalence
    .rename_axis('feature')
    .reset_index(name='prevalence')
)

prevalence_df

Unnamed: 0,feature,prevalence
0,coronary_artery_disease,0.683764
1,hypertension,0.491109
2,diabetes_mellitus,0.324921
3,prior_cardiomyopathy,0.152783
4,chronic_kidney_disease,0.093968


## Explore Numerical Clinical Variables

In [71]:
# Continuous variables to summarise before any thresholding.
numerical_cols = [
    # demographics and stay duration
    'age', 'length_of_stay_days', 'icu_stay_days',
    # blood counts
    'hemoglobin', 'total_leukocyte_count', 'platelet_count',
    # blood chemistry
    'glucose_level', 'urea_level', 'creatinine_level',
]

# count / mean / std / min / quartiles / max for each selected column
df_clean.loc[:, numerical_cols].describe()

Unnamed: 0,age,length_of_stay_days,icu_stay_days,hemoglobin,total_leukocyte_count,platelet_count,glucose_level,urea_level,creatinine_level
count,14622.0,14622.0,14622.0,14622.0,14622.0,14622.0,14622.0,14622.0,14622.0
mean,61.400492,6.531733,3.845302,12.249415,11.395477,239.776315,162.659773,49.069562,1.323079
std,13.328654,4.8939,3.943466,2.312775,6.915506,102.777575,84.212982,41.054002,1.176924
min,4.0,1.0,0.0,3.0,0.1,1.38,1.2,0.1,0.065
25%,54.0,3.0,1.0,10.7,7.9,173.0,106.0,25.0,0.75
50%,62.0,5.0,3.0,12.4,10.0,226.0,135.0,35.0,0.98
75%,70.0,8.0,5.0,13.9,13.3,289.0,195.0,56.0,1.4
max,110.0,98.0,48.0,26.5,261.0,1111.0,888.0,495.0,15.63


## Clinical Feature Engineering (Binary Indicators)

In [72]:
# Work on a copy so the engineered columns never leak into df_clean.
df_assoc = df_clean.copy()

# Threshold each lab value into a 0/1 flag. The cut-offs are hard-coded here;
# NOTE(review): units and clinical source of the cut-offs are not shown in this
# notebook — confirm. Missing values would compare as False and be coded 0.
df_assoc['glucose_high'] = df_assoc['glucose_level'].ge(126).astype(int)
df_assoc['creatinine_high'] = df_assoc['creatinine_level'].gt(1.3).astype(int)
df_assoc['urea_high'] = df_assoc['urea_level'].gt(40).astype(int)
df_assoc['hb_low'] = df_assoc['hemoglobin'].lt(12).astype(int)

In [73]:
# Demographic 0/1 flags: age of 60 or more, and gender coded as 'M'.
df_assoc['age_ge_60'] = df_assoc['age'].ge(60).astype(int)
df_assoc['male'] = df_assoc['gender'].eq('M').astype(int)

In [74]:
# Row counts for each value of the age >= 60 flag.
df_assoc['age_ge_60'].value_counts()

age_ge_60
1    8793
0    5829
Name: count, dtype: int64

In [75]:
# Row counts for each value of the male flag.
df_assoc['male'].value_counts()

male
1    9255
0    5367
Name: count, dtype: int64

In [76]:
# Derived 0/1 indicator columns created in the cells above.
engineered_cols = [
    'glucose_high', 'creatinine_high', 'urea_high', 'hb_low',  # lab thresholds
    'age_ge_60', 'male',                                       # demographics
]

# Mean of each 0/1 flag, i.e. the share of rows where it is set.
df_assoc.loc[:, engineered_cols].mean()

glucose_high       0.569142
creatinine_high    0.263507
urea_high          0.399740
hb_low             0.422172
age_ge_60          0.601354
male               0.632950
dtype: float64

## Prepare Data for Association Rule Mining

In [77]:
# Items for rule mining: engineered flags first, then the disease indicators.
final_assoc_features = [*engineered_cols, *binary_cols]

# Keep only those item columns.
df_final_assoc = df_assoc.loc[:, final_assoc_features]

In [78]:
# Cast every item column to bool (non-zero -> True) before the frame is
# handed to apriori in a later cell.
df_final_assoc = df_final_assoc.astype(bool)

In [79]:
# Show the boolean item matrix (the rendered output elides the middle rows).
df_final_assoc

Unnamed: 0,glucose_high,creatinine_high,urea_high,hb_low,age_ge_60,male,diabetes_mellitus,hypertension,coronary_artery_disease,prior_cardiomyopathy,chronic_kidney_disease
0,False,False,False,True,True,True,True,False,False,False,False
1,False,False,False,False,True,True,False,True,True,False,False
2,True,True,True,True,False,True,True,False,True,False,False
3,True,False,False,False,True,False,False,True,True,False,False
4,True,False,True,False,True,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
14617,True,True,True,True,True,False,True,True,True,False,False
14618,True,True,True,False,False,True,True,True,False,True,False
14619,True,True,True,True,True,True,False,True,True,False,False
14620,True,False,False,False,False,False,False,True,True,False,False


In [80]:
# Confirm each item column still holds two distinct values after the bool cast.
df_final_assoc.nunique()

glucose_high               2
creatinine_high            2
urea_high                  2
hb_low                     2
age_ge_60                  2
male                       2
diabetes_mellitus          2
hypertension               2
coronary_artery_disease    2
prior_cardiomyopathy       2
chronic_kidney_disease     2
dtype: int64

## Generate Association Rules

In [81]:
from mlxtend.frequent_patterns import apriori, association_rules

# Mine itemsets present in at least 3% of rows; use_colnames=True labels the
# items with the DataFrame's column names.
frequent_itemsets = apriori(df_final_assoc, min_support=0.03, use_colnames=True)


In [82]:
# Itemsets ordered from highest to lowest support.
frequent_itemsets.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
8,0.683764,(coronary_artery_disease)
5,0.632950,(male)
4,0.601354,(age_ge_60)
0,0.569142,(glucose_high)
7,0.491109,(hypertension)
...,...,...
291,0.030023,"(prior_cardiomyopathy, hb_low, diabetes_mellit..."
518,0.030023,"(hb_low, chronic_kidney_disease, diabetes_mell..."
530,0.030023,"(age_ge_60, hypertension, urea_high, glucose_h..."
684,0.030023,"(male, hb_low, hypertension, diabetes_mellitus..."


## Filter Rules by Clinical Outcomes (Chronic Diseases)

In [83]:
# Derive rules from the frequent itemsets, keeping those with confidence >= 0.6.
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)

In [84]:
# Keep a rule only when every item in its consequent is one of the
# chronic-disease indicators listed in binary_cols.
outcome_items = frozenset(binary_cols)
consequent_is_outcome = rules['consequents'].map(
    lambda items: items <= outcome_items  # subset test on the consequent set
)
rules = rules[consequent_is_outcome]

In [85]:
# Inspect the rules left after restricting consequents to the disease indicators.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
8,(glucose_high),(coronary_artery_disease),0.569142,0.683764,0.413076,0.725787,1.061458,1.0,0.023917,1.153249,0.134383,0.491857,0.132885,0.664954
16,(creatinine_high),(coronary_artery_disease),0.263507,0.683764,0.178977,0.679211,0.993341,1.0,-0.001200,0.985806,-0.009020,0.232954,-0.014398,0.470482
21,(urea_high),(coronary_artery_disease),0.399740,0.683764,0.271919,0.680240,0.994845,1.0,-0.001409,0.988977,-0.008558,0.335047,-0.011146,0.538960
25,(hb_low),(coronary_artery_disease),0.422172,0.683764,0.285871,0.677142,0.990316,1.0,-0.002796,0.979490,-0.016642,0.348595,-0.020939,0.547613
30,(age_ge_60),(coronary_artery_disease),0.601354,0.683764,0.439748,0.731264,1.069467,1.0,0.028564,1.176751,0.162939,0.520184,0.150202,0.687196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,"(male, hb_low, age_ge_60, hypertension, urea_h...",(coronary_artery_disease),0.039598,0.683764,0.034332,0.867012,1.267999,1.0,0.007256,2.377929,0.220070,0.049826,0.579466,0.458611
2883,"(hb_low, age_ge_60, coronary_artery_disease, d...",(hypertension),0.050951,0.491109,0.036657,0.719463,1.464976,1.0,0.011635,1.813988,0.334434,0.072530,0.448729,0.397052
2884,"(hb_low, age_ge_60, hypertension, diabetes_mel...",(coronary_artery_disease),0.045753,0.683764,0.036657,0.801196,1.171743,1.0,0.005373,1.590690,0.153598,0.052907,0.371342,0.427403
2892,"(hb_low, male, age_ge_60, coronary_artery_dise...",(hypertension),0.053071,0.491109,0.034605,0.652062,1.327733,1.0,0.008542,1.462590,0.260670,0.067910,0.316281,0.361263


In [86]:
# Drop rules whose lift is not strictly above 1.2.
lift_above_cutoff = rules['lift'].gt(1.2)
rules = rules[lift_above_cutoff]

In [87]:
# Ten highest-lift rules, showing only the headline metrics.
summary_cols = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
(
    rules[summary_cols]
    .sort_values('lift', ascending=False)
    .head(10)
)


Unnamed: 0,antecedents,consequents,support,confidence,lift
965,"(urea_high, chronic_kidney_disease, coronary_a...",(hypertension),0.045821,0.73224,1.490993
1802,"(urea_high, creatinine_high, chronic_kidney_di...",(hypertension),0.044659,0.732063,1.490631
2325,"(chronic_kidney_disease, coronary_artery_disea...",(hypertension),0.032212,0.730233,1.486904
1495,"(urea_high, glucose_high, chronic_kidney_disea...",(hypertension),0.032896,0.729894,1.486215
841,"(creatinine_high, chronic_kidney_disease, coro...",(hypertension),0.045616,0.728962,1.484317
300,"(chronic_kidney_disease, coronary_artery_disease)",(hypertension),0.047052,0.728042,1.482445
1870,"(creatinine_high, hb_low, chronic_kidney_disea...",(hypertension),0.036589,0.727891,1.482137
2632,"(hb_low, coronary_artery_disease, urea_high, c...",(hypertension),0.035905,0.727147,1.480621
1004,"(hb_low, chronic_kidney_disease, coronary_arte...",(hypertension),0.037683,0.726913,1.480145
1345,"(chronic_kidney_disease, creatinine_high, gluc...",(hypertension),0.032759,0.726859,1.480035


In [88]:
# Keep rules that meet all three reporting thresholds. Lift uses a strict `>`
# so this cell agrees with the `lift > 1.2` filter applied to `rules` earlier
# and with the selection criteria stated in the write-up; the previous `>=`
# disagreed with both.
rules_filtered = rules[
    (rules['lift'] > 1.2) &
    (rules['confidence'] >= 0.6) &
    (rules['support'] >= 0.05)
]

# Highest lift first; confidence, then support, break ties.
rules_filtered = rules_filtered.sort_values(
    by=['lift', 'confidence', 'support'],
    ascending=False
)


In [89]:
# (rows, columns) of the rule table after the threshold filter.
rules_filtered.shape

(144, 14)

In [90]:
# Number of retained rules per consequent.
rules_filtered["consequents"].value_counts()

consequents
(hypertension)               97
(coronary_artery_disease)    47
Name: count, dtype: int64

In [91]:
# Keep only compact rules: at most two antecedent items and exactly one
# consequent item.
antecedent_is_short = rules_filtered['antecedents'].map(len).le(2)
consequent_is_single = rules_filtered['consequents'].map(len).eq(1)

rules_filtered = rules_filtered[antecedent_is_short & consequent_is_single]
rules_filtered.shape

(20, 14)

In [92]:
# Inspect the compact rules before de-duplication.
rules_filtered

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
249,"(hb_low, coronary_artery_disease)",(hypertension),0.285871,0.491109,0.192928,0.67488,1.374196,1.0,0.052535,1.565241,0.381306,0.330328,0.361121,0.533861
181,"(creatinine_high, coronary_artery_disease)",(hypertension),0.178977,0.491109,0.119751,0.669087,1.362399,1.0,0.031854,1.537837,0.323987,0.217597,0.349736,0.456462
225,"(urea_high, coronary_artery_disease)",(hypertension),0.271919,0.491109,0.17713,0.651408,1.326402,1.0,0.043588,1.459848,0.337986,0.302323,0.314997,0.506041
294,"(diabetes_mellitus, coronary_artery_disease)",(hypertension),0.245589,0.491109,0.159417,0.649123,1.321748,1.0,0.038806,1.450339,0.322671,0.276152,0.310506,0.486865
245,"(hb_low, diabetes_mellitus)",(hypertension),0.168308,0.491109,0.108535,0.64486,1.313068,1.0,0.025877,1.432929,0.286674,0.19702,0.302129,0.43293
272,"(age_ge_60, coronary_artery_disease)",(hypertension),0.439748,0.491109,0.28163,0.640435,1.304059,1.0,0.065666,1.415297,0.416176,0.433793,0.293435,0.606947
178,"(creatinine_high, diabetes_mellitus)",(hypertension),0.113528,0.491109,0.071946,0.633735,1.290415,1.0,0.016192,1.389406,0.253878,0.135062,0.280268,0.390116
266,"(diabetes_mellitus, age_ge_60)",(hypertension),0.214061,0.491109,0.134728,0.629393,1.281574,1.0,0.029601,1.373128,0.27955,0.236183,0.271736,0.451864
115,"(glucose_high, coronary_artery_disease)",(hypertension),0.413076,0.491109,0.258378,0.625497,1.273641,1.0,0.055512,1.358842,0.36606,0.400085,0.264079,0.575804
286,"(male, hypertension)",(coronary_artery_disease),0.293188,0.683764,0.25448,0.867973,1.269404,1.0,0.054008,2.395235,0.300262,0.352234,0.582504,0.620074


In [93]:
# Collapse rules built from the same set of items (e.g. {A, B} -> C and
# {A, C} -> B), keeping the direction with the highest confidence.
#
# The key is built as a standalone Series and attached with .assign() rather
# than written into rules_filtered: rules_filtered is a boolean-indexed slice,
# so assigning a column to it in place triggers pandas' SettingWithCopyWarning.
# A list comprehension also handles an empty rule table, where
# DataFrame.apply(axis=1) would not return a Series.
rule_key = pd.Series(
    [
        tuple(sorted(list(antecedents) + list(consequents)))
        for antecedents, consequents in zip(
            rules_filtered['antecedents'], rules_filtered['consequents']
        )
    ],
    index=rules_filtered.index,
    dtype=object,
)

rules_filtered = (
    rules_filtered
    .assign(rule_key=rule_key)
    .sort_values(by='confidence', ascending=False)   # best direction first
    .drop_duplicates(subset='rule_key', keep='first')
    .drop(columns='rule_key')                        # helper column no longer needed
)


In [94]:
# Inspect the rules after de-duplication (ordered by confidence, highest first).
rules_filtered

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
286,"(male, hypertension)",(coronary_artery_disease),0.293188,0.683764,0.25448,0.867973,1.269404,1.0,0.054008,2.395235,0.300262,0.352234,0.582504,0.620074
117,"(glucose_high, hypertension)",(coronary_artery_disease),0.304404,0.683764,0.258378,0.848798,1.241361,1.0,0.050237,2.091479,0.279519,0.354044,0.52187,0.613337
274,"(age_ge_60, hypertension)",(coronary_artery_disease),0.332444,0.683764,0.28163,0.847151,1.238952,1.0,0.054317,2.06894,0.288914,0.383391,0.516661,0.629517
41,(hypertension),(coronary_artery_disease),0.491109,0.683764,0.415675,0.8464,1.237854,1.0,0.079872,2.05883,0.377587,0.547518,0.514287,0.727161
250,"(hb_low, hypertension)",(coronary_artery_disease),0.228218,0.683764,0.192928,0.84537,1.236347,1.0,0.036881,2.045114,0.247694,0.268309,0.51103,0.563763
226,"(urea_high, hypertension)",(coronary_artery_disease),0.214608,0.683764,0.17713,0.825366,1.207092,1.0,0.030389,1.810854,0.218442,0.245591,0.447774,0.542209
181,"(creatinine_high, coronary_artery_disease)",(hypertension),0.178977,0.491109,0.119751,0.669087,1.362399,1.0,0.031854,1.537837,0.323987,0.217597,0.349736,0.456462
294,"(diabetes_mellitus, coronary_artery_disease)",(hypertension),0.245589,0.491109,0.159417,0.649123,1.321748,1.0,0.038806,1.450339,0.322671,0.276152,0.310506,0.486865
245,"(hb_low, diabetes_mellitus)",(hypertension),0.168308,0.491109,0.108535,0.64486,1.313068,1.0,0.025877,1.432929,0.286674,0.19702,0.302129,0.43293
178,"(creatinine_high, diabetes_mellitus)",(hypertension),0.113528,0.491109,0.071946,0.633735,1.290415,1.0,0.016192,1.389406,0.253878,0.135062,0.280268,0.390116


In [95]:
# Re-apply the lift cut-off with a strict `>` so it matches the `lift > 1.2`
# filter used when `rules` was first thresholded (this line previously used `>=`).
rules_filtered = rules_filtered[rules_filtered['lift'] > 1.2]

In [96]:
# Final ordering for reporting: highest lift first, confidence as tie-breaker.
ranking_keys = ['lift', 'confidence']
rules_final = rules_filtered.sort_values(by=ranking_keys, ascending=[False, False])

## Final Selected Rules and Interpretation

In [98]:
# Number of final rules per consequent.
rules_final["consequents"].value_counts()

consequents
(hypertension)               9
(coronary_artery_disease)    6
Name: count, dtype: int64

### Final Association Rules (Medical Associations)

After generating association rules using the Apriori algorithm and filtering them based on minimum support, confidence, and lift thresholds, a small set of representative rules was selected for final interpretation.

**Rule selection was based on:**
- **Clinical diversity**: covering metabolic, renal, demographic, and cardiovascular factors  
- **Non-redundancy**: avoiding repeated variants of the same association  
- **Stakeholder clarity**: ensuring rules are easily interpretable in a clinical context  
- **Statistical strength**: all rules have support ≥ 5%, confidence ≥ 0.6, and lift > 1.2  

---

#### Selected Rules and Interpretations

1. **{creatinine_high, coronary_artery_disease} → {hypertension}**  
   Patients with coronary artery disease and elevated creatinine levels have hypertension more often than the cohort overall (confidence ≈ 0.67, lift ≈ 1.36), suggesting a renal–cardiovascular comorbidity pattern.

2. **{diabetes_mellitus, coronary_artery_disease} → {hypertension}**  
   Diabetes and coronary artery disease commonly co-occur with hypertension, reflecting clustering of chronic cardiometabolic conditions.

3. **{diabetes_mellitus, age_ge_60} → {hypertension}**  
   Among patients aged 60 or older with diabetes, hypertension appears more frequently than in the cohort overall (confidence ≈ 0.63, lift ≈ 1.28), consistent with hypertension being more common in older diabetic patients.

4. **{hypertension} → {coronary_artery_disease}**  
   Hypertension is associated with coronary artery disease (confidence ≈ 0.85, lift ≈ 1.24) and emerges as a central comorbid condition in the dataset.

5. **{chronic_kidney_disease} → {hypertension}**  
   Chronic kidney disease frequently co-occurs with hypertension, consistent with known renal–blood pressure relationships.

---

These associations represent **co-occurrence patterns**, not causal or predictive relationships, and are intended to support clinical insight and awareness of common comorbidity clusters.
