**Association Rule Mining**

In [1]:
import os
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent

DATA_PATH = PROJECT_ROOT / "data" / "processed" / "CDC_Diabetes_Dataset_clean.csv"

FIG_DIR = PROJECT_ROOT / "figures" / "results_ARM"
FIG_DIR.mkdir(parents=True, exist_ok=True)


print("Project root directory:", PROJECT_ROOT)
print("Data path exists:", DATA_PATH.exists())
assert DATA_PATH.exists(), f"Data file not found at {DATA_PATH}"
print("Figures directory:", FIG_DIR)

Project root directory: /Users/dawoodbutt/Desktop/10. UNI/4. Data Mining/individual assessment/diabetes-risk
Data path exists: True
Figures directory: /Users/dawoodbutt/Desktop/10. UNI/4. Data Mining/individual assessment/diabetes-risk/figures/results_ARM


In [2]:
import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import apriori, association_rules


pd.set_option("display.max_colwidth", 120)
pd.set_option("display.max_columns", 200)

In [3]:
# Load data
# load data from csv
df = pd.read_csv(DATA_PATH)
print("Data loaded successfully.")
print("Top 5 rows:")
df.head()

Data loaded successfully.
Top 5 rows:


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [4]:
# target definition + overview 

# Define binary diabetes target for ARM
df = df.copy()
df["Diabetes"] = (df["Diabetes_012"] > 0).astype(int)

# Display class distribution
target_summary = (
    df["Diabetes"]
    .value_counts(normalize=True)
    .rename("proportion")
    .to_frame()
)

display(target_summary)

print("Target definition:")
print("0 = No diabetes")
print("1 = Pre-diabetes or diabetes")

Unnamed: 0_level_0,proportion
Diabetes,Unnamed: 1_level_1
0,0.827114
1,0.172886


Target definition:
0 = No diabetes
1 = Pre-diabetes or diabetes


In [5]:

# Target 
target = "Diabetes"

# Binary health indicators (0/1)
binary_features = [
    "HighBP", "HighChol", "CholCheck", "Smoker", "Stroke",
    "HeartDiseaseorAttack", "PhysActivity", "Fruits", "Veggies",
    "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost",
    "DiffWalk", "Sex"
]

# Ordinal / categorical features (coded levels)
ordinal_features = [
    "GenHlth",    # 1 (Excellent) – 5 (Poor)
    "Age",        # age brackets (1–13)
    "Education",  # education levels
    "Income"      # income brackets
]

# Continuous features
continuous_features = [
    "BMI",
    "MentHlth",
    "PhysHlth"
]

# Sanity check: confirm all features exist
all_features = binary_features + ordinal_features + continuous_features + [target]
missing = [c for c in all_features if c not in df.columns]

print("Missing columns:", missing)
print("\nBinary features:", binary_features)
print("\nOrdinal features:", ordinal_features)
print("\nContinuous features:", continuous_features)

Missing columns: []

Binary features: ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex']

Ordinal features: ['GenHlth', 'Age', 'Education', 'Income']

Continuous features: ['BMI', 'MentHlth', 'PhysHlth']


In [6]:
# create binned / labelled versions for ARM itemisation

arm = df.copy()

# --- BMI bins (clinically meaningful) ---
bmi_bins = [-np.inf, 18.5, 25, 30, 35, 40, np.inf]
bmi_labels = ["Underweight", "Normal", "Overweight", "Obese_I", "Obese_II", "Obese_III"]
arm["BMI_bin"] = pd.cut(arm["BMI"], bins=bmi_bins, labels=bmi_labels)

# --- Mental health days bins (0-30 days of poor mental health) ---
mh_bins = [-np.inf, 0, 5, 15, 30]
mh_labels = ["0", "1-5", "6-15", "16-30"]
arm["MentHlth_bin"] = pd.cut(arm["MentHlth"], bins=mh_bins, labels=mh_labels)

# --- Physical health days bins ---
ph_bins = [-np.inf, 0, 5, 15, 30]
ph_labels = ["0", "1-5", "6-15", "16-30"]
arm["PhysHlth_bin"] = pd.cut(arm["PhysHlth"], bins=ph_bins, labels=ph_labels)

# Keep ordinal vars as categorical (labels optional; codes are still interpretable as brackets/levels)
for c in ordinal_features:
    arm[c] = arm[c].astype("category")

# Quick check: show distributions of the binned vars
display(arm["BMI_bin"].value_counts(dropna=False))
display(arm["MentHlth_bin"].value_counts(dropna=False))
display(arm["PhysHlth_bin"].value_counts(dropna=False))

arm[["BMI", "BMI_bin", "MentHlth", "MentHlth_bin", "PhysHlth", "PhysHlth_bin"]].head(10)

BMI_bin
Overweight     81555
Normal         73736
Obese_I        42750
Obese_II       17216
Obese_III      11471
Underweight     3053
Name: count, dtype: int64

MentHlth_bin
0        152623
1-5       40995
6-15      18308
16-30     17855
Name: count, dtype: int64

PhysHlth_bin
0        136877
1-5       46122
16-30     26143
6-15      20639
Name: count, dtype: int64

Unnamed: 0,BMI,BMI_bin,MentHlth,MentHlth_bin,PhysHlth,PhysHlth_bin
0,40.0,Obese_II,18.0,16-30,15.0,6-15
1,25.0,Normal,0.0,0,0.0,0
2,28.0,Overweight,30.0,16-30,30.0,16-30
3,27.0,Overweight,0.0,0,0.0,0
4,24.0,Normal,3.0,1-5,0.0,0
5,25.0,Normal,0.0,0,2.0,1-5
6,30.0,Overweight,0.0,0,14.0,6-15
7,25.0,Normal,0.0,0,0.0,0
8,30.0,Overweight,30.0,16-30,30.0,16-30
9,24.0,Normal,0.0,0,0.0,0


In [7]:
# build basket (boolean item matrix)

# binary items: keep only the positive state as an item (e.g., HighBP=1)
basket_bin = pd.DataFrame(index=arm.index)
for c in binary_features:
    basket_bin[f"{c}=1"] = (arm[c] == 1)

# categorical items: one-hot encode ordinal + binned continuous
cat_cols = ordinal_features + ["BMI_bin", "MentHlth_bin", "PhysHlth_bin"]
basket_cat = pd.get_dummies(arm[cat_cols], prefix=cat_cols, prefix_sep="=")

# target item (RHS): Diabetes=1
basket_target = pd.DataFrame({f"{target}=1": (arm[target] == 1)}, index=arm.index)

# Combine into final basket
basket = pd.concat([basket_bin, basket_cat, basket_target], axis=1).astype(bool)

print("Basket shape:", basket.shape)
print("Example item columns:", basket.columns[:15].tolist())
basket.head()

Basket shape: (229781, 61)
Example item columns: ['HighBP=1', 'HighChol=1', 'CholCheck=1', 'Smoker=1', 'Stroke=1', 'HeartDiseaseorAttack=1', 'PhysActivity=1', 'Fruits=1', 'Veggies=1', 'HvyAlcoholConsump=1', 'AnyHealthcare=1', 'NoDocbcCost=1', 'DiffWalk=1', 'Sex=1', 'GenHlth=1.0']


Unnamed: 0,HighBP=1,HighChol=1,CholCheck=1,Smoker=1,Stroke=1,HeartDiseaseorAttack=1,PhysActivity=1,Fruits=1,Veggies=1,HvyAlcoholConsump=1,AnyHealthcare=1,NoDocbcCost=1,DiffWalk=1,Sex=1,GenHlth=1.0,GenHlth=2.0,GenHlth=3.0,GenHlth=4.0,GenHlth=5.0,Age=1.0,Age=2.0,Age=3.0,Age=4.0,Age=5.0,Age=6.0,Age=7.0,Age=8.0,Age=9.0,Age=10.0,Age=11.0,Age=12.0,Age=13.0,Education=1.0,Education=2.0,Education=3.0,Education=4.0,Education=5.0,Education=6.0,Income=1.0,Income=2.0,Income=3.0,Income=4.0,Income=5.0,Income=6.0,Income=7.0,Income=8.0,BMI_bin=Underweight,BMI_bin=Normal,BMI_bin=Overweight,BMI_bin=Obese_I,BMI_bin=Obese_II,BMI_bin=Obese_III,MentHlth_bin=0,MentHlth_bin=1-5,MentHlth_bin=6-15,MentHlth_bin=16-30,PhysHlth_bin=0,PhysHlth_bin=1-5,PhysHlth_bin=6-15,PhysHlth_bin=16-30,Diabetes=1
0,True,True,True,True,False,False,False,False,True,False,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,False
1,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False
2,True,True,True,False,False,False,False,True,False,False,True,True,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False
3,True,False,True,False,False,False,True,True,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
4,True,True,True,False,False,False,True,True,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False


In [8]:
#  stratified sampling + Apriori frequent itemsets


rhs_item = f"{target}=1"

# --- Stratified sample indices (keeps Diabetes proportion) ---
n_sample = 50000  # low n as my whole system keeps crashing
pos_idx = basket.index[basket[rhs_item]].to_numpy()
neg_idx = basket.index[~basket[rhs_item]].to_numpy()

# keep original class balance
p = len(pos_idx) / len(basket)
n_pos = int(n_sample * p)
n_neg = n_sample - n_pos

rng = np.random.default_rng(42)
sample_idx = np.concatenate([
    rng.choice(pos_idx, size=n_pos, replace=False),
    rng.choice(neg_idx, size=n_neg, replace=False),
])

basket_s = basket.loc[sample_idx].copy()

print("Sample basket shape:", basket_s.shape)
print(f"Sample P({rhs_item}) =", basket_s[rhs_item].mean().round(4))

# --- Frequent itemsets on sample ---
min_support = 0.02
max_len = 3

freq_itemsets = apriori(
    basket_s,
    min_support=min_support,
    use_colnames=True,
    max_len=max_len
).sort_values("support", ascending=False)

print("Frequent itemsets:", freq_itemsets.shape)
freq_itemsets.head(10)

Sample basket shape: (50000, 61)
Sample P(Diabetes=1) = 0.1729
Frequent itemsets: (4885, 2)


Unnamed: 0,support,itemsets
2,0.96004,(CholCheck=1)
10,0.94602,(AnyHealthcare=1)
168,0.91356,"(CholCheck=1, AnyHealthcare=1)"
8,0.7952,(Veggies=1)
166,0.76382,"(Veggies=1, CholCheck=1)"
392,0.75444,"(Veggies=1, AnyHealthcare=1)"
6,0.7337,(PhysActivity=1)
2015,0.72858,"(Veggies=1, AnyHealthcare=1, CholCheck=1)"
164,0.70398,"(CholCheck=1, PhysActivity=1)"
295,0.6962,"(AnyHealthcare=1, PhysActivity=1)"


In [9]:
# association rules (filter to consequent Diabetes=1)


rhs_item = f"{target}=1"

rules = association_rules(freq_itemsets, metric="confidence", min_threshold=0.2)

# Keep only rules where consequent is exactly {Diabetes=1}
rules_diab = rules[rules["consequents"].apply(lambda s: (len(s) == 1) and (rhs_item in s))].copy()

# Add lengths for filtering
rules_diab["antecedent_len"] = rules_diab["antecedents"].apply(len)

# Filter to reduce noise / triviality
rules_diab_f = rules_diab[
    (rules_diab["support"] >= 0.01) &
    (rules_diab["lift"] >= 1.2) &
    (rules_diab["antecedent_len"] <= 3)
].sort_values(["lift", "confidence", "support"], ascending=False)

print("All diabetes rules:", rules_diab.shape)
print("Filtered diabetes rules:", rules_diab_f.shape)

rules_diab_f.head(15)[["antecedents", "consequents", "support", "confidence", "lift"]]

All diabetes rules: (132, 15)
Filtered diabetes rules: (124, 15)


Unnamed: 0,antecedents,consequents,support,confidence,lift
14325,"(HeartDiseaseorAttack=1, DiffWalk=1)",(Diabetes=1),0.02012,0.464665,2.68779
9829,"(GenHlth=4.0, HighChol=1)",(Diabetes=1),0.03292,0.422919,2.446314
8866,"(GenHlth=4.0, HighBP=1)",(Diabetes=1),0.03746,0.419203,2.424822
7762,"(DiffWalk=1, HighChol=1)",(Diabetes=1),0.04414,0.416494,2.409149
14090,"(CholCheck=1, GenHlth=5.0)",(Diabetes=1),0.02062,0.408641,2.363723
6905,"(DiffWalk=1, HighBP=1)",(Diabetes=1),0.05072,0.408242,2.361419
14023,(GenHlth=5.0),(Diabetes=1),0.02076,0.402794,2.329905
12016,"(PhysHlth_bin=16-30, HighChol=1)",(Diabetes=1),0.02542,0.400946,2.319218
10257,"(HeartDiseaseorAttack=1, HighBP=1)",(Diabetes=1),0.03128,0.398776,2.306664
11109,"(PhysHlth_bin=16-30, HighBP=1)",(Diabetes=1),0.02794,0.398573,2.305492


In [10]:
# format top rules for report ready

def pretty_itemset(s):
    return ", ".join(sorted(list(s)))

report_rules = rules_diab_f.copy()
report_rules["Rule (Antecedent → Diabetes)"] = report_rules["antecedents"].apply(pretty_itemset) + "  →  " + rhs_item

# Select + round for readability
top_rules = report_rules[
    ["Rule (Antecedent → Diabetes)", "support", "confidence", "lift", "antecedent_len"]
].head(12).copy()

top_rules["support"] = top_rules["support"].round(4)
top_rules["confidence"] = top_rules["confidence"].round(3)
top_rules["lift"] = top_rules["lift"].round(2)

display(top_rules)

Unnamed: 0,Rule (Antecedent → Diabetes),support,confidence,lift,antecedent_len
14325,"DiffWalk=1, HeartDiseaseorAttack=1 → Diabetes=1",0.0201,0.465,2.69,2
9829,"GenHlth=4.0, HighChol=1 → Diabetes=1",0.0329,0.423,2.45,2
8866,"GenHlth=4.0, HighBP=1 → Diabetes=1",0.0375,0.419,2.42,2
7762,"DiffWalk=1, HighChol=1 → Diabetes=1",0.0441,0.416,2.41,2
14090,"CholCheck=1, GenHlth=5.0 → Diabetes=1",0.0206,0.409,2.36,2
6905,"DiffWalk=1, HighBP=1 → Diabetes=1",0.0507,0.408,2.36,2
14023,GenHlth=5.0 → Diabetes=1,0.0208,0.403,2.33,1
12016,"HighChol=1, PhysHlth_bin=16-30 → Diabetes=1",0.0254,0.401,2.32,2
10257,"HeartDiseaseorAttack=1, HighBP=1 → Diabetes=1",0.0313,0.399,2.31,2
11109,"HighBP=1, PhysHlth_bin=16-30 → Diabetes=1",0.0279,0.399,2.31,2


In [11]:
# save top rules table (appendix-ready)

out_path = PROJECT_ROOT / "data" / "processed" / "ARM_Top_Rules.csv"
top_rules.to_csv(out_path, index=False)
print("Saved:", out_path)

Saved: /Users/dawoodbutt/Desktop/10. UNI/4. Data Mining/individual assessment/diabetes-risk/data/processed/ARM_Top_Rules.csv


In [12]:
# Cell 10: add counts for interpretability

N = len(basket_s)
top_rules2 = top_rules.copy()
top_rules2.insert(1, "count_in_sample", (top_rules2["support"] * N).round().astype(int))

display(top_rules2)

Unnamed: 0,Rule (Antecedent → Diabetes),count_in_sample,support,confidence,lift,antecedent_len
14325,"DiffWalk=1, HeartDiseaseorAttack=1 → Diabetes=1",1005,0.0201,0.465,2.69,2
9829,"GenHlth=4.0, HighChol=1 → Diabetes=1",1645,0.0329,0.423,2.45,2
8866,"GenHlth=4.0, HighBP=1 → Diabetes=1",1875,0.0375,0.419,2.42,2
7762,"DiffWalk=1, HighChol=1 → Diabetes=1",2205,0.0441,0.416,2.41,2
14090,"CholCheck=1, GenHlth=5.0 → Diabetes=1",1030,0.0206,0.409,2.36,2
6905,"DiffWalk=1, HighBP=1 → Diabetes=1",2535,0.0507,0.408,2.36,2
14023,GenHlth=5.0 → Diabetes=1,1040,0.0208,0.403,2.33,1
12016,"HighChol=1, PhysHlth_bin=16-30 → Diabetes=1",1270,0.0254,0.401,2.32,2
10257,"HeartDiseaseorAttack=1, HighBP=1 → Diabetes=1",1565,0.0313,0.399,2.31,2
11109,"HighBP=1, PhysHlth_bin=16-30 → Diabetes=1",1395,0.0279,0.399,2.31,2
