In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
import pickle

In [2]:
df = pd.read_pickle("../data/processed.pkl")
categorical_columns = df.select_dtypes(include=['category']).columns

In [3]:
with open("../data/data_element_details.pkl", "rb") as file:
    data_element_details = pickle.load(file)

## df_a - Mental Health & Chronic Conditions

In [4]:
section_dict = {
    "Chronic Health Conditions": [], 
    # "Social Determinants and Health Equity": []
}

keep_cols = []
for value in data_element_details.values():
    column_name = f"{value['Label']} ({value['SAS Variable Name']})"
    if value['Section Name'] in section_dict.keys() and column_name in df.columns:
        for key, valid_values in value['Valid Values'].items():
            v = f"{column_name}:{key} - {valid_values['Value Label']}"
            section_dict[value['Section Name']].append( frozenset({v}) )
        keep_cols.append(column_name)
df_a = df[keep_cols]

In [5]:
# df = df[categorical_columns]
transactions = []
for _, row in df_a.iterrows():
    transaction = [f"{col}:{val}" for col, val in zip(df_a.columns, row) if pd.notna(val) and not col.startswith("BLANK")]
    transactions.append(transaction)

In [6]:
# Encode the transactions
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
encoded_df = pd.DataFrame(te_ary, columns=te.columns_)

In [7]:
# Apply FP-Growth algorithm
frequent_itemsets = fpgrowth(encoded_df, min_support=0.005, use_colnames=True, max_len=5)

In [8]:
# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.01)

In [9]:
chronic_health_conditions = section_dict["Chronic Health Conditions"]
chronic_health_conditions

[frozenset({'Ever Diagnosed with Heart Attack (CVDINFR4):1 - Yes'}),
 frozenset({'Ever Diagnosed with Heart Attack (CVDINFR4):2 - No'}),
 frozenset({'Ever Diagnosed with Heart Attack (CVDINFR4):7 - Don\x92t know/Not sure'}),
 frozenset({'Ever Diagnosed with Heart Attack (CVDINFR4):9 - Refused'}),
 frozenset({'Ever Diagnosed with Heart Attack (CVDINFR4):BLANK - Not asked or Missing'}),
 frozenset({'Ever Diagnosed with Angina or Coronary Heart Disease (CVDCRHD4):1 - Yes'}),
 frozenset({'Ever Diagnosed with Angina or Coronary Heart Disease (CVDCRHD4):2 - No'}),
 frozenset({'Ever Diagnosed with Angina or Coronary Heart Disease (CVDCRHD4):7 - Don\x92t know/Not sure'}),
 frozenset({'Ever Diagnosed with Angina or Coronary Heart Disease (CVDCRHD4):9 - Refused'}),
 frozenset({'Ever Diagnosed with Angina or Coronary Heart Disease (CVDCRHD4):BLANK - Not asked or Missing'}),
 frozenset({'Ever Diagnosed with a Stroke (CVDSTRK3):1 - Yes'}),
 frozenset({'Ever Diagnosed with a Stroke (CVDSTRK3):2 - No

In [10]:
chronic_health_conditions_list = [
    frozenset({'Ever Diagnosed with Heart Attack (CVDINFR4):1 - Yes'}),
    frozenset({'Ever Diagnosed with Angina or Coronary Heart Disease (CVDCRHD4):1 - Yes'}),
    frozenset({'Ever Diagnosed with a Stroke (CVDSTRK3):1 - Yes'}),
    frozenset({'Ever Told Had Asthma (ASTHMA3):1 - Yes'}),
    frozenset({'Still Have Asthma (ASTHNOW):1 - Yes'}),
    frozenset({'(Ever told) (you had) skin cancer that is not melanoma? (CHCSCNC1):1 - Yes'}),
    frozenset({'(Ever told) (you had)  melanoma or any other types of cancer? (CHCOCNC1):1 - Yes'}),
    frozenset({'Ever told you had C.O.P.D. emphysema or chronic bronchitis? (CHCCOPD3):1 - Yes'}),
    frozenset({'Ever told you have kidney disease? (CHCKDNY2):1 - Yes'}),
    frozenset({'Told Had Arthritis (HAVARTH4):1 - Yes'}),
    frozenset({'(Ever told) you had diabetes (DIABETE4):1 - Yes'}),
]

In [11]:
pd.set_option('max_colwidth', 400)
filtered_rules = rules[
    rules['antecedents'].apply(lambda x: any(item.issubset(x) for item in chronic_health_conditions_list))
    &                 
    (rules['consequents'] == frozenset({'(Ever told) you had a depressive disorder (ADDEPEV3):1 - Yes'}))
    # &
    # (rules['support'] > .1)
].sort_values("confidence", ascending=False)
filtered_rules.to_csv('filtered_rules.csv')
len(filtered_rules)

2115