In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
import pickle
import numpy as np

In [2]:
# Load the processed data frame from disk.
df = pd.read_pickle("../data/processed.pkl")

# Names of all columns stored with the pandas 'category' dtype.
categorical_columns = df.select_dtypes(include="category").columns

In [3]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this from a trusted source.
with open("../data/data_element_details.pkl", mode="rb") as details_file:
    data_element_details = pickle.load(details_file)

In [4]:
# Section names whose variables are kept. Each one maps to a list of
# single-item frozensets, one per valid value of every kept column.
section_dict = {
    section: []
    for section in (
        "Chronic Health Conditions",
        "Social Determinants and Health Equity",
        "Disability",
        "Sexual Orientation and Gender Identity (SOGI)",
    )
}

keep_cols = []
for element in data_element_details.values():
    column_name = f"{element['Label']} ({element['SAS Variable Name']})"
    section_name = element['Section Name']
    # Ignore elements from other sections or without a matching column in df.
    if section_name not in section_dict or column_name not in df.columns:
        continue
    for code, details in element['Valid Values'].items():
        item = f"{column_name}:{code} - {details['Value Label']}"
        section_dict[section_name].append(frozenset({item}))
    keep_cols.append(column_name)

# Keep only the selected columns, then only the rows whose target answer is
# an explicit Yes or No.
selected = df[keep_cols]
answered_yes_no = selected['(Ever told) you had a depressive disorder (ADDEPEV3)'].isin(['1 - Yes', '2 - No'])
df_a = selected[answered_yes_no]

In [5]:
from sklearn.tree import DecisionTreeClassifier , export_text, _tree
from sklearn import tree
import matplotlib.pyplot as plt

# Predictors: every kept column except the target, one-hot encoded.
X = df_a.drop('(Ever told) you had a depressive disorder (ADDEPEV3)', axis=1)
X = pd.get_dummies(X)
# Target: the depressive-disorder answer (filtered to Yes/No when df_a was built).
y = df_a['(Ever told) you had a depressive disorder (ADDEPEV3)']

# Depth-limited tree with a fixed seed so the fitted structure is reproducible.
clf = DecisionTreeClassifier(random_state=0, max_depth=10)

# fit() returns the estimator itself, so `model` and `clf` are the same object.
model = clf.fit(X, y)

# Pass a plain list: some scikit-learn releases truth-test `feature_names`,
# which raises ValueError for a pandas Index.
decision_rules = export_text(clf, feature_names=list(X.columns))
print(decision_rules)



|--- Difficulty Concentrating or Remembering (DECIDE)_1 - Yes <= 0.50
|   |--- How often have you felt this kind of stress? (SDHSTRE1)_5 - Never <= 0.50
|   |   |--- Ever Told Had Asthma (ASTHMA3)_2 - NoGo to Section 07.06 CHCSCNC1 <= 0.50
|   |   |   |--- How often have you felt this kind of stress? (SDHSTRE1)_2 - Usually <= 0.50
|   |   |   |   |--- How often have you felt this kind of stress? (SDHSTRE1)_1 - Always <= 0.50
|   |   |   |   |   |--- Difficulty Doing Errands Alone (DIFFALON)_1 - Yes <= 0.50
|   |   |   |   |   |   |--- Sexual orientation (SOMALE)_2 - Straight, that is, not gay <= 0.50
|   |   |   |   |   |   |   |--- How often have you felt this kind of stress? (SDHSTRE1)_3 - Sometimes <= 0.50
|   |   |   |   |   |   |   |   |--- Ever told you had C.O.P.D. emphysema or chronic bronchitis? (CHCCOPD3)_1 - Yes <= 0.50
|   |   |   |   |   |   |   |   |   |--- Sexual orientation (SOFEMALE)_3 - Bisexual <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- class: 2 - No
|   |

In [6]:
def get_rules(tree, feature_names, class_names, target_label, min_samples=1):
    """Convert a fitted decision tree into readable "if ... then ..." rules.

    Every leaf yields one rule made of the split conditions on the path from
    the root to that leaf. Rules are ordered by the share of the leaf's
    majority class, highest first.

    Parameters
    ----------
    tree : object
        Fitted estimator exposing a ``tree_`` attribute with the arrays
        ``feature``, ``threshold``, ``children_left``, ``children_right``,
        ``value`` and ``n_node_samples``.
    feature_names : sequence
        Feature labels, indexable by the integer feature ids in
        ``tree_.feature``.
    class_names : sequence or None
        Labels indexable by column position of ``tree_.value`` (for a
        scikit-learn classifier this is the order of ``classes_``). When
        None, each rule reports ``response: <first value>`` instead of a
        class, and no label filtering is applied.
    target_label : object
        Only rules whose predicted label equals this value are returned.
        Ignored when ``class_names`` is None.
    min_samples : int, default 1
        Leaves reached by fewer training samples than this are dropped.

    Returns
    -------
    list of str
        One formatted rule per retained leaf.
    """
    tree_ = tree.tree_
    leaf_paths = []

    def recurse(node, path):
        left = tree_.children_left[node]
        right = tree_.children_right[node]
        # A split node has two distinct children; a leaf stores the same
        # sentinel id in both child slots.
        if left != right:
            name = feature_names[tree_.feature[node]]
            threshold = np.round(tree_.threshold[node], 3)
            recurse(left, path + [f"({name} <= {threshold})"])
            recurse(right, path + [f"({name} > {threshold})"])
        else:
            # The final path element carries the leaf's value array and its
            # training-sample count.
            leaf_paths.append(
                path + [(tree_.value[node], tree_.n_node_samples[node])]
            )

    recurse(0, [])

    # Score each sufficiently populated leaf by its majority-class share (in %).
    scored = []
    for path in leaf_paths:
        leaf_value, samples = path[-1]
        if samples < min_samples:
            continue
        classes = leaf_value[0]
        best = np.argmax(classes)
        prob = np.round(100.0 * classes[best] / np.sum(classes), 2)
        scored.append((prob, path))

    # Highest share first; equal shares fall back to comparing the condition
    # strings, which always differ at the split where two paths diverge.
    scored.sort(reverse=True)

    rules = []
    for prob, path in scored:
        *conditions, (leaf_value, samples) = path
        classes = leaf_value[0]
        rule = "if " + " and ".join(conditions) + " then "
        if class_names is None:
            # Without class labels there is nothing to compare with
            # target_label, so every rule is kept.
            rule += "response: " + str(np.round(classes[0], 3))
            keep = True
        else:
            best = np.argmax(classes)
            rule += f"class: {class_names[best]} (proba: {prob}%)"
            keep = class_names[best] == target_label
        rule += f" | based on {samples:,} samples"
        if keep:
            rules.append(rule)

    return rules


In [7]:
# Display the first occurrence of each distinct target value remaining in y.
y.drop_duplicates()

0      2 - No
20    1 - Yes
Name: (Ever told) you had a depressive disorder (ADDEPEV3), dtype: category
Categories (5, object): ['1 - Yes', '2 - No', '7 - Dont know/Not sure', '9 - Refused', 'BLANK - Not asked or Missing']

In [8]:
# get_rules indexes class_names by column position of the tree's leaf value
# arrays, and those columns follow clf.classes_. Take the labels from the
# fitted model so they cannot drift out of alignment with a hand-typed list.
# Only leaves with at least 500 training samples are reported.
rules = get_rules(clf, X.columns, list(clf.classes_), "1 - Yes", 500)
for r in rules:
    print(r)

if (Difficulty Concentrating or Remembering (DECIDE)_1 - Yes > 0.5) and (How often have you felt this kind of stress? (SDHSTRE1)_5 - Never <= 0.5) and (Difficulty Doing Errands Alone (DIFFALON)_2 - No <= 0.5) and (How often have you felt this kind of stress? (SDHSTRE1)_1 - Always <= 0.5) and (How often have you felt this kind of stress? (SDHSTRE1)_4 - Rarely <= 0.5) and (Still Have Asthma (ASTHNOW)_BLANK - Not asked or Missing > 0.5) and (How often have you felt this kind of stress? (SDHSTRE1)_2 - Usually > 0.5) and (Sexual orientation (SOMALE)_2 - Straight, that is, not gay <= 0.5) and (Blind or Difficulty seeing (BLIND)_2 - No > 0.5) and (How often do you feel socially isolated from others? (SDHISOLT)_2 - Usually <= 0.5) then class: 1 - Yes (proba: 82.79%) | based on 639 samples
if (Difficulty Concentrating or Remembering (DECIDE)_1 - Yes > 0.5) and (How often have you felt this kind of stress? (SDHSTRE1)_5 - Never <= 0.5) and (Difficulty Doing Errands Alone (DIFFALON)_2 - No <= 0.5)