In [12]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
import pickle
import numpy as np

In [2]:
# Load the processed dataset from disk. NOTE(review): read_pickle unpickles
# the file, so it must only ever be pointed at trusted data.
df = pd.read_pickle("../data/processed.pkl")
# Names of all columns stored with pandas' categorical dtype.
categorical_columns = df.select_dtypes(include="category").columns

In [3]:
# Load the pickled data-element details; the cells below read it as a mapping
# whose values carry 'Label', 'SAS Variable Name', 'Section Name' and
# 'Valid Values' entries.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file,
# so only load pickles produced by a trusted step of this project.
with open("../data/data_element_details.pkl", "rb") as file:
    data_element_details = pickle.load(file)

In [50]:
# Sections of interest. Each section name maps to a list that is filled below
# with one single-item frozenset per valid answer, formatted as
# "<column name>:<code> - <value label>".
section_dict = {
    section_name: []
    for section_name in (
        "Chronic Health Conditions",
        "Social Determinants and Health Equity",
        "Disability",
        "Sexual Orientation and Gender Identity (SOGI)",
    )
}

# Columns of df that belong to one of the sections above.
keep_cols = []
for element in data_element_details.values():
    # Column names in df follow the "<Label> (<SAS Variable Name>)" pattern.
    column_name = f"{element['Label']} ({element['SAS Variable Name']})"
    section_name = element['Section Name']
    # Skip elements from other sections and elements with no column in df.
    if section_name not in section_dict or column_name not in df.columns:
        continue
    section_dict[section_name].extend(
        frozenset({f"{column_name}:{code} - {details['Value Label']}"})
        for code, details in element['Valid Values'].items()
    )
    keep_cols.append(column_name)

# Restrict the frame to the selected columns.
df_a = df[keep_cols]

SyntaxError: ':' expected after dictionary key (1973357468.py, line 4)

In [47]:
from sklearn.tree import DecisionTreeClassifier , export_text, _tree
from sklearn import tree
import matplotlib.pyplot as plt

# Split the frame into predictors and target; the predictors are one-hot
# encoded by pd.get_dummies, the target keeps its original labels.
X = df.drop('(Ever told) you had a depressive disorder (ADDEPEV3)', axis=1)
X = pd.get_dummies(X)
y = df['(Ever told) you had a depressive disorder (ADDEPEV3)']

# Create decision tree classifier object
clf = DecisionTreeClassifier(random_state=0, max_depth=30)

# Train decision tree classifier
model = clf.fit(X, y)

# Hand export_text a plain list of names rather than the pandas Index: older
# scikit-learn releases document this argument as a list of str and can fail
# on an Index, while a list is accepted by every release.
decision_rules = export_text(clf, feature_names=list(X.columns))
print(decision_rules)



|--- Difficulty Concentrating or Remembering (DECIDE)_1 - Yes <= 0.50
|   |--- How often have you felt this kind of stress? (SDHSTRE1)_5 - Never <= 0.50
|   |   |--- Are you male or female? (CELLSEX1)_2 - FemaleGo to CP.06, PVTRESD3 <= 0.50
|   |   |   |--- How often do you feel socially isolated from others? (SDHISOLT)_3 - Sometimes <= 0.50
|   |   |   |   |--- How often do you feel socially isolated from others? (SDHISOLT)_2 - Usually <= 0.50
|   |   |   |   |   |--- Employment Status (EMPLOY1)_8 - Unable to work <= 0.50
|   |   |   |   |   |   |--- Ever Told Had Asthma (ASTHMA3)_2 - NoGo to Section 07.06 CHCSCNC1 <= 0.50
|   |   |   |   |   |   |   |--- Ever told you have kidney disease? (CHCKDNY2)_9 - Refused <= 0.50
|   |   |   |   |   |   |   |   |--- Month and Year of Last HIV Test (HIVTSTD3)_BLANK - Not asked or Missing <= 0.50
|   |   |   |   |   |   |   |   |   |--- Difficulty Doing Errands Alone (DIFFALON)_2 - No <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- Difficu

In [43]:
def get_rules(tree, feature_names, class_names, target_label, min_samples=1):
    """Turn the leaves of a fitted decision tree into readable rule strings.

    Every root-to-leaf path becomes one string of the form
    ``if (<feature> <= t) and (<feature> > t) ... then <outcome> | based on N samples``.

    Parameters
    ----------
    tree : fitted estimator
        Object exposing a ``tree_`` attribute with ``children_left``,
        ``children_right``, ``feature``, ``threshold``, ``value`` and
        ``n_node_samples`` arrays (e.g. a fitted scikit-learn decision tree).
    feature_names : sequence
        Feature names, indexable by the integer feature ids in ``tree_.feature``.
    class_names : sequence or None
        Class names indexable by class position. When ``None`` the leaf value
        is reported as ``response: <value>`` instead of a class.
    target_label : object
        When ``class_names`` is given, only rules whose predicted class name
        equals ``target_label`` are returned. Ignored when ``class_names`` is
        ``None`` (there is no class name to compare against, so every rule is
        returned).
    min_samples : int, default 1
        Leaves reached by fewer samples than this are skipped.

    Returns
    -------
    list of str
        Rules sorted by the share (in percent, rounded to 2 decimals) of the
        leaf's largest value entry, highest first.
    """
    tree_ = tree.tree_

    # One entry per leaf: (condition strings, leaf value array, sample count).
    leaves = []

    def recurse(node, conditions):
        left = tree_.children_left[node]
        right = tree_.children_right[node]
        # In scikit-learn's tree arrays a leaf has identical left and right
        # child ids, so distinct children mean this node is a split node.
        if left != right:
            name = feature_names[tree_.feature[node]]
            threshold = np.round(tree_.threshold[node], 3)
            recurse(left, conditions + [f"({name} <= {threshold})"])
            recurse(right, conditions + [f"({name} > {threshold})"])
        else:
            leaves.append(
                (conditions, tree_.value[node], tree_.n_node_samples[node])
            )

    recurse(0, [])

    # Score each sufficiently populated leaf by the share of its largest entry.
    ranked = []
    for conditions, value, samples in leaves:
        if samples < min_samples:
            continue
        classes = value[0]
        winner = np.argmax(classes)
        proba = np.round(100.0 * classes[winner] / np.sum(classes), 2)
        ranked.append((proba, conditions, classes, winner, samples))

    # Highest share first. Ties are broken by the condition strings only, so
    # the leaf value arrays are never compared with each other.
    ranked.sort(key=lambda entry: (entry[0], entry[1]), reverse=True)

    rules = []
    for proba, conditions, classes, winner, samples in ranked:
        if class_names is None:
            # No class names to filter on: report the raw leaf value and keep
            # every rule (the target_label filter cannot apply here).
            outcome = "response: " + str(np.round(classes[0], 3))
        else:
            if class_names[winner] != target_label:
                continue
            outcome = f"class: {class_names[winner]} (proba: {proba}%)"
        rules.append(
            "if "
            + " and ".join(conditions)
            + " then "
            + outcome
            + f" | based on {samples:,} samples"
        )

    return rules


In [39]:
# Show the distinct labels of the target column (first occurrence of each).
y.drop_duplicates()

0                             2 - No
20                           1 - Yes
400                      9 - Refused
661          7 - Dont know/Not sure
1301    BLANK - Not asked or Missing
Name: (Ever told) you had a depressive disorder (ADDEPEV3), dtype: category
Categories (5, object): ['1 - Yes', '2 - No', '7 - Dont know/Not sure', '9 - Refused', 'BLANK - Not asked or Missing']

In [49]:
# Take the class labels from the fitted classifier instead of a hand-copied
# list: clf.classes_ holds the labels in the order scikit-learn uses for its
# per-class outputs, so the names cannot drift out of sync with the model.
class_names = list(clf.classes_)

# Keep only rules predicting "1 - Yes" whose leaf has at least 200 samples.
rules = get_rules(clf, X.columns, class_names, "1 - Yes", 200)
for r in rules:
    print(r)

if (Difficulty Concentrating or Remembering (DECIDE)_1 - Yes > 0.5) and (How often have you felt this kind of stress? (SDHSTRE1)_5 - Never <= 0.5) and (Difficulty Doing Errands Alone (DIFFALON)_1 - Yes <= 0.5) and (Are you male or female? (CELLSEX1)_2 - FemaleGo to CP.06, PVTRESD3 > 0.5) and (Language identifier (QSTLANG)_2 - Spanish <= 0.5) and (Ever Told Had Asthma (ASTHMA3)_2 - NoGo to Section 07.06 CHCSCNC1 > 0.5) and (Ever tested H.I.V. (HIVTST7)_1 - Yes > 0.5) and (Have Personal Health Care Provider? (PERSDOC3)_3 - No <= 0.5) and (Ever told you had C.O.P.D. emphysema or chronic bronchitis? (CHCCOPD3)_2 - No <= 0.5) and (Ever had any other kind of test for colorectal cancer (COLNCNCR)_7 - Dont know/Not SureGo to Next Section <= 0.5) and (Relationship to child (RCSRLTN2)_2 - Grandparent <= 0.5) and (What did you do to keep you from getting pregnant? (TYPCNTR9)_10 - Withdrawal or pulling out <= 0.5) and (Education Level (EDUCA)_6 - College 4 years or more (College graduate) <= 0

### Random Forest

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

# Predictors are every column except the target, one-hot encoded.
X = df.drop('(Ever told) you had a depressive disorder (ADDEPEV3)', axis=1)
X = pd.get_dummies(X)
y = df['(Ever told) you had a depressive disorder (ADDEPEV3)']

# Encode target variable
y = y.map({'1 - Yes': 1, '2 - No': 0, '7 - Dont know/Not sure': 2, '9 - Refused': 3, 'BLANK - Not asked or Missing': -1})

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier with 100 trees
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make predictions on test set
y_pred = rf.predict(X_test)

# Print classification report. zero_division=0 reports 0.0 for classes that
# never get predicted -- the same numbers as the default setting -- without
# emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))

# Get feature importances
importances = rf.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

# Print the feature ranking (1-based rank, column name, importance)
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, X.columns[idx], importances[idx]))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         2
           0       0.83      0.97      0.90     70204
           1       0.72      0.27      0.40     18278
           2       0.00      0.00      0.00       429
           3       0.95      0.34      0.50       114

    accuracy                           0.82     89027
   macro avg       0.50      0.32      0.36     89027
weighted avg       0.81      0.82      0.79     89027

Feature ranking:
1. Difficulty Concentrating or Remembering (DECIDE)_1 - Yes (0.023177)
2. Difficulty Concentrating or Remembering (DECIDE)_2 - No (0.018654)
3. Difficulty Doing Errands Alone (DIFFALON)_1 - Yes (0.007170)
4. Number of Adults in Household (HHADULT) (0.007078)
5. How often have you felt this kind of stress? (SDHSTRE1)_5 - Never (0.006915)
6. How often have you felt this kind of stress? (SDHSTRE1)_1 - Always (0.005814)
7. How often have you felt this kind of stress? (SDHSTRE1)_2 - Usually (0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
