In [1]:
import pandas as pd
from tqdm import tqdm
import imodels
from imodels import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
# Read the cleaned survey table, then separate the label column from the
# predictors. The `id` column is dropped together with the label so it is
# not used as a feature.
df = pd.read_csv('../data/ICPSR_03986/DS0001/data_clean.csv')
y = df['any_deviance']
X = df.drop(columns=['id', 'any_deviance'])

# different models select different rules

In [4]:
# Split once so every model below is fit and scored on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
models = [
    BoostedRulesClassifier(n_estimators=5),
    RuleFitClassifier(n_estimators=10, tree_size=3, max_rules=5),
    DecisionTreeClassifier(max_depth=3)
]
for m in tqdm(models):
    try:
        m.fit(X_train, y_train, feature_names=X_train.columns)
    except TypeError:
        # A fit() that does not accept a `feature_names` keyword rejects the
        # call with TypeError; retry with the plain (X, y) signature.
        # Catching only TypeError (instead of a bare `except:`) lets
        # KeyboardInterrupt and genuine fitting errors propagate.
        m.fit(X_train, y_train)
    print(m, 'acc', accuracy_score(y_test, m.predict(X_test)))
    # Not every model exposes a `rules_` attribute; print it only when it is
    # there rather than hiding arbitrary failures behind `except: pass`.
    rules = getattr(m, 'rules_', None)
    if rules is not None:
        print('\t', rules)

  0%|          | 0/3 [00:00<?, ?it/s]

Mined rules:
	you_gang_fight <= 1.5
	you_broken_+_entering <= 1.5
	any_victimization <= 0.5
	you_attack_intent_kill/maim <= 1.5
	fr_suggest_agnts_law <= 1.5
 acc 0.9428571428571428
	 [you_gang_fight <= 1.5, you_gang_fight > 1.5, you_broken_+_entering <= 1.5, you_broken_+_entering > 1.5, any_victimization <= 0.5, any_victimization > 0.5, you_attack_intent_kill/maim <= 1.5, you_attack_intent_kill/maim > 1.5, fr_suggest_agnts_law <= 1.5, fr_suggest_agnts_law > 1.5]


100%|██████████| 3/3 [08:24<00:00, 168.23s/it]

RuleFitClassifier(max_rules=5, n_estimators=10, tree_size=3) acc 0.987912087912088
	 [you_attack_intent_kill/maim > 1.5 and you_broken_+_entering > 1.5 and you_gang_fight > 1.5 and you_strong-arm_robbery > 1.5]
DecisionTreeClassifier(max_depth=3) acc 0.9747252747252747





# the same model fit on slightly different random train/test splits selects different rules

In [5]:
# Refit the same rule-list model on three different random train/test splits
# and print the accuracy and learned rules for each, so the rule sets can be
# compared across splits.
np.random.seed(13)
for i in tqdm(range(3)):
    # Vary the split seed and the model seed together on each repetition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43 + i)
    m = imodels.BayesianRuleListClassifier(max_iter=10, minsupport=0.2, random_state=43 + i)
    # NOTE(review): the recorded run of this cell stopped during the first
    # iteration with "ValueError: attempt to get argmax of an empty sequence".
    # The traceback is not shown, so the failing call is unknown -- confirm
    # whether these hyperparameters (max_iter / minsupport) are the cause.
    m.fit(X_train, y_train, feature_names=X_train.columns)
    print('acc', accuracy_score(y_test, m.predict(X_test)))
    # Pass the model itself: the previous `print('rules', print(m))` printed
    # the model on its own line and then "rules None", because print()
    # returns None.
    print('rules', m)

  0%|          | 0/3 [04:52<?, ?it/s]


ValueError: attempt to get argmax of an empty sequence