In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Save dataset splits as CSV files (for upload to Hugging Face)

In [4]:
from sklearn.model_selection import train_test_split
import imodels
import pandas as pd


# Fetch the 'compas_two_year_clean' dataset as features, labels, and column names.
X, y, feature_names = imodels.get_clean_dataset('compas_two_year_clean')

# Assemble a single table, storing the labels in the `is_recid` column.
df = pd.DataFrame(X, columns=feature_names).assign(is_recid=y)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, random_state=42, test_size=0.2)

# Write each split to its own CSV file, omitting the index column.
test.to_csv('test.csv', index=False)
train.to_csv('train.csv', index=False)

# fit sample model

In [10]:
from datasets import load_dataset
import imodels
import numpy as np
from sklearn.model_selection import GridSearchCV
import joblib

# load data
dataset = load_dataset("imodels/compas-recidivism")
df = pd.DataFrame(dataset['train'])
X_train = df.drop(columns=['is_recid'])
y_train = df['is_recid'].values

df_test = pd.DataFrame(dataset['test'])
X_test = df.drop(columns=['is_recid'])
y_test = df['is_recid'].values

Using custom data configuration imodels--compas-recidivism-73f759ec546906e5
Reusing dataset csv (/Users/chandan/.cache/huggingface/datasets/imodels___csv/imodels--compas-recidivism-73f759ec546906e5/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/2 [00:00<?, ?it/s]

**fit single model**

In [11]:
# Construct a FIGS classifier with max_rules=5, fit it on the training split,
# and print the fitted model's text representation.
m = imodels.FIGSClassifier(max_rules=5)
m.fit(X_train, y_train)
print(m)

> ------------------------------
> FIGS-Fast Interpretable Greedy-Tree Sums:
> 	Predictions are made by summing the "Val" reached by traversing each tree
> ------------------------------
priors_count <= 2.500 (Tree #0 root)
	priors_count <= 0.500 (split)
		Val: 0.288 (leaf)
		Val: 0.440 (leaf)
	priors_count <= 6.500 (split)
		Val: 0.619 (leaf)
		Val: 0.804 (leaf)

	+
age <= 27.500 (Tree #1 root)
	age <= 22.500 (split)
		Val: 0.256 (leaf)
		Val: 0.076 (leaf)
	Val: -0.069 (leaf)



In [12]:
# Accuracy = fraction of rows whose predicted label equals the true label.
single_model_hits = m.predict(X_test) == y_test
print('accuracy', np.mean(single_model_hits))

accuracy 0.6813854567551144


**fit gridsearch model**

In [39]:
# Seed NumPy's global RNG before running the search.
np.random.seed(13)

# Candidate `max_rules` values, compared with 3-fold cross-validation.
params = {'max_rules': [3, 4, 5, 7, 10]}
grid = GridSearchCV(estimator=m, cv=3, param_grid=params)
grid.fit(X_train, y_train)

# Report the search's score on X_test / y_test and the selected setting.
print(f"score = {grid.score(X_test, y_test):3.2f}")
print('best params', grid.best_params_)

score = 0.68
best params {'max_rules': 4}


In [40]:
# Persist the best estimator found by the grid search to 'sklearn_model.joblib'.
joblib.dump(grid.best_estimator_, 'sklearn_model.joblib')

['sklearn_model.joblib']

In [41]:
# Round-trip check: reload the saved estimator and score its predictions.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
clf = joblib.load('sklearn_model.joblib')
reloaded_hits = clf.predict(X_test) == y_test
print('accuracy', np.mean(reloaded_hits))

accuracy 0.6759165485112416


**prepare example**

In [35]:
# Build a `widget:` / `structuredData:` text snippet listing, for every
# feature column, that feature's value in the first few rows of X_test,
# and write it to 'widget.txt'.
x = X_test.iloc[0]
ind = x.index  # feature names, in column order

# Extract each example row once instead of once per feature, and cap the
# count so a frame with fewer than 5 rows does not raise IndexError.
n_examples = min(5, len(X_test))
example_rows = [X_test.iloc[j] for j in range(n_examples)]

lines = ['widget:', '  structuredData:']
for i in range(x.shape[0]):
    lines.append('    ' + ind[i] + ':')
    for row in example_rows:
        # Explicit positional access: integer keys passed to `row[i]` on a
        # string-labelled Series are deprecated in recent pandas releases.
        lines.append('      - ' + str(row.iloc[i]))

# Every line, including the last one, is newline-terminated.
s = '\n'.join(lines) + '\n'
with open('widget.txt', 'w') as f:
    f.write(s)