In [1]:
%load_ext autoreload
%autoreload 2

# save dataset to huggingface

In [9]:
from sklearn.model_selection import train_test_split
import imodels
import pandas as pd


outcome = 'default.payment.next.month'
X, y, feature_names = imodels.get_clean_dataset('credit_card_clean')
df = pd.DataFrame(X, columns=feature_names)
df[outcome] = y


train, test = train_test_split(df, test_size=0.2, random_state=42)
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

# fit sample model

In [11]:
from datasets import load_dataset
import imodels
import numpy as np
from sklearn.model_selection import GridSearchCV
import joblib

# load data
outcome = 'default.payment.next.month'
dataset = load_dataset("imodels/credit-card")
df = pd.DataFrame(dataset['train'])
X_train = df.drop(columns=[outcome])
y_train = df[outcome].values

df_test = pd.DataFrame(dataset['test'])
X_test = df.drop(columns=[outcome])
y_test = df[outcome].values

Using custom data configuration imodels--credit-card-af8eb95f9964592b


Downloading and preparing dataset csv/imodels--credit-card to /Users/chandan/.cache/huggingface/datasets/imodels___csv/imodels--credit-card-af8eb95f9964592b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/4.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /Users/chandan/.cache/huggingface/datasets/imodels___csv/imodels--credit-card-af8eb95f9964592b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

**fit single model**

In [12]:
m = imodels.FIGSClassifier(max_rules=5)
m.fit(X_train, y_train)
print(m)

> ------------------------------
> FIGS-Fast Interpretable Greedy-Tree Sums:
> 	Predictions are made by summing the "Val" reached by traversing each tree
> ------------------------------
pay_0 <= 1.500 (Tree #0 root)
	pay_2 <= 1.500 (split)
		pay_amt3 <= 678.500 (split)
			bill_amt1 <= 530.000 (split)
				Val: 0.285 (leaf)
				Val: 0.183 (leaf)
			Val: 0.115 (leaf)
		Val: 0.420 (leaf)
	Val: 0.697 (leaf)

	+
pay_5 <= 1.000 (Tree #1 root)
	Val: -0.011 (leaf)
	Val: 0.102 (leaf)



In [13]:
print('accuracy', np.mean(m.predict(X_test) == y_test))

accuracy 0.8205


**fit gridsearch model**

In [39]:
np.random.seed(13)
params = {'max_rules': [3, 4, 5, 7, 10]}
grid = GridSearchCV(m, param_grid=params, cv=3)
grid.fit(X_train, y_train)

print(f"score = {grid.score(X_test, y_test):3.2f}")
print('best params', grid.best_params_)

score = 0.68
best params {'max_rules': 4}


In [40]:
joblib.dump(grid.best_estimator_, 'sklearn_model.joblib')

['sklearn_model.joblib']

In [41]:
clf = joblib.load('sklearn_model.joblib')
print('accuracy', np.mean(clf.predict(X_test) == y_test))

accuracy 0.6759165485112416


**prepare example**

In [35]:
x = X_test.iloc[0]
ind = x.index
vals = x.values
s = 'widget:\n'
s+= '  structuredData:\n'
for i in range(x.shape[0]):
    s+= '    ' + ind[i]+':\n'
    for j in range(5):
        s+= '      - ' + str(X_test.iloc[j][i]) + '\n'
with open('widget.txt', 'w') as f:
    f.write(s)