# Set up a classification experiment

In [3]:
!pip install numpy scipy pyscaffold
!pip install -U interpret

Collecting pyscaffold
[?25l  Downloading https://files.pythonhosted.org/packages/d3/3f/0ce77998683cb7967ba7d98b114b8a6a954a731b812f455dee57f1636853/PyScaffold-3.1-py3-none-any.whl (163kB)
[K     |████████████████████████████████| 174kB 2.8MB/s 
Installing collected packages: pyscaffold
Successfully installed pyscaffold-3.1
Collecting interpret
[?25l  Downloading https://files.pythonhosted.org/packages/3b/44/ddf99b04c3b61a4329bfc2aab25dfb535277a4d11b90c17b8abedf9de301/interpret-0.1.0-py3-none-any.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 2.7MB/s 
[?25hCollecting lime>=0.1.1.33 (from interpret)
[?25l  Downloading https://files.pythonhosted.org/packages/80/6b/f446bbd9d2f1b2c721266ca5070c4c0b8235623c913ed4640562062853c6/lime-0.1.1.33.tar.gz (272kB)
[K     |████████████████████████████████| 276kB 39.4MB/s 
Collecting SALib>=1.3.3 (from interpret)
[?25l  Downloading https://files.pythonhosted.org/packages/b8/80/fbaeda49507943c87c5e2fa68b987cc0da9334b3e5624d601b4017a

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Adult dataset from the UCI repository. The raw file has no header
# row, so the column names are supplied explicitly.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=[
        "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
        "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
    ])

# Every column except the last is a feature; the last column is the label.
train_cols = df.columns[0:-1]
label = df.columns[-1]
X = df[train_cols]

# Binary target: 0 for "<=50K", 1 for every other value.
# The raw values carry a leading space (" <=50K"); stripping before comparing
# keeps the encoding correct regardless of surrounding whitespace, and the
# vectorized comparison avoids a Python-level call per row.
y = (df[label].str.strip() != "<=50K").astype("int64")

# Fixed seed so the 80/20 train/test split is reproducible.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

# Explore the dataset

In [5]:
from interpret import show
from interpret.data import ClassHistogram

# Build a ClassHistogram data explanation of the training split and render it.
class_histogram = ClassHistogram()
hist = class_histogram.explain_data(X_train, y_train, name='Train Data')
show(hist)

# Train the Explainable Boosting Machine (EBM)

In [9]:
from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression, ClassificationTree, DecisionListClassifier

# Fit an Explainable Boosting Machine on the un-encoded training split,
# seeded with the same value used for the train/test split.
ebm = ExplainableBoostingClassifier(random_state=seed)
ebm.fit(X_train, y_train)  # accepts DataFrames as well as NumPy arrays

ExplainableBoostingClassifier(data_n_episodes=2000,
               early_stopping_run_length=50,
               early_stopping_tolerance=1e-05,
               feature_names=['Age', 'WorkClass', 'fnlwgt', 'Education', 'EducationNum', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry'],
               feature_step_n_inner_bags=0,
               feature_types=['continuous', 'categorical', 'continuous', 'categorical', 'continuous', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'continuous', 'continuous', 'continuous', 'categorical'],
               holdout_size=0.15, holdout_split=0.15, interactions=0,
               learning_rate=0.01, max_tree_splits=2,
               min_cases_for_splits=2, n_estimators=16, n_jobs=-2,
               random_state=1, schema=None, scoring=None,
               training_step_episodes=1)

# Global Explanations: What the model learned overall

In [0]:
# Ask the fitted EBM for its global explanation (labelled 'EBM') and render it.
ebm_global = ebm.explain_global(
    name='EBM',
)
show(ebm_global)

# Local Explanations: How an individual prediction was made

In [0]:
# Explain the EBM's predictions for the first five rows of the test split,
# passing the matching true labels alongside them.
sample_X = X_test.iloc[:5]
sample_y = y_test.iloc[:5]
ebm_local = ebm.explain_local(sample_X, sample_y, name='EBM')
show(ebm_local)

# Evaluate EBM performance

In [0]:
from interpret.perf import ROC

# ROC performance explanation for the EBM, computed from its predicted
# probabilities on the test split, then rendered.
ebm_roc = ROC(ebm.predict_proba)
ebm_perf = ebm_roc.explain_perf(X_test, y_test, name='EBM')
show(ebm_perf)

## Let's test out a few other Explainable Models

In [0]:
from interpret.glassbox import LogisticRegression, ClassificationTree

# Logistic regression and the decision tree need numeric inputs, so the
# categorical columns are one-hot encoded (dummy columns are named
# "<column>.<value>").
X_enc = pd.get_dummies(X, prefix_sep='.')
feature_names = list(X_enc.columns)

# Reuse the rows of the existing train/test split instead of splitting again:
# every model is then fitted and scored on exactly the same rows as the EBM,
# and y_train / y_test are not rebound by this cell.
X_train_enc = X_enc.loc[X_train.index]
X_test_enc = X_enc.loc[X_test.index]

# An L1 penalty needs a solver that supports it. Name liblinear explicitly:
# scikit-learn's default solver (lbfgs since 0.22) rejects penalty='l1'.
lr = LogisticRegression(
    random_state=seed,
    feature_names=feature_names,
    penalty='l1',
    solver='liblinear',
)
lr.fit(X_train_enc, y_train)

# Seed the tree too, so repeated runs of the notebook build the same model.
tree = ClassificationTree(random_state=seed)
tree.fit(X_train_enc, y_train)

## Compare performance using the Dashboard

In [0]:
# ROC performance explanations for the two models trained on encoded features,
# evaluated on the encoded test split.
lr_perf = ROC(lr.predict_proba).explain_perf(
    X_test_enc, y_test, name='Logistic Regression')
tree_perf = ROC(tree.predict_proba).explain_perf(
    X_test_enc, y_test, name='Classification Tree')

# Render each result in turn, finishing with the EBM result computed earlier.
for perf in (lr_perf, tree_perf, ebm_perf):
    show(perf)

## Glassbox: All of our models have global and local explanations

In [0]:
# Global explanations for the logistic regression and the tree.
lr_global = lr.explain_global(name='LR')
tree_global = tree.explain_global(name='Tree')

# Render them one after another, followed by the EBM's global explanation.
for global_explanation in (lr_global, tree_global, ebm_global):
    show(global_explanation)

## Dashboard: look at everything at once

In [0]:
# Passing a list to show() puts every explanation into the InterpretML
# Dashboard in a single call.
dashboard_explanations = [
    hist,
    lr_global, lr_perf,
    tree_global, tree_perf,
    ebm_global, ebm_perf,
]
show(dashboard_explanations, share_tables=True)