This notebook shows examples of fitting different models to classification/regression datasets. We start by loading some classifiers/regressors from `imodels`.

In [106]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree, DecisionTreeClassifier
from sklearn import metrics
import sklearn.preprocessing
import imodels
from imodels.util.data_util import DSET_CLASSIFICATION_KWARGS

# Seed NumPy's global RNG up front.
np.random.seed(13)

# Show the dataset names registered in imodels' classification table
# (imodels itself is installable with: `pip install imodels`).
available_datasets = list(DSET_CLASSIFICATION_KWARGS.keys())
print(available_datasets)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
['pima_diabetes', 'sonar', 'heart', 'diabetes', 'breast_cancer_recurrence', 'breast_cancer_wisconsin', 'credit_g', 'juvenile', 'compas', 'fico', 'readmission', 'adult', 'csi_pecarn', 'iai_pecarn', 'tbi_pecarn']


In [108]:
# Load the "pima_diabetes" dataset as (features, labels, feature names).
X, y, feature_names = imodels.get_clean_dataset("pima_diabetes")

print("shapes", X.shape, y.shape, "nunique", np.unique(y).size)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize the features: the scaler is fit on the training split only and
# the same fitted transform is then applied to the test split.
# It is kept under its own name rather than `m` (which holds the model below)
# so the fitted scaler stays reachable after this cell.
scaler = sklearn.preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# `m` is the fitted classifier; subsequent cells read the search via `m.est_`.
m = imodels.AutoInterpretableClassifier()
m.fit(X_train, y_train, cv=3)

# Report the winning configuration found by the search stored on `m.est_`.
print("best params", m.est_.best_params_)
print("best score", m.est_.best_score_)
print("best estimator", m.est_.best_estimator_)
print("best estimator params", m.est_.best_estimator_.get_params())

fetching 40715 from openml
shapes (768, 8) (768,) nunique 2


  warn(


best params {'est': LogisticRegression(C=0.1, l1_ratio=0.5, penalty='elasticnet', solver='saga'), 'est__C': 0.1, 'est__l1_ratio': 0.5}
best score 0.8267026097959889
best estimator Pipeline(steps=[('est',
                 LogisticRegression(C=0.1, l1_ratio=0.5, penalty='elasticnet',
                                    solver='saga'))])
best estimator params {'memory': None, 'steps': [('est', LogisticRegression(C=0.1, l1_ratio=0.5, penalty='elasticnet', solver='saga'))], 'verbose': False, 'est': LogisticRegression(C=0.1, l1_ratio=0.5, penalty='elasticnet', solver='saga'), 'est__C': 0.1, 'est__class_weight': None, 'est__dual': False, 'est__fit_intercept': True, 'est__intercept_scaling': 1, 'est__l1_ratio': 0.5, 'est__max_iter': 100, 'est__multi_class': 'auto', 'est__n_jobs': None, 'est__penalty': 'elasticnet', 'est__random_state': None, 'est__solver': 'saga', 'est__tol': 0.0001, 'est__verbose': 0, 'est__warm_start': False}


In [104]:
# Turn the search's per-configuration results into a table, best rank first.
cv_table = pd.DataFrame(m.est_.cv_results_).sort_values("rank_test_score")

# Headline columns go first; every other column keeps its relative order.
first_cols = ["rank_test_score", "mean_test_score", "std_test_score"]
trailing_cols = [col for col in cv_table.columns if col not in first_cols]
ordered_cols = first_cols + trailing_cols

# Drop every column whose name contains "std_" (this also removes
# "std_test_score", even though it was moved to the front above).
visible_cols = [col for col in ordered_cols if "std_" not in col]

df = cv_table[ordered_cols].round(3)[visible_cols]
df

Unnamed: 0,rank_test_score,mean_test_score,mean_fit_time,mean_score_time,param_est,param_est__max_leaf_nodes,param_est__C,param_est__l1_ratio,param_est__max_rules,param_est__n_estimators,param_est__n_boosting_rounds,params,split0_test_score,split1_test_score,split2_test_score
4,1,0.827,0.002,0.001,"LogisticRegression(C=0.1, l1_ratio=0.5, penalt...",,0.1,0.5,,,,"{'est': LogisticRegression(C=0.1, l1_ratio=0.5...",0.813,0.853,0.815
3,2,0.824,0.002,0.001,"LogisticRegression(C=0.1, l1_ratio=0.5, penalt...",,0.1,0.0,,,,"{'est': LogisticRegression(C=0.1, l1_ratio=0.5...",0.814,0.852,0.806
5,3,0.823,0.002,0.001,"LogisticRegression(C=0.1, l1_ratio=0.5, penalt...",,0.1,1.0,,,,"{'est': LogisticRegression(C=0.1, l1_ratio=0.5...",0.807,0.85,0.813
8,4,0.822,0.002,0.001,"LogisticRegression(C=0.1, l1_ratio=0.5, penalt...",,1.0,1.0,,,,"{'est': LogisticRegression(C=0.1, l1_ratio=0.5...",0.814,0.853,0.799
7,5,0.821,0.002,0.001,"LogisticRegression(C=0.1, l1_ratio=0.5, penalt...",,1.0,0.5,,,,"{'est': LogisticRegression(C=0.1, l1_ratio=0.5...",0.814,0.853,0.797
6,6,0.821,0.002,0.001,"LogisticRegression(C=0.1, l1_ratio=0.5, penalt...",,1.0,0.0,,,,"{'est': LogisticRegression(C=0.1, l1_ratio=0.5...",0.814,0.853,0.795
10,7,0.82,0.002,0.001,"LogisticRegression(C=0.1, l1_ratio=0.5, penalt...",,10.0,0.5,,,,"{'est': LogisticRegression(C=0.1, l1_ratio=0.5...",0.815,0.853,0.792
9,8,0.82,0.002,0.001,"LogisticRegression(C=0.1, l1_ratio=0.5, penalt...",,10.0,0.0,,,,"{'est': LogisticRegression(C=0.1, l1_ratio=0.5...",0.815,0.853,0.792
11,9,0.82,0.002,0.001,"LogisticRegression(C=0.1, l1_ratio=0.5, penalt...",,10.0,1.0,,,,"{'est': LogisticRegression(C=0.1, l1_ratio=0.5...",0.815,0.853,0.792
15,10,0.81,2.07,0.046,TreeGAMClassifier(n_boosting_rounds=10),,,,,,100.0,{'est': TreeGAMClassifier(n_boosting_rounds=10...,0.809,0.802,0.82


In [110]:
# Pull the model out of the best pipeline's step named "est" and show it.
best_pipeline = m.est_.best_estimator_
est = best_pipeline.named_steps["est"]
print(est)

LogisticRegression(C=0.1, l1_ratio=0.5, penalty='elasticnet', solver='saga')


In [114]:
# One row per feature name paired with the flattened fitted coefficients,
# rounded to 3 decimals.
# NOTE(review): relies on `est` exposing `coef_` with as many entries as
# `feature_names` — true for the logistic regression printed above; confirm
# if the search selects a different kind of model.
coef_by_feature = {
    "feature_names": feature_names,
    "coef": est.coef_.flatten(),
}
pd.DataFrame(coef_by_feature).round(3)

Unnamed: 0,feature_names,coef
0,Pregnant,0.295
1,plasma_glucose,0.914
2,Diastolic_blood_pressure,-0.135
3,Triceps_skin_fold_thickness,0.0
4,X_2-Hour_serum_insulin,-0.012
5,Body_mass_index,0.561
6,Diabetes_pedigree_function,0.299
7,Age,0.135
