In [1]:
#default_exp sklearn

In [2]:
#export
from hypster.oo_hp import *
from hypster.hypster_prepare import *

import fastai2
from fastai2.tabular.all import *
from fastai2.metrics import *

from sklearn.model_selection import train_test_split

from copy import deepcopy

import optuna

In [3]:
#export
SEED = 42

# Read Data

In [4]:
#export
path = untar_data(URLs.ADULT_SAMPLE)
path.ls()

(#3) [Path('C:/Users/user/.fastai/data/adult_sample/adult.csv'),Path('C:/Users/user/.fastai/data/adult_sample/export.pkl'),Path('C:/Users/user/.fastai/data/adult_sample/models')]

In [5]:
#export
df = pd.read_csv(path/'adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [6]:
#export
df = df.sample(frac=0.1)

In [7]:
#export
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
dep_var = "salary"

In [8]:
y = np.where(df[dep_var] == ">=50k", 1, 0)
df.drop(dep_var, axis=1, inplace=True)

In [9]:
#export
train_df, test_df, y_train, y_test = train_test_split(df, y, test_size=0.6, 
                                                     random_state=SEED, 
                                                     stratify=y)

# Preprocessing

In [10]:
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
24777,46,Private,163229,7th-8th,4.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States
28773,22,Private,231053,11th,7.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,70,United-States
486,44,Private,115411,Some-college,10.0,Divorced,,Not-in-family,White,Male,0,0,40,United-States
13376,25,Private,486332,HS-grad,9.0,Divorced,Other-service,Not-in-family,White,Male,0,0,40,Mexico
27583,19,Private,232392,HS-grad,9.0,Never-married,Other-service,Other-relative,White,Female,0,0,40,United-States


In [11]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

In [12]:
from sklearn.impute import SimpleImputer

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
from category_encoders import BinaryEncoder

In [16]:
RandomForestClassifier = prepare(RandomForestClassifier)
SimpleImputer = prepare(SimpleImputer)
Pipeline = prepare(Pipeline)
ColumnTransformer = prepare(ColumnTransformer)
FeatureUnion = prepare(FeatureUnion)

In [17]:
imputer = SimpleImputer(strategy=HpCategorical("imp_strategy", ["mean", "median"]))

In [18]:
cat_enc = BinaryEncoder()

In [19]:
rf = RandomForestClassifier(n_estimators=HpInt("n_trees", 10, 40, 10),
                            criterion=HpCategorical("criterion", ['gini', 'entropy']),
                            max_depth=HpToggle("max_depth_toggle", HpInt("max_depth", 5, 30, 5)),
                            max_features=HpCategorical("max_features", ['auto', 'sqrt', 'log2', None]),
                            max_leaf_nodes=HpToggle("max_leaf_nodes_tog", HpInt("max_leaf_nodes", 2, 20)),
                            min_impurity_decrease=0.0,
                            bootstrap=HpBool("bootstrap"),
                            oob_score=False,
                            n_jobs=-1,
                            random_state=SEED,
                            verbose=0,
                            warm_start=False,
                            class_weight=None,
                            ccp_alpha=0.0,
                            max_samples=None
                            )

In [20]:
cont_imp  = ColumnTransformer([("imp", imputer, cont_names)])

In [21]:
cat_enc   = ColumnTransformer([("cat_enc", cat_enc, cat_names)])

In [22]:
fe = FeatureUnion([("cat", cat_enc), ("cont", cont_imp)])

In [23]:
pipe = Pipeline([("fe", fe), ("model", rf)])

In [24]:
#export
import datetime
def run_learner(fit_method, get_metric, n_trials=5): #learner
    class Objective():
        def __init__(self, fit_method, get_metric): #learner
            #self.learner   = learner
            self.fit_method = fit_method
            self.get_metric = get_metric
            
        def __call__(self, trial): 
            #learner = self.learner.sample(trial)
            self.fit_method.sample(trial)
            res = self.get_metric.sample(trial)
            #print(self.fit_method.base_call)
            #print(self.get_metric.base_call.base_call)
            print(res)
            return res

    objective = Objective(fit_method, get_metric) #learner
    optuna.logging.set_verbosity(0)
    pruner = optuna.pruners.NopPruner()
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    study = optuna.create_study(direction="maximize", study_name = now, pruner=pruner)
    study.optimize(objective, n_trials=n_trials, n_jobs=1, timeout=600)
    return study

In [25]:
#export
study = run_learner(#learner    = learner,
                    fit_method = pipe.fit(train_df, y_train),
                    get_metric = pipe.score(test_df, y_test),
                    n_trials   = 3
                   )

0.8091095189355169
0.8147389969293757
0.8065506653019447


In [26]:
#export
print("Number of finished trials: {}".format(len(study.trials)))

Number of finished trials: 3


In [27]:
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))

Best trial:
  Value: 0.8147389969293757


In [28]:
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

  Params: 
    imp_strategy: median
    n_trees: 30
    criterion: entropy
    max_depth_toggle: True
    max_depth: 25
    max_features: auto
    max_leaf_nodes_tog: True
    max_leaf_nodes: 19
    bootstrap: False


In [29]:
#export
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,params_bootstrap,params_criterion,params_imp_strategy,params_max_depth,params_max_depth_toggle,params_max_features,params_max_leaf_nodes,params_max_leaf_nodes_tog,params_n_trees,state
0,0,0.80911,2020-05-30 17:07:24.207868,2020-05-30 17:07:24.595249,True,gini,median,,False,auto,17,True,30,COMPLETE
1,1,0.814739,2020-05-30 17:07:24.596247,2020-05-30 17:07:24.966332,False,entropy,median,25.0,True,auto,19,True,30,COMPLETE
2,2,0.806551,2020-05-30 17:07:24.967332,2020-05-30 17:07:25.338063,False,gini,median,10.0,True,log2,12,True,10,COMPLETE
