- Make fastai work and commit to GitHub
- Make the learner work generically
- Add use cases for hyperparameters (HPs) with fastai
- Work on the HPO phase (callbacks, etc.)
- Work on recipes
- Work on constraints

In [1]:
#default_exp tabular_api

In [2]:
#export
from hypster.oo_hp import *
from hypster.hypster_prepare import *

In [3]:
import fastai2
from fastai2.tabular.all import *
from fastai2.metrics import *

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from copy import deepcopy

In [6]:
import optuna

In [7]:
# Random seed; passed as `random_state` to the train/test split.
SEED = 42

# Read Data

In [8]:
# Resolve the ADULT_SAMPLE dataset to a local path and list what it contains.
path = untar_data(URLs.ADULT_SAMPLE)
path.ls()

(#3) [Path('C:/Users/user/.fastai/data/adult_sample/adult.csv'),Path('C:/Users/user/.fastai/data/adult_sample/export.pkl'),Path('C:/Users/user/.fastai/data/adult_sample/models')]

In [9]:
# Load adult.csv from the dataset path and preview the first rows.
df = pd.read_csv(path/'adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [10]:
# Keep a 30% subsample of the rows (presumably to keep experiments fast).
# The draw is seeded so a fresh Restart & Run All selects the same rows.
# NOTE: this overwrites `df`, so re-running the cell alone shrinks it again.
df = df.sample(frac=0.3, random_state=SEED)

In [11]:
# Columns handed to TabularPandas as categorical features.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]

In [12]:
# Columns handed to TabularPandas as continuous features.
cont_names = [
    'age',
    'fnlwgt',
    'education-num',
]

In [13]:
# Target column: used to stratify the train/test split and as `y_names`.
dep_var = "salary"

In [14]:
# Seeded hold-out split, stratified on the target column.
# test_size=0.6 sends 60% of the rows to `test_df`.
train_df, test_df = train_test_split(
    df,
    test_size=0.6,
    random_state=SEED,
    stratify=df[dep_var],
)

# Preprocessing

In [15]:
# Categorify step; first entry of `procs`.
cat = Categorify()

### Fill Missing

In [16]:
# Rebind FillMissing to its hypster `prepare`-wrapped version so that its
# arguments can be Hp* objects (see the HpBool passed in the next cell).
# NOTE(review): this shadows the fastai name; re-running the cell hands the
# already-wrapped object to `prepare` again — restart the kernel before re-running.
FillMissing = prepare(FillMissing)

In [17]:
# Missing-value step whose `add_col` flag is the boolean hyperparameter "missing_col".
imp = FillMissing(add_col=HpBool("missing_col"))

In [18]:
# Normalize step; last entry of `procs`.
norm = Normalize()

In [19]:
# Preprocessing steps, in the order they are handed to TabularPandas.
procs = [cat, imp, norm]

# DataLoaders

In [20]:
# Rebind TabularPandas to its hypster `prepare`-wrapped version.
# NOTE(review): this shadows the fastai name; re-running the cell hands the
# already-wrapped object to `prepare` again — restart the kernel before re-running.
TabularPandas = prepare(TabularPandas)

In [21]:
# Tabular dataset definition built from the training frame.
# The train/valid split is seeded so it is identical on every run.
# NOTE(review): the recorded optimisation run in this notebook fails on its
# second trial with KeyError "['education-num_na'] not in index". One possible
# cause is that `cat_names` / `train_df` are shared between trials and altered
# by a trial in which "missing_col" is sampled True — TODO confirm, and pass
# copies here if so.
to = TabularPandas(train_df,
                   y_block = CategoryBlock(),
                   y_names = dep_var,
                   splits = RandomSplitter(seed=SEED)(range_of(train_df)),
                   cat_names = cat_names,
                   cont_names = cont_names,
                   procs = procs)

In [22]:
# Tunable variant kept for reference: batch size as a power of two whose
# exponent comes from HpInt("batch_size", 5, 9).
#dls = to.dataloaders(batch_size=2 ** HpInt("batch_size", 5, 9))
# A fixed batch size is used instead.
dls = to.dataloaders(batch_size=32)

# Learner

In [23]:
# Two callbacks watching the same metric name: one tracks it, the other
# lowers the learning rate (patience=1).
tracker_cb = TrackerCallback(monitor="roc_auc_score")
reduce_lr_cb = ReduceLROnPlateau("roc_auc_score", patience=1)
cbs = [tracker_cb, reduce_lr_cb]

In [24]:
# Float hyperparameter "start_mom" (0.85 and 0.99 are presumably the sampling
# bounds — confirm against HpFloat); reused for the first and last `moms` entries.
start_mom = HpFloat("start_mom", 0.85, 0.99)

In [25]:
# Rebind tabular_learner to its hypster `prepare`-wrapped version.
# NOTE(review): this shadows the fastai name; re-running the cell hands the
# already-wrapped object to `prepare` again — restart the kernel before re-running.
tabular_learner = prepare(tabular_learner)

In [26]:
# Learner definition; it is materialised per trial via `.sample(trial)` in
# `run_learner`. Arguments given as Hp* objects form the search space.
learn = tabular_learner(dls,
                        metrics=RocAuc(),
                        # optimizer is picked from these three candidates
                        opt_func=HpCategorical("optimizer", [Adam, SGD, QHAdam]),
                        # NOTE(review): presumably 1-4 layers with sizes 50..300 in steps of 50 — confirm against HpVarLenList / HpInt
                        layers=HpVarLenList("layers", 1, 4, HpInt("layer_size", 50, 300, 50), same_value=False),
                        cbs=cbs,
                        # middle momentum is kept 0.1 below the shared start/end value
                        moms=(start_mom, start_mom-0.1, start_mom), 
                        #wd_bn_bias=HpBool("wd_bn_bias"),
                        )

# Optuna

In [27]:
def run_learner(learner, n_trials=5, timeout=600):
    """Run an Optuna study that maximises the value tracked by the learner's TrackerCallback.

    Parameters
    ----------
    learner
        Object exposing ``sample(trial)``. The sampled result must provide
        ``fit_one_cycle`` and a ``cbs`` iterable containing a ``TrackerCallback``.
    n_trials : int
        Number of trials passed to ``study.optimize``.
    timeout : int or None
        Time budget passed to ``study.optimize``; defaults to the previously
        hard-coded 600.

    Returns
    -------
    The Optuna study after optimisation.

    Raises
    ------
    ValueError
        Inside a trial, if the sampled learner carries no ``TrackerCallback``.
    """
    def objective(trial):
        sampled = learner.sample(trial)
        sampled.fit_one_cycle(2)
        # Find the tracker by type rather than by a fixed position in `cbs`,
        # so adding or removing callbacks does not silently change the objective.
        tracker = next((cb for cb in sampled.cbs if isinstance(cb, TrackerCallback)), None)
        if tracker is None:
            raise ValueError("sampled learner has no TrackerCallback to read `best` from")
        return tracker.best

    # 0 is the numeric value of logging.NOTSET.
    # NOTE(review): confirm this is the intended verbosity level.
    optuna.logging.set_verbosity(0)
    pruner = optuna.pruners.NopPruner()
    study = optuna.create_study(direction="maximize", pruner=pruner)
    study.optimize(objective, n_trials=n_trials, timeout=timeout)
    return study

In [28]:
# Run 3 trials. In the recorded output below, the run stops with
# KeyError "['education-num_na'] not in index" raised while sampling trial#1,
# and the cells after this one were not executed.
study = run_learner(learn, 3)

epoch,train_loss,valid_loss,roc_auc_score,time
0,0.419568,0.366763,0.740776,00:01
1,0.376765,0.348781,0.744736,00:01


[W 2020-05-08 14:01:54,414] Setting status of trial#1 as TrialState.FAIL because of the following error: KeyError("['education-num_na'] not in index")
Traceback (most recent call last):
  File "C:\Users\user\Anaconda3\lib\site-packages\optuna\study.py", line 677, in _run_trial
    result = func(trial)
  File "<ipython-input-27-20deadc4ebdf>", line 7, in __call__
    learner = learner.sample(trial)
  File "c:\python_workspace\hypster-v2\hypster\hypster_prepare.py", line 34, in sample
    self.sampled_args   = [sample_hp(arg, trial) for arg in self.args]
  File "c:\python_workspace\hypster-v2\hypster\hypster_prepare.py", line 34, in <listcomp>
    self.sampled_args   = [sample_hp(arg, trial) for arg in self.args]
  File "c:\python_workspace\hypster-v2\hypster\oo_hp.py", line 271, in sample_hp
    return hp.sample(trial)
  File "c:\python_workspace\hypster-v2\hypster\hypster_prepare.py", line 30, in sample
    base_object = self.base_call.sample(trial)
  File "c:\python_workspace\hypster-

KeyError: "['education-num_na'] not in index"

In [None]:
# Report how many trials the study holds.
print("Number of finished trials: {}".format(len(study.trials)))

In [None]:
# Report the objective value of the study's best trial.
# `trial` is reused by the next cell to print its parameters.
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))

In [None]:
# Print each parameter of `trial` (set in the previous cell) as "name: value".
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Display the study's trials as a dataframe.
study.trials_dataframe()