In [None]:
from notebook_prelude import *

In [None]:
import sklearn
from sklearn import model_selection

# Number of folds for the first-level splitter (cv_first).
CV_FIRST = 3
# Number of folds for the second-level, parameter-tuning splitter (cv_second).
CV_SECOND = 3
# Default shuffle flag and seed used by get_k_fold_splitter.
SHUFFLE = True
RANDOM_STATE = 42

def get_k_fold_splitter(n_splits, shuffle = SHUFFLE, random_state = RANDOM_STATE):
    """Create a stratified k-fold cross-validation splitter.

    Args:
        n_splits: Number of folds.
        shuffle: Whether the samples are shuffled before being split into folds.
        random_state: Seed used for shuffling. Ignored when ``shuffle`` is falsy.

    Returns:
        A ``sklearn.model_selection.StratifiedKFold`` instance.
    """
    # scikit-learn >= 0.24 raises a ValueError when a random_state is given
    # while shuffle is False (the seed would have no effect), so the seed is
    # only forwarded when shuffling is actually requested.
    return model_selection.StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

# First level: separates the train/validation portion from the test set
cv_first = get_k_fold_splitter(CV_FIRST)

# Second level: folds used while tuning parameters
cv_second = get_k_fold_splitter(CV_SECOND)

In [None]:
import experiments
from sklearn.model_selection import GridSearchCV
from transformers.pipelines.classifiers import get_classifier_params

tasks = experiments.get_all_tasks()
classifier_params = get_classifier_params()

# Folds for the grid search; same fold count as the parameter-tuning splitter.
cv = get_k_fold_splitter(CV_SECOND)

# Set to False to only collect the parameter grids without fitting anything.
TEST_FIT = True

# When True, stop after the first eligible task (quick smoke test).
# Set to False to exercise one task of every task type.
FIRST_TASK_ONLY = True

# Task types that have already been processed; missing keys default to False.
done = collections.defaultdict(bool)

# Fitted grid searches keyed by task, and the parameter grid used per task type.
results = {}
all_params = {}
for task in tasks:
    assert task
    # Only one task per type is processed; 'dummy' tasks and 'v1' variants are skipped.
    if done[task.type]: continue
    if task.type == 'dummy': continue
    if 'v1' in task.name: continue
    print('STARTING {:30} ({})'.format(task.type, task.name))

    X, Y, estimator, params = task.fn()

    # Task-specific entries override the shared classifier parameters.
    params = dict(classifier_params, **params)

    # Remove "voided" params
    params = {k: v for k, v in params.items() if v is not None}

    all_params[task.type] = params

    if TEST_FIT:
        try:
            gscv = GridSearchCV(estimator=estimator, param_grid=params, n_jobs=2, cv = cv, verbose = 2, scoring = 'f1_macro')
            result = gscv.fit(X, Y)
            results[task] = result
            print(result)
        except Exception as e:
            # Best effort: report which task failed and why, then carry on.
            print('FAILED {} ({}): {!r}'.format(task.type, task.name, e))

    # Bookkeeping must run before any early exit, otherwise it is unreachable.
    print('FINISHED', task.type)
    done[task.type] = True

    if FIRST_TASK_ONLY:
        break

In [None]:
# One line per fitted task: the task, its best CV score and the winning parameters.
for task in results:
    result = results[task]
    print(*(task, result.best_score_, result.best_params_))

In [None]:
def test(a = 2, b = 3):
    """Scratch helper: print the local namespace of the current call.

    The printed dict holds the arguments ``a`` and ``b`` and the local ``c``.
    """
    c = 'abc'  # not used otherwise; it only shows up in the locals() output
    print(locals())

# Call with the default arguments.
test()

In [None]:
from transformers.pipelines import pipeline_helper
# For every task type: report the size of its parameter grid, followed by
# one line per parameter with its candidate values.
for task_type in all_params:
    params = all_params[task_type]
    params_ = sklearn.model_selection.ParameterGrid(params)
    n_combinations = len(params_)
    print('{:24} #Params: {:4}'.format(task_type, n_combinations))

    params_clean = pipeline_helper.remove_complex_types(params)
    for k, v in params_clean.items():
        row = '\t{:90} {}'.format(k, v)
        print(row)

    # Blank lines between task types
    print('\n'*2)

In [None]:
# Number of tasks per task type, ordered by type name.
task_type_counts = pd.DataFrame(tasks)['type'].value_counts().sort_index()
task_type_counts.to_frame()