# Training and model selection

Perform hyperparameter optimization and model selection using the raw competition data.

In [None]:
# starting up a console attached to this kernel
# (%qtconsole is an IPython magic, so this cell only runs inside an
# IPython/Jupyter kernel, not as a plain Python script)
%qtconsole
import os

# importing base code
# The working directory is switched first so that `base` is importable from it.
# NOTE(review): the cells below use names that are never imported here
# (copy, np, pd, hp, metrics, ...); presumably the star import provides
# them -- confirm against base.py.
os.chdir('/your-path/whats-cooking/code')
from base import *

# changing to competition dir
# Relative paths used below ('./raw-data', './eng-data', './models',
# './submissions') are resolved against this directory.
os.chdir('/your-path/whats-cooking')

### CV Framework

Here we define the metrics, the CV type (number of folds and repetitions), etc.

In [None]:
# whats cooking CV framework
def wc_framework(space):
    """Cross-validate one sampled configuration and report its accuracy.

    Builds a framework from `space`, runs `n` repetitions of a stratified
    `k`-fold cross-validation on copies of the global `train` / `y_train`
    data and averages the per-fold accuracy.

    Globals: reads `train` and `y_train`; increments `eval_number`.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Keys interpreted here:
        'type' (passed to `supervised_framework` as the algorithm),
        'scaling' (removed and not applied), and the optional 'feat_sel'
        ({'alg': selector class, 'args': keyword arguments for it}).
        All remaining keys are forwarded to `supervised_framework`.
        The caller's dict is not modified.

    Returns
    -------
    dict
        'loss' (1 - mean accuracy), 'accuracy', 'acc_sd', 'status'
        (STATUS_OK), 'space' (deep copy of the input) and 'fmwk' (deep
        copy of the framework taken before cross-validation runs).
    """
    global eval_number

    # private copies of the data, so the globals are never touched
    df = copy.deepcopy(train)
    y = copy.deepcopy(y_train)

    # untouched copy of the search space for the report, plus a shallow
    # working copy so that popping keys does not alter the caller's dict
    space_copy = copy.deepcopy(space)
    params = dict(space)

    # 'scaling' is only removed from the parameters; the first
    # preprocessing slot stays empty
    params.pop('scaling', None)

    # feature selection is optional: spaces without a 'feat_sel' entry get
    # an empty second slot instead of raising KeyError
    # NOTE(review): assumes supervised_framework skips None steps, as it
    # must already do for the first slot -- confirm in base.py
    feat_sel = params.pop('feat_sel', None)
    if feat_sel is None:
        selector = None
    else:
        selector = feat_sel['alg'](**feat_sel['args'])
    preproc_steps = [None, selector]

    # framework with preprocessing and algo; whatever is left in `params`
    # goes along with the algorithm type
    algo_type = params.pop('type')
    fmwk = supervised_framework(preproc_steps, algo_type, params)
    fmwk_copy = copy.deepcopy(fmwk)

    # evaluation function choice
    acc_fn = metrics.accuracy_score

    # number of repetitions and folds
    n = 1
    k = 5

    eval_number += 1
    # single-argument print: same output under Python 2 and Python 3
    print('eval_number: %s %s' % (eval_number, space_copy))

    # repeat n times a k-fold cross-val
    res = [strat_k_fold_cross_val(k, df, y, fmwk) for _ in range(n)]

    # one accuracy value per fold of every repetition
    accuracy_list = []
    for results in res:
        for fold in results.values():
            accuracy_list.append(acc_fn(fold['gtruth'], fold['out']['preds']))

    mean_acc = np.mean(accuracy_list)
    acc_sd = np.std(accuracy_list)

    print('accuracy: %s std: %s' % (mean_acc, acc_sd))

    return {'loss': 1 - mean_acc,
            'accuracy': mean_acc,
            'acc_sd': acc_sd,
            'status': STATUS_OK,
            'space': space_copy,
            'fmwk': fmwk_copy}

In [None]:
# directory holding the engineered feature matrices
path = './eng-data/stemmed-word2'

# training features
train_df = load_sparse_csr(path + '/train.npz')

# mapping used to encode the ground truth, stored as a Python literal
with open('./raw-data/enc_dict.txt', 'r') as f:
    enc_dict = ast.literal_eval(f.read())

# ground truth: the 'cuisine' column of the raw training file, encoded
gtruth = pd.read_json('./raw-data/train.json')['cuisine'].replace(enc_dict)

# display the loaded features as the cell output
train_df

### Training

Experiments with different models

In [None]:
# working copies of features and labels; the loaded objects stay untouched
train, y_train = copy.deepcopy(train_df), copy.deepcopy(gtruth)

# smaller subset for experiments?

# dimensionality reduction
#svd_dimred = TruncatedSVD(n_components=500)
#train = svd_dimred.fit_transform(train)

### Linear SVM with SGD: single


In [None]:
# Search space for a single linear SVM trained with SGD.
# wc_framework interprets 'type', 'scaling' and 'feat_sel'; every other key
# is forwarded to supervised_framework along with the algorithm type.
sgdsvm_space = {'type': SGDClassifier,
                'loss': hp.choice('sgdsvm_loss', ['hinge']),
                'penalty': hp.choice('sgdsvm_pen', ['l2']),
                # hp.uniform takes (label, low, high): the bounds were
                # given in reverse order, now ascending
                'alpha': hp.uniform('sgdsvm_alpha', 0.001, 0.01),
                # NOTE(review): hp.quniform yields floats (5.0, 10.0, ...);
                # confirm the estimator accepts a float iteration count
                'n_iter': hp.quniform('sgdsvm_iter', 5, 25, 5),
                'l1_ratio': hp.uniform('sgdsvm_l1r', 0.01, 1),
                'eta0': hp.uniform('sgdsvm_eta0', 0.001, 0.1),
                'learning_rate': hp.choice('sgdsvm_lr', ['constant', 'optimal', 'invscaling']),
                'class_weight': hp.choice('sgdsvm_cw', ['auto', None]),
                'scaling': hp.choice('sgdsvm_scaling', [None, StandardScaler(),
                                                     MinMaxScaler(),
                                                     MinMaxScaler(feature_range=(-1, 1))]),
                # percentile 100 keeps every feature
                'feat_sel': {'alg': SelectPercentile,
                             'args': {'score_func': chi2,
                                      'percentile': 100}
                            }
                }


# support vector machines: 15 evaluations of wc_framework over the space
eval_number = 0
trials = Trials()
best_svm = optimize(wc_framework, sgdsvm_space, 15, trials)

### Linear SVM with SGD: boosted ensemble

In [None]:
# boosting the best svm:
#algo = joblib.load('./models/sub3.pkl') 
algo = best_svm['result']['fmwk'].algo

# Search space for AdaBoost on top of the best single SVM.
adsvm_space = {'type': AdaBoostClassifier,
               'base_estimator': algo,
               'algorithm': 'SAMME',
               'n_estimators': hp.choice('n_estimators_abt', [10, 50]),
               'scaling': None,
               # wc_framework reads space['feat_sel'], so a selector is
               # supplied explicitly; percentile 100 keeps every feature
               'feat_sel': {'alg': SelectPercentile,
                            'args': {'score_func': chi2,
                                     'percentile': 100}
                            }
               }

eval_number = 0
trials = Trials()
# optimise over the boosting space defined above (the SVM space was passed
# here before, which only repeated the single-SVM search)
best_adsvm = optimize(wc_framework, adsvm_space, 15, trials)

### Linear SVM with SGD: bagged ensemble

In [None]:
# bagging the best svm:
#algo = joblib.load('./models/sub3.pkl')
algo = best_svm['result']['fmwk'].algo

# Search space for a bagged ensemble of the best single SVM.
bagsvm_space = {'type': BaggingClassifier,
                'base_estimator': algo,
                'n_estimators': hp.choice('n_estimators_bg', [10, 50, 100]),
                'scaling': hp.choice('scaling_bg', [None]),
                'max_features': hp.uniform('mf_bg', 0.1, 1),
                'max_samples': hp.uniform('ms_bg', 0.1, 1),
                # wc_framework reads space['feat_sel'], so a selector is
                # supplied explicitly; percentile 100 keeps every feature
                'feat_sel': {'alg': SelectPercentile,
                             'args': {'score_func': chi2,
                                      'percentile': 100}
                             }
                }

eval_number = 0
trials = Trials()
best_bagsvm = optimize(wc_framework, bagsvm_space, 15, trials)

### Logistic Regression with SGD: single

In [None]:
# logistic regression with SGD
# NOTE(review): `sgdlog_space` is not defined in any cell of this notebook;
# presumably it comes from `base` or from a removed cell -- confirm before
# running, otherwise this raises NameError.
eval_number = 0  # restart the evaluation counter printed by wc_framework
trials = Trials()
best_logreg = optimize(wc_framework, sgdlog_space, 20, trials)

### Random forests: single

In [None]:
# dimensionality reduction
#svd_dimred = TruncatedSVD(n_components=500)
#train = svd_dimred.fit_transform(train)

# Search space for a single random forest with a fixed number of trees.
rf_space = {'type': RandomForestClassifier,
            'n_estimators': 500,
            'criterion': hp.choice('rf_crit', ['gini', 'entropy']),
            'max_features': hp.choice('rf_maxfeat', ['sqrt', 'log2', None]),
            'class_weight': hp.choice('rf_cweight', ['auto', 'subsample', None]),
            'n_jobs': -1,
            'scaling': hp.choice('rf_scaling', [None]),
            # wc_framework reads space['feat_sel'], so a selector is
            # supplied explicitly; percentile 100 keeps every feature
            # NOTE(review): chi2 needs non-negative features -- revisit if
            # the SVD reduction above is ever re-enabled
            'feat_sel': {'alg': SelectPercentile,
                         'args': {'score_func': chi2,
                                  'percentile': 100}
                         }
            }

# random forests
eval_number = 0
trials = Trials()
best_rf = optimize(wc_framework, rf_space, 10, trials)

### XGBoost: single

In [None]:
# Search space for a single XGBoost classifier: fixed number of trees,
# tuned tree/sampling parameters and a tuned feature-selection percentile.
xgbmulti_space = dict(
    type=xgb.XGBClassifier,
    n_estimators=200,
    learning_rate=hp.quniform('xgb_eta', 0.025, 0.5, 0.025),
    max_depth=hp.quniform('xgb_max_depth', 1, 31, 2),
    min_child_weight=hp.quniform('xgb_min_child_weight', 1, 6, 1),
    subsample=hp.quniform('xgb_subsample', 0.5, 1, 0.05),
    gamma=hp.quniform('xgb_gamma', 0.5, 1, 0.05),
    colsample_bytree=hp.quniform('xgb_colsample_bytree', 0.5, 1, 0.05),
    # num_class=20,
    # eval_metric='merror',
    objective=hp.choice('xgb_objective', ['multi:softprob', 'multi:softmax']),
    scaling=[None],
    feat_sel=dict(
        alg=SelectPercentile,
        args=dict(score_func=chi2,
                  percentile=hp.quniform('prcnt', 5, 100, 5))
    )
)

# fresh trial history and evaluation counter, then 20 evaluations
trials = Trials()
eval_number = 0
best_xgb = optimize(wc_framework, xgbmulti_space, 20, trials)

### To submission:

In [None]:
# load data:
# test_df is only used for its 'id' column when building the submission
test_df = pd.read_json('raw-data/test.json')
# NOTE(review): `test_features` is not defined in any cell of this notebook;
# presumably the test counterpart of the train.npz features -- confirm
# where it is loaded before running this cell.
test = test_features

# extracting best performing framework
# (the framework copy stored by wc_framework for the best XGBoost trial)
fmwk = best_xgb['result']['fmwk']

# getting predictions
predictions = fmwk.fit_predict(train, test, y_train)['preds']

# saving pipeline:
joblib.dump(fmwk, './models/sub10.pkl') 

# building submission
submit = pd.DataFrame({'id': test_df['id'], 'cuisine': predictions})

# decoding predictions
# (same encoding file that was used to encode the ground truth)
with open('./raw-data/enc_dict.txt', 'r') as f:
    enc_dict = ast.literal_eval(f.read())

# invert the encoding so that encoded values map back to the original labels,
# then fix the column order to id, cuisine
inv_map = {v: k for k, v in enc_dict.items()}
submit = submit.replace({'cuisine': inv_map}).loc[:, ['id', 'cuisine']]

# saving submission
submit.to_csv('./submissions/sub10.csv', index=False)

### Submission results

1) Something went wrong with SVD (maybe only `.transform()` should be used on the test set): ~0.68 CV; 0.23 LB (sub1) <br>
2) raw data, tf-idf, SVM, SGD: ~0.734 CV; 0.74 LB (sub2) <br>
3) raw data, tf-idf, log regression, SGD: ~0.671 CV <br>
4) raw data, counts, SVM, SGD: ~0.720 CV <br>
5) raw data, cuisinefreqs, SVM, SGD: ~0.724 CV <br>
6) stemmed data, tf-idf, SVM, SGD: not better than (2) <br>
7) stemmed data, tf-idf, SVM, SGD, 2-grams: 0.742 CV; 0.74206 LB (sub3) <br>
8) stemmed data, tf-idf, bagged SVM (50 est), SGD, 2-grams: 0.748 CV; 0.74638 LB (sub4) <br>
9) stemmed data, tf-idf, bagged SVM (50 est), SGD, 5-grams, min_df=4: 0.744 CV; <br>
10) stemmed data, tf-idf, single XGBoost (100 est), 2-grams: 0.786 CV, 0.7855 LB (sub5) <br>
11) stemmed data, tf-idf, single XGBoost (5000 est), 2-grams: 0.7857 LB (sub6) <br>
12) stemmed data, tf-idf, single XGBoost (100 est), 5-grams (min=8) : 0.781 <br>