In [None]:
# Set the inline plotting backend's figure format to 'retina'.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Load IPython's autoreload extension.
%load_ext autoreload

# Mode 1: before running a cell, reload only the modules registered via %aimport.
%autoreload 1
%aimport ds_tutorial.datasets
%aimport ds_tutorial.transformers

In [None]:
import numpy as np
import pandas as pd

import pickle

from pathlib import Path

# Load data

In [None]:
# Locations of the pickled Reuters corpus and of the directory later used as cache.
data_root = Path.home() / "data" / "tmp"
reuters_dir = data_root / "reuters21578"
reuters_corpus_path = reuters_dir / "corpus.pkl"

# NOTE: unpickling can execute arbitrary code, so only load a corpus file you trust.
# The context manager closes the file handle as soon as the corpus has been read.
with open(reuters_corpus_path, "rb") as corpus_file:
    reuters = pickle.load(corpus_file)
top_ten_ids, top_ten_names = reuters.top_n(n=10)

cache_dir = reuters_dir / "cache"

In [None]:
# Map each of the 90 ids returned by top_n to the name at the same position.
cat_ids, cat_names = reuters.top_n(n=90)
label_lookup = dict(zip(cat_ids, cat_names))

In [None]:
# Spot check: show the category name stored for label id 6.
label_lookup[6]

'iron-steel'

In [None]:
# Inverted view of reuters.topics: its values become the keys and vice versa.
topic_lookup = {value: key for key, value in reuters.topics.items()}

## Build dataframe

In [None]:
# Build the working dataframe. Note that this rebinds top_ten_ids, which was
# first assigned from reuters.top_n(n=10) when the corpus was loaded.
df, top_ten_ids, train_labels, test_labels = reuters.build_dataframe()

In [None]:
# Preview the first rows of the dataframe.
df.head()

Unnamed: 0,modapte,category,label,date,title,dateline,body,newid,wd_name
0,train,"[interest, retail, ipi]","[0, 1, 2]",1987-03-11 18:14:49,U.S. ECONOMIC DATA KEY TO DEBT FUTURES OUTLOOK,"CHICAGO, March 11 -",U.S. economic data this week could be\nthe key...,4005,Wednesday
1,train,[earn],[3],1987-03-11 18:36:05,BANK OF BRITISH COLUMBIA 1ST QTR JAN 31 NET,"VANCOUVER, British Columbia, March 11 -\n",Oper shr loss two cts vs profit three cts\n ...,4012,Wednesday
2,train,[earn],[3],1987-03-11 18:38:02,RESTAURANT ASSOCIATES INC <RA> 4TH QTR JAN 3,"NEW YORK, March 11 -\n",Shr 25 cts vs 36 cts\n Net 1.4 mln vs 1.4 m...,4014,Wednesday
3,train,[earn],[3],1987-03-11 18:41:59,MICHIGAN GENERAL CORP <MGL> 4TH QTR,"SADDLE BROOK, N.J., March 11 -\n",Shr loss 1.02 dlrs vs 1.01 dlr\n Net loss 1...,4015,Wednesday
4,train,"[crude, nat-gas, iron-steel]","[4, 5, 6]",1987-03-11 18:45:36,"USX <X> PROVED OIL, GAS RESERVES FALL IN 1986","NEW YORK, March 11 -",USX Corp said proved reserves of oil\nand natu...,4016,Wednesday


# Build feature extraction pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.multiclass import OneVsRestClassifier

from sklearn.feature_selection import SelectKBest, chi2

import xgboost as xgb
import lightgbm as lgb

In [None]:
from ds_tutorial.transformers import TextFromColumns, TextStats, ColumnSelector, TextFromColumns2

In [None]:
# Split the dataframe on its "modapte" column.
df_train = df.query("modapte == 'train'")
df_test = df.query("modapte == 'test'")

# Fit the binarizer once on the training labels and reuse it for the test split,
# so both indicator matrices have the same columns in the same order. A second
# binarizer fitted on the test labels would derive its columns from the test
# labels alone, which may not line up with what the classifier was trained on.
label_binarizer = MultiLabelBinarizer()
y_train = label_binarizer.fit_transform(df_train.label.values)
y_test = label_binarizer.transform(df_test.label.values)

In [None]:
# Three feature branches run side by side: statistics of the title, statistics
# of the body, and TF-IDF over the text produced by TextFromColumns2.
# (Scaling the statistics with StandardScaler is currently switched off.)
text_features = FeatureUnion(
    transformer_list=[
        (
            "title_stats",
            Pipeline(steps=[
                ("column", ColumnSelector("title")),
                ("stats", TextStats()),
            ]),
        ),
        (
            "body_stats",
            Pipeline(steps=[
                ("column", ColumnSelector("body")),
                ("stats", TextStats()),
            ]),
        ),
        (
            "combined_text",
            Pipeline(steps=[
                ("column", TextFromColumns2()),
                ("tfidf", TfidfVectorizer()),
            ]),
        ),
    ],
    n_jobs=1,
)

# Alternatives that were tried for the final steps and are disabled for now:
# SelectKBest(k=20000) as a "feature_selection" step, and as "clf" a
# OneVsRestClassifier around LinearSVC(C=1.5), RandomForestClassifier() or
# xgb.XGBClassifier().
pipeline = Pipeline(
    steps=[
        ("union", text_features),
        ("clf", OneVsRestClassifier(LogisticRegression(C=100))),
    ],
    memory=str(cache_dir),
)

In [None]:
# Train on the training split, predict the test split and print the report
# restricted to the top-ten label ids.
pipeline.fit(df_train, y_train)
y_pred = pipeline.predict(df_test)
print(
    classification_report(
        y_test,
        y_pred,
        labels=top_ten_ids,
        target_names=top_ten_names,
        digits=3,
    )
)



              precision    recall  f1-score   support

        earn      0.972     0.982     0.977      1087
         acq      0.975     0.929     0.952       719
    money-fx      0.786     0.737     0.761       179
       grain      0.969     0.826     0.891       149
       crude      0.911     0.815     0.860       189
       trade      0.835     0.692     0.757       117
    interest      0.890     0.679     0.771       131
        ship      0.941     0.539     0.686        89
       wheat      0.875     0.789     0.830        71
        corn      0.938     0.804     0.865        56

   micro avg      0.945     0.884     0.913      2787
   macro avg      0.909     0.779     0.835      2787
weighted avg      0.943     0.884     0.910      2787
 samples avg      0.759     0.758     0.755      2787



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [None]:
#pipeline.fit(df_train, y_train)
#y_pred = pipeline.predict(df_test)
#print(classification_report(y_test, y_pred, target_names=top_ten_names, labels=top_ten_ids, digits=3))

# Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

## Linear models

In [None]:
# Search space: two one-vs-rest linear classifiers for the "clf" step, each
# tried with C=1 and C=100 on the wrapped estimator.
param_grid = dict(
    clf=[
        OneVsRestClassifier(LinearSVC()),
        OneVsRestClassifier(LogisticRegression()),
    ],
    clf__estimator__C=[1, 100],
)

In [None]:
# Exhaustive search over param_grid, scored with micro-averaged F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1_micro",
    verbose=10,
)
grid_search.fit(df_train, y_train)
print(grid_search.best_estimator_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] clf=OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=1 


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


[CV]  clf=OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=1, score=0.821662642290445, total=  39.8s
[CV] clf=OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   41.8s remaining:    0.0s
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))




[CV]  clf=OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=1, score=0.8502010138087747, total=  38.5s
[CV] clf=OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


[CV]  clf=OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=1, score=0.8448095071653268, total=  42.1s
[CV] clf=OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.1min remaining:    0.0s


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))




[CV]  clf=OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=100, score=0.8167285645083159, total=  23.8s
[CV] clf=OneVsRestClassifier(estimator=LinearSVC(C=100, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.5min remaining:    0.0s




  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


[CV]  clf=OneVsRestClassifier(estimator=LinearSVC(C=100, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=100, score=0.8518395757374875, total=  27.5s
[CV] clf=OneVsRestClassifier(estimator=LinearSVC(C=100, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.0min remaining:    0.0s




  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


[CV]  clf=OneVsRestClassifier(estimator=LinearSVC(C=100, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None), clf__estimator__C=100, score=0.8517412935323383, total=  25.6s
[CV] clf=OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=1 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.5min remaining:    0.0s


  str(classes[c]))


  str(classes[c]))
  str(classes[c]))


[CV]  clf=OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=1, score=0.6865033061065732, total=  25.2s
[CV] clf=OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=1 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.9min remaining:    0.0s




  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


[CV]  clf=OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=1, score=0.7244655581947743, total=  27.0s
[CV] clf=OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=1 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.4min remaining:    0.0s




  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


[CV]  clf=OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=1, score=0.7243073295342896, total=  27.3s
[CV] clf=OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=100 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.9min remaining:    0.0s


  str(classes[c]))


  str(classes[c]))
  str(classes[c]))


[CV]  clf=OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=100, score=0.8287533512064343, total=  45.9s
[CV] clf=OneVsRestClassifier(estimator=LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=100 






  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


[CV]  clf=OneVsRestClassifier(estimator=LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=100, score=0.8595409386776293, total=  50.3s
[CV] clf=OneVsRestClassifier(estimator=LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=100 






  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


[CV]  clf=OneVsRestClassifier(estimator=LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None), clf__estimator__C=100, score=0.8600170502983803, total=  50.3s


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  7.4min finished






Pipeline(memory='/Users/jochen/data/tmp/reuters21578/cache',
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('title_stats', Pipeline(memory=None,
     steps=[('column', ColumnSelector(column='title', filter_none=True)), ('stats', TextStats()), ('scaled', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('body_stats', Pipeline(me...te=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])


In [None]:
# Show only the last (name, estimator) step of the best pipeline found.
print(grid_search.best_estimator_.steps[-1])

('clf', OneVsRestClassifier(estimator=LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))


In [None]:
# Test scores of every candidate on the split stored under "split1_test_score".
print(grid_search.cv_results_["split1_test_score"])

[0.85020101 0.85183958 0.72446556 0.85954094]


In [None]:
# NOTE(review): this statement is identical to the one in the preceding cell;
# presumably kept to compare scores from a different run -- confirm or remove.
print(grid_search.cv_results_["split1_test_score"])

[0.89369927 0.88828759 0.81870893 0.88828759]


In [None]:
# Predict the test split through the fitted grid search and print the report
# restricted to the top-ten label ids.
y_pred = grid_search.predict(df_test)
print(
    classification_report(
        y_test,
        y_pred,
        labels=top_ten_ids,
        target_names=top_ten_names,
        digits=3,
    )
)

              precision    recall  f1-score   support

        earn      0.985     0.982     0.983      1087
         acq      0.973     0.946     0.959       719
    money-fx      0.760     0.709     0.734       179
       grain      0.955     0.846     0.897       149
       crude      0.881     0.820     0.849       189
       trade      0.761     0.709     0.735       117
    interest      0.860     0.656     0.745       131
        ship      0.909     0.674     0.774        89
       wheat      0.873     0.775     0.821        71
        corn      0.882     0.804     0.841        56

   micro avg      0.939     0.891     0.914      2787
   macro avg      0.884     0.792     0.834      2787
weighted avg      0.936     0.891     0.912      2787
 samples avg      0.764     0.764     0.760      2787



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [None]:
#grid_search.cv_results_

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Pipeline for the gradient-boosted model: scaled title/body statistics plus the
# TF-IDF matrix reduced to 300 components by TruncatedSVD, fed into one
# XGBClassifier per label.
pipeline = Pipeline(
    memory=str(cache_dir),
    steps=[
        ("union", FeatureUnion(n_jobs=1, transformer_list=[
            ("title_stats", Pipeline([
                ("column", ColumnSelector("title")),
                ("stats", TextStats()),
                ("scaled", StandardScaler()),
            ])),
            ("body_stats", Pipeline([
                ("column", ColumnSelector("body")),
                ("stats", TextStats()),
                ("scaled", StandardScaler()),
            ])),
            ("combined_text", Pipeline([
                ("column", TextFromColumns2()),
                ("tfidf", TfidfVectorizer()),
                # Fixed seed so the reduced representation is reproducible.
                ("svd", TruncatedSVD(n_components=300, random_state=2018))
            ])),
        ])),
        # No early_stopping_rounds here: early stopping needs a validation set
        # passed to fit(), and pipeline.fit(df_train, y_train) never provides
        # one, so the setting was either ignored or rejected at fit time.
        # n_jobs replaces the deprecated nthread alias for the thread count.
        ("clf", OneVsRestClassifier(xgb.XGBClassifier(n_jobs=4)))
])

In [None]:
%%time
pipeline.fit(df_train, y_train)
y_pred = pipeline.predict(df_test)
print(classification_report(y_test, y_pred, target_names=top_ten_names, labels=top_ten_ids, digits=3))

              precision    recall  f1-score   support

        earn      0.982     0.980     0.981      1087
         acq      0.956     0.917     0.936       719
    money-fx      0.744     0.665     0.702       179
       grain      0.936     0.691     0.795       149
       crude      0.882     0.794     0.836       189
       trade      0.848     0.667     0.746       117
    interest      0.873     0.473     0.614       131
        ship      0.933     0.315     0.471        89
       wheat      0.940     0.662     0.777        71
        corn      0.944     0.607     0.739        56

   micro avg      0.941     0.841     0.888      2787
   macro avg      0.904     0.677     0.760      2787
weighted avg      0.937     0.841     0.878      2787
 samples avg      0.737     0.730     0.731      2787

CPU times: user 18min 49s, sys: 4.12 s, total: 18min 53s
Wall time: 4min 46s


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [None]:
# Show the last (name, estimator) step of the pipeline.
print(pipeline.steps[-1])

('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=10, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1))


In [None]:
# Hyper-parameters of the wrapped XGBClassifier; only learning_rate and gamma
# have more than one value, so the grid expands to four candidates.
xgb_params = {
    # n_jobs replaces the deprecated nthread alias for the thread count.
    "n_jobs": [4],
    # OneVsRestClassifier trains one binary problem per label, so each booster
    # needs a binary objective; "multi:softprob" expects a multi-class target.
    "objective": ["binary:logistic"],
    "learning_rate": [0.05, 0.001],
    "max_depth": [10],
    "random_state": [2018],
    "gamma": [0, 0.1],
    "reg_alpha": [0],
    "n_estimators": [200],
}

# early_stopping_rounds is not set: the grid search never passes a validation
# set to fit(), so early stopping could not take effect.
param_grid = {
    "clf": [OneVsRestClassifier(xgb.XGBClassifier())]
}
# Address every booster parameter through the "clf" step and its inner estimator.
param_grid.update({f"clf__estimator__{k}": v for k, v in xgb_params.items()})

In [None]:
# Display the assembled search space for inspection.
param_grid

{'clf': [OneVsRestClassifier(estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
         colsample_bytree=1, early_stopping_rounds=3, gamma=0,
         learning_rate=0.1, max_delta_step=0, max_depth=3,
         min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
         nthread=None, objective='binary:logistic', random_state=0,
         reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
         silent=True, subsample=1),
            n_jobs=None)],
 'clf__estimator__nthread': [4],
 'clf__estimator__objective': ['multi:softprob'],
 'clf__estimator__learning_rate': [0.05, 0.001],
 'clf__estimator__max_depth': [10],
 'clf__estimator__random_state': [2018],
 'clf__estimator__gamma': [0, 0.1],
 'clf__estimator__reg_alpha': [0],
 'clf__estimator__n_estimators': [200]}

In [None]:
%%time
clf = GridSearchCV(pipeline, param_grid, n_jobs=1, 
                   scoring='f1_micro',
                   verbose=2, refit=True)
clf.fit(df_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.5min remaining:    0.0s


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.3min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1,

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.9min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.8min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=10.3min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=10.8min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weigh

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=10.9min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weigh

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.3min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_wei

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.7min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_we

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.9min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_we

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=10.2min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=11.0min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos

  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 107.2min finished


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=11.6min
CPU times: user 7h 41min 3s, sys: 1min 30s, total: 7h 42min 33s
Wall time: 1h 59min 5s


In [None]:
# Predict on the test frame with the fitted search object, then print the
# per-class precision / recall / F1 table restricted to the top-ten labels.
y_pred = clf.predict(df_test)
print(
    classification_report(
        y_test,
        y_pred,
        labels=top_ten_ids,
        target_names=top_ten_names,
        digits=3,
    )
)

             precision    recall  f1-score   support

       earn      0.956     0.987     0.971      1087
        acq      0.905     0.963     0.933       710
   money-fx      0.644     0.834     0.727       145
      grain      0.421     0.381     0.400        42
      crude      0.711     0.841     0.771       164
      trade      0.672     0.826     0.741       109
   interest      0.710     0.650     0.679       117
       ship      0.600     0.592     0.596        71
      wheat      0.739     0.618     0.673        55
       corn      0.508     0.689     0.585        45

avg / total      0.853     0.906     0.877      2545



  if diff:


In [None]:
# Display the last (name, estimator) step of the best pipeline found by the
# search, i.e. the classifier together with its selected hyper-parameters.
clf.best_estimator_.steps[-1]

('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
        learning_rate=0.05, max_delta_step=0, max_depth=10,
        min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
        nthread=4, objective='multi:softprob', random_state=2018,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1))