In [None]:
# Use the 'retina' figure format for inline matplotlib output.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Load the autoreload extension so edits to the project modules are picked up
# without restarting the kernel.
%load_ext autoreload

# Mode 1: only modules registered with %aimport are reloaded before each cell runs.
%autoreload 1
%aimport ds_tutorial.datasets
%aimport ds_tutorial.transformers

In [None]:
import numpy as np
import pandas as pd

import pickle

from pathlib import Path

# Load data

In [None]:
# Locations of the pickled Reuters-21578 corpus and of the pipeline cache.
data_root = Path.home() / "data" / "tmp"
reuters_dir = data_root / "reuters21578"
reuters_corpus_path = reuters_dir / "corpus.pkl"

# NOTE: unpickling can execute arbitrary code -- only load a corpus file that
# was produced locally by a trusted process.
# The context manager guarantees the file handle is closed once loading ends.
with open(reuters_corpus_path, "rb") as corpus_file:
    reuters = pickle.load(corpus_file)

# Ids and names of the ten topics selected by the corpus' top_n helper.
top_ten_ids, top_ten_names = reuters.top_n(n=10)

# Directory handed to sklearn Pipeline(memory=...) further down.
cache_dir = reuters_dir / "cache"

## Build dataframe

In [None]:
# ModApte split of the corpus; `docs` keeps train documents first, then test.
train_docs, test_docs = reuters.split_modapte()
docs = train_docs + test_docs

# Labels for each split, restricted to the top-ten topic ids.
train_labels, test_labels = (
    reuters.get_labels(split_docs, set(top_ten_ids))
    for split_docs in (train_docs, test_docs)
)

In [None]:
from ds_tutorial.datasets import build_reuters_dataframe

# Assemble the document dataframe. Note that this rebinds top_ten_ids,
# train_labels and test_labels to the values returned by the helper, so the
# cell is not idempotent: re-running it feeds the already-converted values in.
df, top_ten_ids, train_labels, test_labels = build_reuters_dataframe(
    docs,
    reuters.topics,
    train_labels,
    test_labels,
    top_ten_ids,
)

In [None]:
# Preview the first rows of the assembled dataframe.
df.head()

Unnamed: 0,modapte,category,label,date,title,dateline,body,newid,wd_name
0,train,interest,0,1987-03-11 18:14:49,U.S. ECONOMIC DATA KEY TO DEBT FUTURES OUTLOOK,"CHICAGO, March 11 -",U.S. economic data this week could be\nthe key...,4005,Wednesday
1,train,earn,3,1987-03-11 18:36:05,BANK OF BRITISH COLUMBIA 1ST QTR JAN 31 NET,"VANCOUVER, British Columbia, March 11 -\n",Oper shr loss two cts vs profit three cts\n ...,4012,Wednesday
2,train,earn,3,1987-03-11 18:38:02,RESTAURANT ASSOCIATES INC <RA> 4TH QTR JAN 3,"NEW YORK, March 11 -\n",Shr 25 cts vs 36 cts\n Net 1.4 mln vs 1.4 m...,4014,Wednesday
3,train,earn,3,1987-03-11 18:41:59,MICHIGAN GENERAL CORP <MGL> 4TH QTR,"SADDLE BROOK, N.J., March 11 -\n",Shr loss 1.02 dlrs vs 1.01 dlr\n Net loss 1...,4015,Wednesday
4,train,crude,4,1987-03-11 18:45:36,"USX <X> PROVED OIL, GAS RESERVES FALL IN 1986","NEW YORK, March 11 -",USX Corp said proved reserves of oil\nand natu...,4016,Wednesday


# Build feature extraction pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [None]:
from ds_tutorial.transformers import TextFromColumns, TextStats, ColumnSelector, TextFromColumns2

In [None]:
# Separate the frame by its ModApte marker column and extract the label arrays.
df_train = df[df["modapte"] == "train"]
df_test = df[df["modapte"] == "test"]
y_train = df_train["label"].values
y_test = df_test["label"].values

In [None]:
# Baseline feature pipeline:
#   * "<column>_stats": standardized TextStats features for title and body,
#   * "combined_text":  TF-IDF over the text produced by TextFromColumns2,
# all concatenated by the FeatureUnion and fed to a logistic regression.
# Alternative classifiers tried for the "clf" step: LinearSVC(C=1.5),
# RandomForestClassifier(), xgb.XGBClassifier().
pipeline = Pipeline(
    memory=str(cache_dir),
    steps=[
        (
            "union",
            FeatureUnion(
                n_jobs=1,
                transformer_list=[
                    # One identical statistics branch per text column.
                    *[
                        (
                            f"{column}_stats",
                            Pipeline(steps=[
                                ("column", ColumnSelector(column)),
                                ("stats", TextStats()),
                                ("scaled", StandardScaler()),
                            ]),
                        )
                        for column in ("title", "body")
                    ],
                    (
                        "combined_text",
                        Pipeline(steps=[
                            ("column", TextFromColumns2()),
                            ("tfidf", TfidfVectorizer()),
                        ]),
                    ),
                ],
            ),
        ),
        (
            "clf",
            LogisticRegression(C=100, solver="liblinear", multi_class="ovr"),
        ),
    ],
)

In [None]:
# Fit on the training split, predict the test split, and report per-topic
# precision / recall / F1 for the top-ten topics.
y_pred = pipeline.fit(df_train, y_train).predict(df_test)
print(classification_report(
    y_test,
    y_pred,
    labels=top_ten_ids,
    target_names=top_ten_names,
    digits=3,
))

              precision    recall  f1-score   support

        earn      0.977     0.989     0.983      1087
         acq      0.925     0.976     0.950       710
    money-fx      0.712     0.800     0.753       145
       grain      0.583     0.500     0.538        42
       crude      0.783     0.878     0.828       164
       trade      0.699     0.872     0.776       109
    interest      0.794     0.726     0.759       117
        ship      0.672     0.577     0.621        71
       wheat      0.704     0.691     0.697        55
        corn      0.638     0.667     0.652        45

   micro avg      0.887     0.919     0.902      2545
   macro avg      0.749     0.768     0.756      2545
weighted avg      0.888     0.919     0.902      2545



# Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

## Linear models

In [None]:
# Search space for the "clf" step: two linear classifiers, each tried with two
# regularisation strengths C.
# LogisticRegression is pinned to the same solver and multi-class scheme as the
# baseline pipeline (and as the recorded run), so the comparison does not
# silently change with the installed scikit-learn's defaults.
param_grid = {
    "clf__C": [1, 100],
    "clf": [
        LinearSVC(),
        LogisticRegression(solver="liblinear", multi_class="ovr"),
    ],
}

In [None]:
# Cross-validated search over the linear classifiers, scored by micro-averaged F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1_micro",
    verbose=10,
)
grid_search.fit(df_train, y_train)
print(grid_search.best_estimator_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] clf=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=1 




[CV]  clf=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=1, score=0.8709431080565102, total=   8.6s
[CV] clf=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.9s remaining:    0.0s


[CV]  clf=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=1, score=0.8940858136838037, total=   9.2s
[CV] clf=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   20.3s remaining:    0.0s


[CV]  clf=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=1, score=0.8876755070202809, total=  10.0s
[CV] clf=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   31.6s remaining:    0.0s


[CV]  clf=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=100, score=0.8671248568155785, total=   9.2s
[CV] clf=LinearSVC(C=100, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   42.4s remaining:    0.0s


[CV]  clf=LinearSVC(C=100, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=100, score=0.885195206803247, total=  10.3s
[CV] clf=LinearSVC(C=100, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   54.0s remaining:    0.0s


[CV]  clf=LinearSVC(C=100, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0), clf__C=100, score=0.8779251170046802, total=   9.1s
[CV] clf=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=1 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.1min remaining:    0.0s


[CV]  clf=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=1, score=0.81061473844979, total=   7.2s
[CV] clf=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=1 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.2min remaining:    0.0s


[CV]  clf=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=1, score=0.818708929261693, total=   8.4s
[CV] clf=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=1 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.4min remaining:    0.0s


[CV]  clf=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=1, score=0.8213728549141965, total=   8.2s
[CV] clf=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=100 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.5min remaining:    0.0s


[CV]  clf=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=100, score=0.8652157311951126, total=  11.0s
[CV] clf=LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=100 
[CV]  clf=LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=100, score=0.8882875918051798, total=  14.3s
[CV] clf=LogisticRegression(C=100, class_weight=None, dual=

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  2.2min finished


Pipeline(memory='/Users/jochen/data/tmp/reuters21578/cache',
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('title_stats', Pipeline(memory=None,
     steps=[('column', ColumnSelector(column='title', filter_none=True)), ('stats', TextStats()), ('scaled', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('body_stats', Pipeline(me...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])


In [None]:
# Evaluate the best pipeline found by the grid search on the held-out test split.
y_pred = grid_search.predict(df_test)
print(classification_report(
    y_test,
    y_pred,
    labels=top_ten_ids,
    target_names=top_ten_names,
    digits=3,
))

             precision    recall  f1-score   support

       earn      0.979     0.989     0.984      1087
        acq      0.937     0.977     0.957       710
   money-fx      0.739     0.800     0.768       145
      grain      0.647     0.524     0.579        42
      crude      0.802     0.890     0.844       164
      trade      0.732     0.853     0.788       109
   interest      0.821     0.744     0.780       117
       ship      0.698     0.620     0.657        71
      wheat      0.745     0.745     0.745        55
       corn      0.646     0.689     0.667        45

avg / total      0.900     0.923     0.911      2545



In [None]:
#grid_search.cv_results_

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
# XGBoost variant of the feature pipeline. The title/body statistics branches
# match the linear baseline; the TF-IDF output is additionally reduced to 300
# components with a seeded TruncatedSVD before it reaches the classifier.
#
# No early stopping is configured on the classifier: XGBoost early stopping
# needs a validation `eval_set` passed to `fit`, and `pipeline.fit(X, y)`
# supplies none. A constructor-level `early_stopping_rounds` therefore either
# has no effect or, on newer XGBoost releases, makes `fit` raise.
pipeline = Pipeline(
    memory=str(cache_dir),
    steps=[
        ("union", FeatureUnion(n_jobs=1, transformer_list=[
            ("title_stats", Pipeline([
                ("column", ColumnSelector("title")),
                ("stats", TextStats()),
                ("scaled", StandardScaler()),
            ])),
            ("body_stats", Pipeline([
                ("column", ColumnSelector("body")),
                ("stats", TextStats()),
                ("scaled", StandardScaler()),
            ])),
            ("combined_text", Pipeline([
                ("column", TextFromColumns2()),
                ("tfidf", TfidfVectorizer()),
                ("svd", TruncatedSVD(n_components=300, random_state=2018))
            ])),
        ])),
        ("clf", xgb.XGBClassifier(nthread=4))
])

In [None]:
%%time
pipeline.fit(df_train, y_train)
y_pred = pipeline.predict(df_test)
print(classification_report(y_test, y_pred, target_names=top_ten_names, labels=top_ten_ids, digits=3))

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


In [None]:
# Print the final (name, estimator) step of the pipeline to inspect the
# classifier's effective parameters.
print(pipeline.steps[-1])

('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=10, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1))


In [None]:
# Values explored for the XGBoost classifier. Single-element lists pin a
# setting; learning_rate and gamma have two values each, giving 4 candidates.
xgb_params = {
    "nthread": [4],
    "objective": ["multi:softprob"],
    "learning_rate": [0.05, 0.001],
    "max_depth": [10],
    "random_state": [2018],
    "gamma": [0, 0.1],
    "reg_alpha": [0],
    "n_estimators": [200],
}

# Place an XGBClassifier in the "clf" step and address its parameters through
# the "clf__" prefix. Early stopping is deliberately not set on the estimator:
# it requires an `eval_set` at fit time, which GridSearchCV does not provide.
param_grid = {
    "clf": [xgb.XGBClassifier()]
}
param_grid.update({f"clf__{k}": v for k, v in xgb_params.items()})

In [None]:
# Display the assembled grid to check the "clf__"-prefixed parameter keys.
param_grid

{'clf': [XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
         colsample_bytree=1, early_stopping_rounds=3, gamma=0,
         learning_rate=0.1, max_delta_step=0, max_depth=3,
         min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
         nthread=None, objective='binary:logistic', random_state=0,
         reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
         silent=True, subsample=1)],
 'clf__nthread': [4],
 'clf__objective': ['multi:softprob'],
 'clf__learning_rate': [0.05, 0.001],
 'clf__max_depth': [10],
 'clf__random_state': [2018],
 'clf__gamma': [0, 0.1],
 'clf__reg_alpha': [0],
 'clf__n_estimators': [200]}

In [None]:
%%time
clf = GridSearchCV(pipeline, param_grid, n_jobs=1, 
                   scoring='f1_micro',
                   verbose=2, refit=True)
clf.fit(df_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.5min remaining:    0.0s


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.3min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1,

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.9min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.8min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=10.3min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=10.8min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weigh

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=10.9min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weigh

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.3min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_wei

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.7min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_we

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total= 6.9min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_we

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=10.2min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_

  if diff:
  if diff:


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=11.0min
[CV] clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos

  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 107.2min finished


[CV]  clf=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
       learning_rate=0.001, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=4, objective='multi:softprob', random_state=2018,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), clf__gamma=0.1, clf__learning_rate=0.001, clf__max_depth=10, clf__n_estimators=200, clf__nthread=4, clf__objective=multi:softprob, clf__random_state=2018, clf__reg_alpha=0, total=11.6min
CPU times: user 7h 41min 3s, sys: 1min 30s, total: 7h 42min 33s
Wall time: 1h 59min 5s


In [None]:
# Evaluate the refit best XGBoost pipeline on the held-out test split.
y_pred = clf.predict(df_test)
print(classification_report(
    y_test,
    y_pred,
    labels=top_ten_ids,
    target_names=top_ten_names,
    digits=3,
))

             precision    recall  f1-score   support

       earn      0.956     0.987     0.971      1087
        acq      0.905     0.963     0.933       710
   money-fx      0.644     0.834     0.727       145
      grain      0.421     0.381     0.400        42
      crude      0.711     0.841     0.771       164
      trade      0.672     0.826     0.741       109
   interest      0.710     0.650     0.679       117
       ship      0.600     0.592     0.596        71
      wheat      0.739     0.618     0.673        55
       corn      0.508     0.689     0.585        45

avg / total      0.853     0.906     0.877      2545



  if diff:


In [None]:
# Final (name, estimator) step of the best pipeline found by the grid search.
clf.best_estimator_.steps[-1]

('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, early_stopping_rounds=3, gamma=0.1,
        learning_rate=0.05, max_delta_step=0, max_depth=10,
        min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
        nthread=4, objective='multi:softprob', random_state=2018,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1))