<a href="https://colab.research.google.com/github/gorkemozkaya/Data-Science-Notes/blob/master/reproducing_bugs/XGBoost_multi_label_classification_workaround_for_hummingbird_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description:

Hummingbird by defauld does not support XGBoost multi-label classifier. This notebook suggests a workaround. Since a multi-label classifier is essentially a separate binary-classifier for each class. We will extract a binary classifier for each class, and convert each one of them to Pytorch-jit separately.  

### In this example, the multi-label model is a tree-ensemble with 100 trees for each of the 5 classes.
### We are going to extract a binary classifier with 100 trees, corresponding to the first class

### Installations:

In [1]:
!pip install seaborn hummingbird_ml==0.4.8 torch==1.10.2
!pip install xgboost==1.6.0
!pip install xgboost-ray==0.1.16
!pip install hummingbird-ml==0.4.8

[31mERROR: Could not find a version that satisfies the requirement torch==1.10.2 (from versions: 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 2.0.0, 2.0.1)[0m[31m
[0m[31mERROR: No matching distribution found for torch==1.10.2[0m[31m


In [2]:
!pip freeze | grep ray
!pip freeze | grep xgboost
!pip freeze | grep hummingbird

array-record==0.4.0
ray==2.5.1
xarray==2022.12.0
xarray-einstats==0.5.1
xgboost-ray==0.1.16
xgboost==1.6.0
xgboost-ray==0.1.16
hummingbird-ml==0.4.8


### Imports and Setup

In [3]:
import os
import sys
from matplotlib import pyplot as plt


import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import hummingbird
from hummingbird.ml import convert
print(xgb.__version__)
print(hummingbird.__version__)

1.6.0
0.4.8


multi label classification

In [4]:
from sklearn.datasets import make_multilabel_classification
import numpy as np

X, y = make_multilabel_classification(
    n_samples=32, n_classes=5, n_labels=3, random_state=0
)
clf = xgb.XGBClassifier(tree_method="hist")

In [5]:
clf.fit(X, y)


### Multi-label model outputs do not match with Hummingbird

In [6]:
clf.predict_proba(X)[:10, :]

array([[0.25477213, 0.08950111, 0.95902556, 0.9738694 , 0.9901391 ],
       [0.97502327, 0.9958663 , 0.10001647, 0.9412866 , 0.00860851],
       [0.00908493, 0.94113415, 0.09247999, 0.8853672 , 0.85925454],
       [0.17950186, 0.02149026, 0.12937637, 0.11357188, 0.9932581 ],
       [0.9783397 , 0.93622774, 0.97306305, 0.99218154, 0.9539664 ],
       [0.20212294, 0.9115374 , 0.01526665, 0.7947902 , 0.00625833],
       [0.93834007, 0.8063199 , 0.89752614, 0.94601154, 0.85123926],
       [0.05156172, 0.9899775 , 0.13533044, 0.15330161, 0.01595211],
       [0.01569671, 0.9366795 , 0.02338592, 0.08757982, 0.01937465],
       [0.9899165 , 0.03310569, 0.9548131 , 0.9469307 , 0.09280243]],
      dtype=float32)

In [7]:
clf.n_classes_ = 5

In [8]:
import torch
xgb_multi_label_torch = convert(clf, torch.jit.__name__, X[0:1])
xgb_multi_label_torch.predict_proba(X)[:10, :]

array([[2.11652205e-03, 6.08567381e-04, 1.44902527e-01, 2.30733797e-01,
        6.21638656e-01],
       [1.31838098e-01, 8.13613892e-01, 3.75316042e-04, 5.41433617e-02,
        2.93253979e-05],
       [3.06348986e-04, 5.34218729e-01, 3.40504339e-03, 2.58075029e-01,
        2.03994885e-01],
       [1.47975853e-03, 1.48551117e-04, 1.00513326e-03, 8.66617018e-04,
        9.96499896e-01],
       [1.85417593e-01, 6.02666251e-02, 1.48292542e-01, 5.20951748e-01,
        8.50714743e-02],
       [1.75282881e-02, 7.12975740e-01, 1.07271434e-03, 2.67987490e-01,
        4.35757451e-04],
       [2.96159863e-01, 8.10197964e-02, 1.70452133e-01, 3.41007501e-01,
        1.11360654e-01],
       [5.48124372e-04, 9.95884836e-01, 1.57799455e-03, 1.82548759e-03,
        1.63441538e-04],
       [1.06681348e-03, 9.89588380e-01, 1.60191499e-03, 6.42122235e-03,
        1.32171984e-03],
       [7.15110481e-01, 2.49408302e-04, 1.53919190e-01, 1.29975691e-01,
        7.45151483e-04]], dtype=float32)

## Let's extract the tree-ensemble corresponding to the first class

In [9]:
clf._Booster.save_model("model.json")

In [10]:
import json
from copy import deepcopy
model_json = json.load(open("model.json"))

In [11]:
model_json_modified = deepcopy(model_json)

In [12]:
model_json_modified['learner']['learner_model_param']['num_target'] = '1'

In [13]:
model_json_modified['learner']['gradient_booster']['model']['trees'] = model_json_modified['learner']['gradient_booster']['model']['trees'][::5]

In [14]:
for i, tree in enumerate(model_json_modified['learner']['gradient_booster']['model']['trees']):
  tree["id"] = i

In [15]:
model_json_modified['learner']['gradient_booster']['model']['tree_info'] = model_json_modified['learner']['gradient_booster']['model']['tree_info'][::5]

In [16]:
model_json_modified['learner']['gradient_booster']['model']['gbtree_model_param']['num_trees'] = "100"

In [17]:
json.dump(model_json_modified, open('modified.json', 'w'))

In [18]:
bst = xgb.Booster()

In [19]:
bst.load_model('modified.json')

In [20]:
from copy import copy

In [21]:
clf_modified = copy(clf)

In [22]:
clf_modified._Booster = bst

In [23]:
clf_modified.predict_proba(X)

array([[0.7452279 , 0.25477213],
       [0.02497673, 0.97502327],
       [0.99091506, 0.00908493],
       [0.8204981 , 0.17950186],
       [0.02166033, 0.9783397 ],
       [0.7978771 , 0.20212294],
       [0.06165993, 0.93834007],
       [0.9484383 , 0.05156172],
       [0.9843033 , 0.01569671],
       [0.0100835 , 0.9899165 ],
       [0.9890338 , 0.01096616],
       [0.06005919, 0.9399408 ],
       [0.1165427 , 0.8834573 ],
       [0.14652663, 0.85347337],
       [0.8990026 , 0.10099739],
       [0.86152565, 0.13847438],
       [0.12062764, 0.87937236],
       [0.17292649, 0.8270735 ],
       [0.04956883, 0.95043117],
       [0.99713695, 0.00286302],
       [0.91014534, 0.08985464],
       [0.94596297, 0.05403705],
       [0.1191988 , 0.8808012 ],
       [0.01439768, 0.9856023 ],
       [0.16040564, 0.83959436],
       [0.04793268, 0.9520673 ],
       [0.02277887, 0.97722113],
       [0.95172095, 0.04827907],
       [0.89401376, 0.10598624],
       [0.0933488 , 0.9066512 ],
       [0.

In [28]:
clf_modified.n_classes_ = 2

In [29]:
import torch
xgb_binary_torch = convert(clf_modified, torch.jit.__name__, X[0:1])

In [30]:
xgb_binary_torch.predict_proba(X)

array([[0.7452278 , 0.25477216],
       [0.02497673, 0.97502327],
       [0.99091506, 0.00908493],
       [0.82049817, 0.17950185],
       [0.02166033, 0.9783397 ],
       [0.7978771 , 0.2021229 ],
       [0.06165987, 0.9383401 ],
       [0.9484383 , 0.05156171],
       [0.9843033 , 0.01569672],
       [0.0100835 , 0.9899165 ],
       [0.9890338 , 0.01096616],
       [0.06005919, 0.9399408 ],
       [0.11654264, 0.88345736],
       [0.14652663, 0.85347337],
       [0.8990027 , 0.10099736],
       [0.86152565, 0.13847438],
       [0.12062776, 0.87937224],
       [0.17292655, 0.82707345],
       [0.04956871, 0.9504313 ],
       [0.99713695, 0.00286302],
       [0.91014534, 0.08985466],
       [0.9459629 , 0.05403709],
       [0.1191988 , 0.8808012 ],
       [0.01439768, 0.9856023 ],
       [0.16040558, 0.8395944 ],
       [0.04793268, 0.9520673 ],
       [0.02277887, 0.97722113],
       [0.9517209 , 0.0482791 ],
       [0.8940137 , 0.10598628],
       [0.09334868, 0.9066513 ],
       [0.

## Now they match on the probability outputs corresponding to class-1

In [31]:
xgb_binary_torch.predict_proba(X)[:, [1]] # this is a binary classifier, so we have to use the second column corresponding to prob-class-true.

array([[0.25477216],
       [0.97502327],
       [0.00908493],
       [0.17950185],
       [0.9783397 ],
       [0.2021229 ],
       [0.9383401 ],
       [0.05156171],
       [0.01569672],
       [0.9899165 ],
       [0.01096616],
       [0.9399408 ],
       [0.88345736],
       [0.85347337],
       [0.10099736],
       [0.13847438],
       [0.87937224],
       [0.82707345],
       [0.9504313 ],
       [0.00286302],
       [0.08985466],
       [0.05403709],
       [0.8808012 ],
       [0.9856023 ],
       [0.8395944 ],
       [0.9520673 ],
       [0.97722113],
       [0.0482791 ],
       [0.10598628],
       [0.9066513 ],
       [0.94275355],
       [0.03731751]], dtype=float32)

In [32]:
clf.predict_proba(X)[:, [0]]

array([[0.25477213],
       [0.97502327],
       [0.00908493],
       [0.17950186],
       [0.9783397 ],
       [0.20212294],
       [0.93834007],
       [0.05156172],
       [0.01569671],
       [0.9899165 ],
       [0.01096616],
       [0.9399408 ],
       [0.8834573 ],
       [0.85347337],
       [0.10099739],
       [0.13847438],
       [0.87937236],
       [0.8270735 ],
       [0.95043117],
       [0.00286302],
       [0.08985464],
       [0.05403705],
       [0.8808012 ],
       [0.9856023 ],
       [0.83959436],
       [0.9520673 ],
       [0.97722113],
       [0.04827907],
       [0.10598624],
       [0.9066512 ],
       [0.94275355],
       [0.03731751]], dtype=float32)