In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.datasets import make_multilabel_classification
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

# Preparing the data

We can generate a multi-output data with a make_multilabel_classification function. The target dataset contains 20 features (x), 5 classes (y), and 10000 samples. 

We'll define them in the parameters of the function.

In [3]:
x, y = make_multilabel_classification(n_samples=10000, n_features=20,
                                      n_classes=5, random_state=88)

The generated data looks as below. There are 20 features and 5 labels in this dataset.

In [4]:
for i in range(5): 
    print(x[i]," =====> ", y[i])

[5. 4. 0. 4. 3. 0. 1. 1. 0. 3. 0. 1. 6. 0. 0. 2. 0. 1. 6. 1.]  =====>  [1 0 0 0 0]
[2. 2. 0. 1. 5. 1. 2. 0. 7. 4. 1. 0. 2. 1. 5. 2. 0. 4. 0. 6.]  =====>  [0 0 0 0 1]
[3. 4. 2. 1. 4. 5. 2. 2. 4. 1. 1. 2. 3. 5. 2. 3. 0. 4. 5. 2.]  =====>  [0 1 0 1 0]
[0. 5. 2. 3. 2. 3. 7. 4. 4. 1. 3. 0. 5. 5. 2. 1. 3. 3. 2. 3.]  =====>  [0 0 0 0 0]
[3. 6. 2. 3. 2. 0. 1. 3. 2. 4. 0. 0. 3. 4. 1. 6. 0. 5. 0. 8.]  =====>  [1 0 0 0 1]


## Next, we'll split the data into the train and test parts.

In [5]:
xtrain, xtest, ytrain, ytest=train_test_split(x, y, train_size=0.8, random_state=88)
print(len(xtest)) 

2000


## Defining the model

We'll define the model with the MultiOutputClassifier class of sklearn. As an estimator, we'll use XGBClassifier and then we'll include the estimator into the MultiOutputClassifier class.

In [6]:
kf = KFold(n_splits=5)
for fn, (trn_idx, val_idx) in enumerate(kf.split(xtrain, ytrain)):
    print (fn, (trn_idx, val_idx))

0 (array([1600, 1601, 1602, ..., 7997, 7998, 7999]), array([   0,    1,    2, ..., 1597, 1598, 1599]))
1 (array([   0,    1,    2, ..., 7997, 7998, 7999]), array([1600, 1601, 1602, ..., 3197, 3198, 3199]))
2 (array([   0,    1,    2, ..., 7997, 7998, 7999]), array([3200, 3201, 3202, ..., 4797, 4798, 4799]))
3 (array([   0,    1,    2, ..., 7997, 7998, 7999]), array([4800, 4801, 4802, ..., 6397, 6398, 6399]))
4 (array([   0,    1,    2, ..., 6397, 6398, 6399]), array([6400, 6401, 6402, ..., 7997, 7998, 7999]))


In [7]:
classifier = MultiOutputClassifier(XGBClassifier())

clf = Pipeline([('classify', classifier)
               ])

#### We'll fit the model with training data and check the training accuracy.

In [8]:
clf.fit(xtrain, ytrain)
print(clf.score(xtrain, ytrain))

0.913875


## Predicting and accuracy check

In [9]:
# We'll predict the test data.

yhat = clf.predict(xtest)

We'll check the numbers of accuracy metrics for this prediction. Remember, we have five output labels in the ytest and the yhat data, thus we need to use them accordingly.

First, we'll check the area under the ROC with the roc_auc_score function.

In [10]:
auc_y1 = roc_auc_score(ytest[:,0],yhat[:,0])
auc_y2 = roc_auc_score(ytest[:,1],yhat[:,1])
auc_y3 = roc_auc_score(ytest[:,2],yhat[:,2])
auc_y4 = roc_auc_score(ytest[:,3],yhat[:,3])
auc_y5 = roc_auc_score(ytest[:,4],yhat[:,4])

print("ROC AUC y1: %.4f, y2: %.4f, y3: %.4f, y4: %.4f, y5: %.4f" % (auc_y1, auc_y2, auc_y3, auc_y4, auc_y5))

ROC AUC y1: 0.8230, y2: 0.8025, y3: 0.8091, y4: 0.8005, y5: 0.8086


The second method is to check the confusion matrics.

In [11]:
cm_y1 = confusion_matrix(ytest[:,0],yhat[:,0])
cm_y2 = confusion_matrix(ytest[:,1],yhat[:,1])
cm_y3 = confusion_matrix(ytest[:,2],yhat[:,2])
cm_y4 = confusion_matrix(ytest[:,3],yhat[:,3])
cm_y5 = confusion_matrix(ytest[:,4],yhat[:,4])

In [12]:
print(cm_y1)

[[1053  140]
 [ 191  616]]


In [13]:
print (cm_y2)

[[1189  150]
 [ 187  474]]


In [14]:
print (cm_y3)

[[1163  100]
 [ 223  514]]


In [15]:
print (cm_y4)

[[863 206]
 [192 739]]


In [16]:
print (cm_y5)

[[1016  158]
 [ 205  621]]


Finally, we'll check the classification report with the classification_report function.

In [17]:
cr_y1 = classification_report(ytest[:,0],yhat[:,0])
cr_y2 = classification_report(ytest[:,1],yhat[:,1])
cr_y3 = classification_report(ytest[:,2],yhat[:,2])
cr_y4 = classification_report(ytest[:,3],yhat[:,3])
cr_y5 = classification_report(ytest[:,4],yhat[:,4])

In [18]:
print (cr_y1)

              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1193
           1       0.81      0.76      0.79       807

    accuracy                           0.83      2000
   macro avg       0.83      0.82      0.83      2000
weighted avg       0.83      0.83      0.83      2000



In [19]:
print (cr_y2)

              precision    recall  f1-score   support

           0       0.86      0.89      0.88      1339
           1       0.76      0.72      0.74       661

    accuracy                           0.83      2000
   macro avg       0.81      0.80      0.81      2000
weighted avg       0.83      0.83      0.83      2000



In this tutorial, we've briefly learned how to classify multi-output data with MultiOutputClassifier in Python.
