In [1]:
import warnings

warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
    
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, matthews_corrcoef

import xgboost as xgb

import numpy as np

In [2]:
RAND = 42

# 1. Preparing data for XGBoost
We will use XGBoost native API as it allows for better flexibility with Callbacks and early stopping which are crucial for the development. In order to do that, we need to prepare our data and turn it into DMatrix with categorical features.

In [3]:
X_train = pd.read_parquet('/kaggle/input/ps4e8-data-eng/train.parquet', engine='pyarrow')
y_train = X_train.pop('class')
dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True)

X_test = pd.read_parquet('/kaggle/input/ps4e8-data-eng/test.parquet', engine='pyarrow')
y_test = X_test.pop('class')
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

X_val = pd.read_parquet('/kaggle/input/ps4e8-data-eng/val.parquet', engine='pyarrow')
dval = xgb.DMatrix(X_val, enable_categorical=True)
assert 'class' not in X_val.columns

X_train

Unnamed: 0_level_0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2413129,16.160,0,6,1,0,4,-1,0,16.890,17.660,3,0,0,0,1,1,5,-1,0,2
1910266,3.420,4,-1,10,0,5,2,5,3.620,15.310,-1,3,8,-1,-1,0,0,-1,0,2
1213509,8.110,0,1,8,0,-1,0,0,11.250,15.030,-1,-1,0,0,1,1,5,-1,0,2
387249,6.670,5,10,7,1,1,0,6,7.320,16.530,-1,-1,0,-1,-1,0,0,-1,0,0
2498558,12.230,0,2,5,1,4,-1,0,10.180,16.800,-1,-1,0,-1,-1,1,-1,-1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2258055,3.840,1,1,5,0,0,0,1,4.820,6.100,-1,0,2,-1,-1,1,1,-1,0,2
1115921,6.290,1,2,5,0,4,0,0,5.640,9.880,-1,0,2,-1,-1,0,0,-1,0,0
2414138,8.620,0,0,4,0,4,0,6,5.360,12.680,-1,-1,0,-1,1,1,2,-1,5,0
1624246,6.360,1,0,1,0,3,1,7,6.240,13.020,-1,-1,0,-1,-1,0,0,-1,2,1


In [4]:
X_val = pd.read_parquet('/kaggle/input/ps4e8-data-eng/val.parquet', engine='pyarrow')
dval = xgb.DMatrix(X_val, enable_categorical=True)

In [5]:
y_train

id
2413129    1
1910266    1
1213509    1
387249     1
2498558    0
          ..
2258055    1
1115921    0
2414138    0
1624246    0
740289     0
Name: class, Length: 2493556, dtype: int64

In [6]:
train_cols = X_train.columns
train_cols

Index(['cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season'],
      dtype='object')

In [7]:
X_train.dtypes

cap-diameter            float64
cap-shape                 int64
cap-surface               int64
cap-color                 int64
does-bruise-or-bleed      int64
gill-attachment           int64
gill-spacing              int64
gill-color                int64
stem-height             float64
stem-width              float64
stem-root                 int64
stem-surface              int64
stem-color                int64
veil-type                 int64
veil-color                int64
has-ring                  int64
ring-type                 int64
spore-print-color         int64
habitat                   int64
season                    int64
dtype: object

# 2. Defining custom XGBoost metric
Since the contest uses a custom (not available in XGBoost) metric, we need to create it ourselves! The metric used is [Matthews correlation coefficient](https://en.wikipedia.org/wiki/Phi_coefficient) known in statistics as phi coefficient.

In [8]:
def mcc(predt, dtrain):
    return 'mcc', matthews_corrcoef(dtrain.get_label(), np.round(predt))

# 3. Choosing right hyperparameters for our model
Default XGBoost hyperparameters very often give good results but in order to optimize model's performance we have to choose good hyperparameters. Those hyperparameters were choosen with [xgboost-tuner.ipynb](./xgboost.ipynb).

In [9]:
params = {
    'eta': 0.005,
    'max_depth': 64,
    'min_child_weight': 4,
    'colsample_bytree': 0.5332244728838978, 
    'gamma': 0.7053913533793799,
    'alpha': 0.3058543405529223,
    'lambda': 0.009610147650215044,
    'scale_pos_weight': 0.827695469601273,
    "objective": 'binary:logistic',
    "random_state": RAND,
    "n_jobs": -1
}

# 4. Training the model
This is the most important step in every solution. In order to squeeze out and maximize our score we will set `num_boost_round` to a very high number while adding a callback [xgb.callback.EarlyStopping](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.callback.EarlyStopping) stops our model after its' validation score decreases for 500 boosting rounds.

In [10]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=9999999999,
    evals=[(dtest, 'test')],
    custom_metric=mcc,
    verbose_eval=50,
    callbacks=[
        xgb.callback.EarlyStopping(500, save_best=True, min_delta=1e-5, metric_name='mcc', data_name='test', maximize=True)
    ]
)

[0]	test-logloss:0.68848	test-mcc:0.95879
[50]	test-logloss:0.50708	test-mcc:0.98436
[100]	test-logloss:0.38368	test-mcc:0.98450
[150]	test-logloss:0.29639	test-mcc:0.98455
[200]	test-logloss:0.23394	test-mcc:0.98465
[250]	test-logloss:0.18680	test-mcc:0.98467
[300]	test-logloss:0.15179	test-mcc:0.98471
[350]	test-logloss:0.12485	test-mcc:0.98476
[400]	test-logloss:0.10419	test-mcc:0.98480
[450]	test-logloss:0.08848	test-mcc:0.98484
[500]	test-logloss:0.07621	test-mcc:0.98492
[550]	test-logloss:0.06672	test-mcc:0.98491
[600]	test-logloss:0.05955	test-mcc:0.98494
[650]	test-logloss:0.05398	test-mcc:0.98495
[700]	test-logloss:0.04957	test-mcc:0.98499
[750]	test-logloss:0.04619	test-mcc:0.98497
[800]	test-logloss:0.04362	test-mcc:0.98498
[850]	test-logloss:0.04163	test-mcc:0.98500
[900]	test-logloss:0.04013	test-mcc:0.98503
[950]	test-logloss:0.03896	test-mcc:0.98504
[1000]	test-logloss:0.03807	test-mcc:0.98507
[1050]	test-logloss:0.03740	test-mcc:0.98506
[1100]	test-logloss:0.03688	test-

In [11]:
def get_pred(X):
    return np.round(model.predict(X))

In [12]:
print(f"MCC score {matthews_corrcoef(y_test, get_pred(dtest))}")

Score 0.9852195173101069


In [13]:
pd.DataFrame(confusion_matrix(y_test, get_pred(dtest)))

Unnamed: 0,0,1
0,280402,1908
1,2660,338419


In [14]:
X_val

Unnamed: 0_level_0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3116945,8.640,1,-1,5,1,-1,-1,0,11.130,17.120,0,-1,0,0,1,1,5,-1,0,0
3116946,6.900,4,4,1,0,-1,0,4,1.270,10.750,-1,-1,2,-1,-1,0,0,-1,0,0
3116947,2.000,3,6,5,0,-1,0,1,6.180,3.140,-1,-1,2,-1,-1,0,0,-1,0,3
3116948,3.470,1,4,5,0,2,0,1,4.980,8.510,-1,-1,0,-1,0,1,1,-1,0,2
3116949,6.170,1,1,7,0,6,-1,4,6.730,13.700,-1,-1,3,-1,3,1,-1,-1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5194904,0.880,1,6,4,0,0,1,0,2.670,1.350,-1,-1,4,-1,-1,0,0,-1,0,2
5194905,3.120,1,0,4,0,3,0,0,2.690,7.380,-1,-1,0,-1,-1,0,0,-1,2,0
5194906,5.730,1,5,6,0,0,-1,0,6.160,9.740,-1,-1,3,-1,1,1,1,-1,0,0
5194907,5.030,3,6,5,0,0,1,2,6.000,3.460,-1,1,8,-1,-1,0,0,-1,0,0


In [15]:
prd = pd.DataFrame({"class" : get_pred(dval), "id" : X_val.index}).set_index('id').replace({0: 'e', 1: 'p'})
prd

Unnamed: 0_level_0,class
id,Unnamed: 1_level_1
3116945,e
3116946,p
3116947,p
3116948,p
3116949,e
...,...
5194904,p
5194905,p
5194906,p
5194907,e


# 5. Generating submission files
In order to submit our results to the competition we need to generate submission file named `submission.csv`. We also generate additional files to use for meta modelling.

In [16]:
prd.to_csv('submission.csv', index=True)
sub = pd.read_csv('submission.csv', index_col='id')

assert sub.shape[0] == 2077964
sub.head()

Unnamed: 0_level_0,class
id,Unnamed: 1_level_1
3116945,e
3116946,p
3116947,p
3116948,p
3116949,e


In [18]:
prd = pd.DataFrame({"xgb": get_pred(dtrain), "id": X_train.index}).set_index('id')
prd.to_parquet('xgbtrain_pred.parquet', index=True)
prd.head()

Unnamed: 0_level_0,xgb
id,Unnamed: 1_level_1
2413129,1.0
1910266,1.0
1213509,1.0
387249,1.0
2498558,0.0


In [20]:
prd = pd.DataFrame({"xgb": get_pred(dtest), "id": X_test.index}).set_index('id')
prd.to_parquet('xgbtest_pred.parquet', index=True)
prd.head()

Unnamed: 0_level_0,xgb
id,Unnamed: 1_level_1
395838,0.0
2786154,0.0
2083314,0.0
2580451,0.0
207829,1.0


In [22]:
prd = pd.DataFrame({"xgb": model.predict(dval), "id": X_val.index}).set_index('id')
prd.to_parquet('xgbval_pred.parquet', index=True)
prd.head()

Unnamed: 0_level_0,xgb
id,Unnamed: 1_level_1
3116945,0.003
3116946,0.996
3116947,0.993
3116948,0.996
3116949,0.002
