In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-feb-2022/sample_submission.csv
/kaggle/input/tabular-playground-series-feb-2022/train.csv
/kaggle/input/tabular-playground-series-feb-2022/test.csv


In [2]:
import warnings
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

In [3]:
def evaluate_model(model, x, y):
    y_pred = model.predict(x)
    acc = accuracy_score(y, y_pred)
    return {'accuracy' : acc}

In [4]:
seed = 47

In [5]:
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/train.csv')

In [6]:
train_df.drop(['row_id'], axis=1, inplace=True)
target = train_df.pop('target')

In [7]:
le = LabelEncoder()
target = le.fit_transform(target)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(train_df, target, test_size=0.2, random_state=seed)

# XGBoost Baseline

In [9]:
def get_xgb_baseline(params={}):
    return XGBClassifier(**params,
                         random_state=seed,
                         tree_method='gpu_hist',
                         predictor='gpu_predictor',
                         eval_metric='auc',
                         objective='multi:softprob')

In [10]:
model = get_xgb_baseline()
model.fit(x_train, y_train)
results = evaluate_model(model, x_test, y_test)
print(results)

{'accuracy': 0.983425}


# Feature Engineering

Here wee will experiment creating synthetic features using central tendency statistics.


In [11]:
geomean = lambda x, axis : np.exp(np.mean(np.log(x), axis=axis))
harmonic_mean = lambda x, axis : len(x) / np.sum(1.0/x, axis=axis) 

funcs = {'mean' : np.mean, 
         'std' : np.std, 
         'var' : np.var, 
         'geo_mean' : geomean, 
         'harmonic_mean' : harmonic_mean, 
         'median' : np.median,
         'none': None}

In [12]:
results, names = list(), list()

for key in funcs.keys():
    x_train = train_df.copy()
    if funcs[key] is not None:
        x_train[key] = funcs[key](x_train, axis=1)
    x_train, x_test, y_train, y_test = train_test_split(x_train, target, test_size=0.2, random_state=seed)       
    model = get_xgb_baseline()

    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    names.append(key)
    results.append(result['accuracy'])
    
for name, score in zip(names, results):
    print('>%s: %f' % (name, score))

index_best = np.argmax(score)
print('Best result is of:', names[index_best], 'with score:', results[index_best])

>mean: 0.984450
>std: 0.984950
>var: 0.984950
>geo_mean: 0.984850
>harmonic_mean: 0.984600
>median: 0.984325
>none: 0.983425
Best result is of: mean with score: 0.98445


# Individual parameter Search

In [13]:
x_train = train_df.copy()
x_train['mean'] = np.mean(x_train, axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_train, target, test_size=0.2, random_state=seed)

In [14]:
params = {}

<h2>1 - Testing different number of estimators</h2>


In [15]:
results_trees = {}
trees = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, 950, 975, 1000, 1025, 1050, 1100, 1150, 1290, 1295, 1300, 1305, 1310, 1315, 1325, 2000, 3000, 4000, 5000, 10000]

for n in trees:
    params['n_estimators'] = n
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_trees[n] = result['accuracy']
    print('n_estimators:', n, 'accuracy:', results_trees[n])

best_nestimator = max(results_trees, key=results_trees.get)
print('\nBest n_estimators:', best_nestimator, 'Accuracy score:', results_trees[best_nestimator])

n_estimators: 50 accuracy: 0.966325
n_estimators: 100 accuracy: 0.98445
n_estimators: 150 accuracy: 0.990275
n_estimators: 200 accuracy: 0.993025
n_estimators: 250 accuracy: 0.99405
n_estimators: 300 accuracy: 0.994575
n_estimators: 350 accuracy: 0.994775
n_estimators: 400 accuracy: 0.995125
n_estimators: 450 accuracy: 0.995175
n_estimators: 500 accuracy: 0.9951
n_estimators: 600 accuracy: 0.995175
n_estimators: 700 accuracy: 0.99515
n_estimators: 800 accuracy: 0.99515
n_estimators: 900 accuracy: 0.995025
n_estimators: 950 accuracy: 0.995
n_estimators: 975 accuracy: 0.995
n_estimators: 1000 accuracy: 0.994975
n_estimators: 1025 accuracy: 0.995025
n_estimators: 1050 accuracy: 0.994975
n_estimators: 1100 accuracy: 0.995025
n_estimators: 1150 accuracy: 0.995
n_estimators: 1290 accuracy: 0.995075
n_estimators: 1295 accuracy: 0.99495
n_estimators: 1300 accuracy: 0.995025
n_estimators: 1305 accuracy: 0.995125
n_estimators: 1310 accuracy: 0.995025
n_estimators: 1315 accuracy: 0.99505
n_estima

<h2>2 - Testing different max_depth</h2>

In [16]:
results_max_depths = {}
params['n_estimators'] = best_nestimator
max_depths = [i for i in range(1,10)]

for max_depth in max_depths:
    params['max_depth'] = max_depth
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_max_depths[max_depth] = result['accuracy']
    print('max_depth:', max_depth, 'accuracy:', results_max_depths[max_depth])

best_max_depth = max(results_max_depths, key=results_max_depths.get)
print('\nBest max_depth:', best_max_depth, 'Accuracy score:', results_max_depths[best_max_depth])

max_depth: 1 accuracy: 0.9179
max_depth: 2 accuracy: 0.9846
max_depth: 3 accuracy: 0.9947
max_depth: 4 accuracy: 0.9952
max_depth: 5 accuracy: 0.995375
max_depth: 6 accuracy: 0.9954
max_depth: 7 accuracy: 0.9952
max_depth: 8 accuracy: 0.995425
max_depth: 9 accuracy: 0.995275

Best max_depth: 8 Accuracy score: 0.995425


<h2>3 - Testing different subsamples</h2>


In [17]:
results_subsamples = {}
params['max_depth'] = best_max_depth
subsamples = [i for i in np.arange(0.1, 1.1, 0.1)]

for subsample in subsamples:
    params['subsample'] = subsample
    model = model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_subsamples[subsample] = result['accuracy']
    print('subsample:', subsample, 'accuracy:', results_subsamples[subsample])

best_subsample = max(results_subsamples, key=results_subsamples.get)
print('\nBest subsample:', best_subsample, 'Accuracy score:', results_subsamples[best_subsample])

subsample: 0.1 accuracy: 0.994325
subsample: 0.2 accuracy: 0.9945
subsample: 0.30000000000000004 accuracy: 0.99445
subsample: 0.4 accuracy: 0.99505
subsample: 0.5 accuracy: 0.9951
subsample: 0.6 accuracy: 0.995275
subsample: 0.7000000000000001 accuracy: 0.995525
subsample: 0.8 accuracy: 0.9951
subsample: 0.9 accuracy: 0.995425
subsample: 1.0 accuracy: 0.995425

Best subsample: 0.7000000000000001 Accuracy score: 0.995525


<h2>4 - Testing different learning rates</h2>

In [18]:
results_etas = {}
params['subsample'] = best_subsample
etas = [0.0001, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.12, 0.13, 0.3, 0.5, 1.0]

for eta in etas:
    params['eta'] = eta
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_etas[eta] = result['accuracy']
    print('eta:', eta, 'accuracy:', results_etas[eta])

best_eta = max(results_etas, key=results_etas.get)
print('\nBest eta:', best_eta, 'Accuracy score:', results_etas[best_eta])

eta: 0.0001 accuracy: 0.87385
eta: 0.001 accuracy: 0.938775
eta: 0.003 accuracy: 0.96735
eta: 0.005 accuracy: 0.980775
eta: 0.01 accuracy: 0.991925
eta: 0.03 accuracy: 0.99575
eta: 0.05 accuracy: 0.995725
eta: 0.1 accuracy: 0.9958
eta: 0.12 accuracy: 0.995725
eta: 0.13 accuracy: 0.995725
eta: 0.3 accuracy: 0.995525
eta: 0.5 accuracy: 0.99485
eta: 1.0 accuracy: 0.99405

Best eta: 0.1 Accuracy score: 0.9958


<h2>5 - Testing different number of features</h2>

In [19]:
results_colsample_bytrees = {}
params['eta'] = best_eta
colsample_bytrees = [i for i in np.arange(0.1, 1.1, 0.1)]

for colsample_bytree in colsample_bytrees:
    params['colsample_bytree'] = colsample_bytree
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_colsample_bytrees[colsample_bytree] = result['accuracy']
    print('colsample_bytree:', colsample_bytree, ':', results_colsample_bytrees[colsample_bytree])

best_colsample_bytree = max(results_colsample_bytrees, key=results_colsample_bytrees.get)
print('\nBest colsample_bytree:', best_colsample_bytree, 'Accuracy score:', results_colsample_bytrees[best_colsample_bytree])

colsample_bytree: 0.1 : 0.99545
colsample_bytree: 0.2 : 0.995475
colsample_bytree: 0.30000000000000004 : 0.995625
colsample_bytree: 0.4 : 0.995475
colsample_bytree: 0.5 : 0.99565
colsample_bytree: 0.6 : 0.995725
colsample_bytree: 0.7000000000000001 : 0.995675
colsample_bytree: 0.8 : 0.9956
colsample_bytree: 0.9 : 0.995375
colsample_bytree: 1.0 : 0.9958

Best colsample_bytree: 1.0 Accuracy score: 0.9958


<h2>6 - Testing different values for min_child_weight</h2>

In [20]:
results_min_child_weight = {}
params['colsample_bytree'] = best_colsample_bytree
min_child_weights = [i for i in range(1,10)]

for min_child_weight in min_child_weights:
    params['min_child_weight'] = min_child_weight
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_min_child_weight[min_child_weight] = result['accuracy']
    print('min_child_weight:', min_child_weight, 'accuracy:', results_min_child_weight[min_child_weight])

best_min_child_weight = max(results_min_child_weight, key=results_min_child_weight.get)
print('\nBest min_child_weight:', best_min_child_weight, 'Accuracy score:', results_min_child_weight[best_min_child_weight])

min_child_weight: 1 accuracy: 0.9958
min_child_weight: 2 accuracy: 0.99535
min_child_weight: 3 accuracy: 0.99545
min_child_weight: 4 accuracy: 0.995425
min_child_weight: 5 accuracy: 0.995375
min_child_weight: 6 accuracy: 0.995175
min_child_weight: 7 accuracy: 0.99535
min_child_weight: 8 accuracy: 0.995375
min_child_weight: 9 accuracy: 0.995175

Best min_child_weight: 1 Accuracy score: 0.9958


<h2>7 - Testing different values for gamma</h2>

In [21]:
results_gamma = {}
params['min_child_weight'] = best_min_child_weight
gammas = [0.01, 0.02, 0.03, 0.1, 0.3, 0.5, 1, 1.1, 1.5, 2, 5, 7, 9, 10]

for gamma in gammas:
    params['gamma'] = gamma
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_gamma[gamma] = result['accuracy']
    print('gamma:', gamma, 'accuracy:', results_gamma[gamma])

best_gamma = max(results_gamma, key=results_gamma.get)
print('\nBest gamma:', best_gamma, 'Accuracy score:', results_gamma[best_gamma])

gamma: 0.01 accuracy: 0.995375
gamma: 0.02 accuracy: 0.9954
gamma: 0.03 accuracy: 0.99535
gamma: 0.1 accuracy: 0.99545
gamma: 0.3 accuracy: 0.994875
gamma: 0.5 accuracy: 0.994175
gamma: 1 accuracy: 0.9921
gamma: 1.1 accuracy: 0.991275
gamma: 1.5 accuracy: 0.989175
gamma: 2 accuracy: 0.9867
gamma: 5 accuracy: 0.97315
gamma: 7 accuracy: 0.966125
gamma: 9 accuracy: 0.960375
gamma: 10 accuracy: 0.957125

Best gamma: 0.1 Accuracy score: 0.99545


In [22]:
params['gamma'] = best_gamma
print('Best Hyperparameters:', params)

Best Hyperparameters: {'n_estimators': 2000, 'max_depth': 8, 'subsample': 0.7000000000000001, 'eta': 0.1, 'colsample_bytree': 1.0, 'min_child_weight': 1, 'gamma': 0.1}


In [23]:
params.pop('gamma')

0.1

# Submission

In [24]:
model = get_xgb_baseline(params)
model.fit(x_train, y_train)
result = evaluate_model(model, x_test, y_test)
print(result)

{'accuracy': 0.9958}


In [25]:
del x_train, x_test, y_train, y_test, train_df
gc.collect()

475

In [26]:
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/test.csv')

In [27]:
ids = test_df.pop('row_id').values
test_df['mean'] = np.mean(test_df, axis=1)
target = le.inverse_transform(model.predict(test_df))
submission = pd.DataFrame({'row_id' : ids, 'target' : target})

In [28]:
submission.head()

Unnamed: 0,row_id,target
0,200000,Escherichia_fergusonii
1,200001,Salmonella_enterica
2,200002,Enterococcus_hirae
3,200003,Salmonella_enterica
4,200004,Staphylococcus_aureus


In [29]:
submission.to_csv('submission.csv', index=False)