In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datatable as dt
import optuna
import gc

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tabular-playground-series-oct-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-oct-2021/train.csv
/kaggle/input/tabular-playground-series-oct-2021/test.csv


In [2]:
from xgboost import XGBClassifier

In [3]:
train_df = dt.fread('/kaggle/input/tabular-playground-series-oct-2021/train.csv').to_pandas()
test_df = dt.fread('/kaggle/input/tabular-playground-series-oct-2021/test.csv').to_pandas()
sample_df = dt.fread('/kaggle/input/tabular-playground-series-oct-2021/sample_submission.csv').to_pandas()

Similarly to the September TPS, I've chosen datatable to import the dataframes, as it should be a bit faster.

In [4]:
train_df.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
0,0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,...,False,True,False,False,False,False,False,False,False,True
1,1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,...,False,True,False,False,False,False,False,False,False,True
2,2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,...,False,False,False,True,True,False,False,False,False,True
3,3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,...,False,False,False,False,True,False,False,False,False,True
4,4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,...,False,True,True,False,True,False,False,True,False,True


In [5]:
train_df.shape

(1000000, 287)

In [6]:
print(f'train_df shape: {train_df.shape}')
print(f'NaNs in train_df: {train_df.isna().sum().sum()}\n')
print(f'test_df shape: {test_df.shape}')
print(f'NaNs in test_df: {test_df.isna().sum().sum()}\n')
print(f'sample_df shape: {sample_df.shape}')
print(f'NaNs in sample_df: {sample_df.isna().sum().sum()}')

train_df shape: (1000000, 287)
NaNs in train_df: 0

test_df shape: (500000, 286)
NaNs in test_df: 0

sample_df shape: (500000, 2)
NaNs in sample_df: 0


In [7]:
'''A function to reduce the amount of memory taken up by each feature by compressing it to the appropriate datatype
verbose parameter is used to output a message regarding the exact memory usage reduction'''
def reduce_memory_usage(df, verbose=True):
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2 #initial memory usage to compare to
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            #extract the min and max values
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                #else:
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2 #new memory_usage
    if verbose:
        print(
            "Memory usage decreased to: {:.2f} Mb - {:.1f}% reduction".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
                
                )
            )
    return df

In [8]:
for i, col in enumerate(train_df.columns):
    if train_df[col].dtypes == bool:
        train_df[col] = train_df[col].astype(int)

In [9]:
for i, col in enumerate(test_df.columns):
    if test_df[col].dtypes == bool:
        test_df[col] = test_df[col].astype(int)

In [10]:
train_df.dtypes

id          int32
f0        float64
f1        float64
f2        float64
f3        float64
           ...   
f281        int64
f282        int64
f283        int64
f284        int64
target      int64
Length: 287, dtype: object

In [11]:
test_df.dtypes

id        int32
f0      float64
f1      float64
f2      float64
f3      float64
         ...   
f280      int64
f281      int64
f282      int64
f283      int64
f284      int64
Length: 286, dtype: object

In [12]:
print("X_train reduction:")
train_df = reduce_memory_usage(train_df)
print("X_test reduction:")
test_df = reduce_memory_usage(test_df)

X_train reduction:
Memory usage decreased to: 505.45 Mb - 76.9% reduction
X_test reduction:
Memory usage decreased to: 252.25 Mb - 76.8% reduction


In [13]:
X_train = train_df.drop(['id', 'target'], axis=1)
y_train = train_df['target'].copy()

X_test = test_df.drop('id', axis=1)

In [14]:
del train_df
del test_df
gc.collect()

46

{
    'max_depth': trial.suggest_categorical('max_depth', [4, 6, 8, 10, 15, 20]),
    'n_estimators': trial.suggest_categorical('max_depth', [5000, 10000,15000]),
    'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
    'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.2,0.4,0.6,0.8,1.0]),
    'colsample_bylevel': trial.suggest_categorical('colsample_bytree', [0.2,0.4,0.6,0.8,1.0]),
    'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
    'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 100.0),
    'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
    'gamma': trial.suggest_float('gamma', 1, 100),
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'use_label_encoder': False
}

In [15]:
def objective(trial,data=X_train,target=y_train):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
    param = {
        'max_depth': trial.suggest_int('max_depth',3,15),
        'n_estimators': trial.suggest_categorical('n_estimators', [4000, 5000, 6000]),
        'subsample': trial.suggest_float('subsample',0.15,0.995,log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree',0.15,0.995,log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 100.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'gamma': trial.suggest_categorical('gamma',[0, 0.25, 0.5, 1.0]),
        'tree_method': trial.suggest_categorical('tree_method',['gpu_hist']),
        'gpu_id': trial.suggest_categorical('gpu_id',[0]),
        'predictor' : trial.suggest_categorical('predictor',['gpu_predictor']),
        'random_state': trial.suggest_categorical('random_state',[42]),
        'booster': trial.suggest_categorical('booster',['gbtree']),
        'eval_metric': trial.suggest_categorical('eval_metric',['auc']),
        'use_label_encoder': trial.suggest_categorical('use_label_encoder',[False]),
    }
    model = XGBClassifier(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    roc_auc = roc_auc_score(test_y, preds)
    
    return roc_auc

In [16]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-10-11 20:34:13,359][0m A new study created in memory with name: no-name-d9eb9b3d-46ab-47ef-b1ff-a3ffda13d828[0m
  self._init_valid()


  0%|          | 0/50 [00:00<?, ?it/s]

[32m[I 2021-10-11 20:34:39,497][0m Trial 0 finished with value: 0.7609915048107565 and parameters: {'max_depth': 15, 'n_estimators': 5000, 'subsample': 0.2482153370185433, 'colsample_bytree': 0.36650294150046697, 'min_child_weight': 91, 'reg_lambda': 0.004453142076939559, 'reg_alpha': 0.12606092531031238, 'gamma': 0, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor', 'random_state': 42, 'booster': 'gbtree', 'eval_metric': 'auc', 'use_label_encoder': False}. Best is trial 0 with value: 0.7609915048107565.[0m
[32m[I 2021-10-11 20:35:11,534][0m Trial 1 finished with value: 0.759335019532762 and parameters: {'max_depth': 12, 'n_estimators': 4000, 'subsample': 0.3287521665448665, 'colsample_bytree': 0.86177009800134, 'min_child_weight': 30, 'reg_lambda': 0.0015479221728121738, 'reg_alpha': 8.902088467038178, 'gamma': 0, 'tree_method': 'gpu_hist', 'gpu_id': 0, 'predictor': 'gpu_predictor', 'random_state': 42, 'booster': 'gbtree', 'eval_metric': 'auc', 'use_label_encod

In [17]:
params= study.best_trial.params

In [18]:
x_tra, x_val, y_tra, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=42)

In [19]:
del X_train
del y_train
gc.collect()

69

In [20]:
model = XGBClassifier(**params)  

model.fit(x_tra,y_tra,eval_set=[(x_val,y_val)], eval_metric='auc',early_stopping_rounds=100,verbose=250)

[0]	validation_0-auc:0.55561
[250]	validation_0-auc:0.85533
[490]	validation_0-auc:0.85548


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.21738638591202727,
              eval_metric='auc', gamma=1.0, gpu_id=0, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=3, min_child_weight=75, missing=nan,
              monotone_constraints='()', n_estimators=4000, n_jobs=2,
              num_parallel_tree=1, predictor='gpu_predictor', random_state=42,
              reg_alpha=0.023425483016642896, reg_lambda=0.02462118401099498,
              scale_pos_weight=1, subsample=0.7139778464770262,
              tree_method='gpu_hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [21]:
y_pred = model.predict_proba(X_test)

In [22]:
sample_df['target'] = y_pred[:,1]

In [23]:
sample_df.to_csv('submission_72.csv', index=False)