In [1]:
import lightgbm
import numpy as np
import optuna
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold , StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Directory holding the competition CSVs; change this one line to run the
# notebook on another machine.
DATA_DIR = 'D:/Kaggle/tabular-playground-series-mar-2021'

train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')
sub = pd.read_csv(DATA_DIR + '/sample_submission.csv')

In [3]:
# Feature columns are selected by name: every train column containing 'cat',
# followed by every test column containing 'cont'.
cat = list(filter(lambda name: 'cat' in name , train.columns))
cont = list(filter(lambda name: 'cont' in name , test.columns))
all_features = [*cat , *cont]

In [4]:
list(range(19))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]

In [5]:
# Encode the categorical columns on train and test stacked together so that
# both frames share a single string -> integer mapping per column.
n_train = train.shape[0]
all_df = pd.concat([train , test]).reset_index(drop = True)

for col in cat:
    # A fresh encoder per column: nothing is reused between columns.
    all_df[col] = LabelEncoder().fit_transform(all_df[col])

# Split back by position. `.copy()` detaches the two frames from `all_df`
# so later column assignments do not hit a view of the combined frame.
# `test` keeps the combined index (it starts at n_train), as before.
train = all_df[:n_train].copy()
test = all_df[n_train:].copy()

In [6]:
data = train[all_features]
target = train['target']

In [7]:
def objective(trial , data = data , target = target):
    """Optuna objective: train LightGBM on sampled hyperparameters, return hold-out AUC.

    Parameters
    ----------
    trial : Optuna trial used to sample each hyperparameter.
    data : feature frame. The default is the notebook-level `data` as it was
        when this function was defined (default arguments are bound once).
    target : labels aligned with `data`; same binding caveat as `data`.

    Returns
    -------
    float
        ROC AUC on the held-out split. Higher is better, so the study that
        runs this objective should maximise it.
    """
    # Fixed seed: every trial is scored on the same hold-out rows.
    train_x , test_x , train_y , test_y = train_test_split(
        data , target , test_size = 0.028059109276941666 , random_state = 42)

    params = {
        'reg_alpha' : trial.suggest_loguniform('reg_alpha' , 1e-5 , 10),
        'reg_lambda' : trial.suggest_loguniform('reg_lambda' , 1e-5 , 10),
        'num_leaves' : trial.suggest_int('num_leaves' , 11 , 300),
        # Lower bound must be strictly positive: LightGBM rejects
        # learning_rate == 0 and a near-zero rate learns nothing.
        'learning_rate' : trial.suggest_uniform('learning_rate' , 1e-3 , 0.1),
        'max_depth' : trial.suggest_int('max_depth' , 5 , 20),
        # Upper cap only; early stopping below decides the effective count.
        'n_estimators' : trial.suggest_int('n_estimators' , 1 , 9999),
        'min_child_samples' : trial.suggest_int('min_child_samples' , 1 , 100),
        'min_child_weight' : trial.suggest_loguniform('min_child_weight' , 1e-5 , 1),
        # Row subsampling is only applied when subsample_freq > 0; without it
        # this search dimension is a no-op. The lower bound is kept away from
        # 0, which LightGBM rejects.
        'subsample' : trial.suggest_uniform('subsample' , 0.1 , 1.0),
        'subsample_freq' : 1,
        'colsample_bytree' : trial.suggest_loguniform('colsample_bytree' , 1e-5 , 1),
        # NOTE(review): the seed is being tuned like a hyperparameter, which
        # rewards lucky seeds on this particular hold-out split.
        'random_state' : trial.suggest_categorical('random_state' , [0,42,2021,555]),
        'metric' : 'auc',
        # NOTE(review): presumably needs a GPU-enabled LightGBM build — confirm.
        'device_type' : 'gpu',
    }
    model = lightgbm.LGBMClassifier(**params)
    # The hold-out set drives early stopping and is also the set that is scored.
    model.fit(train_x , train_y , eval_set = [(test_x , test_y)] , early_stopping_rounds = 200 ,
              verbose = False)
    preds = model.predict_proba(test_x)[:,1]
    auc = roc_auc_score(test_y , preds)
    return auc

In [11]:
# Fixed LightGBM settings passed to LGBMClassifier(**params) in the
# cross-validation loop.
params = {
    'reg_alpha': 4.203457823159052,
    'reg_lambda': 6.34173530304477,
    'num_leaves': 148,
    'min_child_samples': 55,
    'max_depth': 16,
    'learning_rate': 0.01,
    'colsample_bytree': 0.22290988791359692,
    'n_estimators': 2703,
    'cat_smooth': 37,
    'cat_l2': 10,
    'min_data_per_group': 97,
    'random_state': 26,
    # Column positions 0..18 — presumably the `cat` columns, which come first
    # in `all_features`; confirm there are exactly 19 of them.
    'cat_feature': list(range(19)),
    'n_jobs': -1,
    'boosting_type': 'gbdt',
    'metric': 'AUC',
}

In [15]:
# Stratified K-fold training.
#   preds     : test-set probabilities averaged over all folds.
#   oof_preds : out-of-fold probabilities — each training row is predicted
#               only by the one model that did NOT train on it.
#   roc       : per-fold validation AUC.
# NOTE(review): the `early_stopping_rounds` / `verbose` fit kwargs appear to be
# accepted only by LightGBM < 4.0; switch to callbacks when upgrading.
preds = np.zeros(test.shape[0])
oof_preds = np.zeros(train.shape[0])
kf = StratifiedKFold(n_splits = 30 , random_state = 0 , shuffle = True)
roc = []
test_features = test[all_features]  # loop-invariant, so select the columns once
for fold , (trn_idx , val_idx) in enumerate(kf.split(data , target) , start = 1):
    train_x , train_y = data.iloc[trn_idx] , target.iloc[trn_idx]
    val_x , val_y = data.iloc[val_idx] , target.iloc[val_idx]

    model = lightgbm.LGBMClassifier(**params)
    model.fit(train_x , train_y , eval_set = [(val_x , val_y)] , early_stopping_rounds = 200 ,
              verbose = False)

    preds += model.predict_proba(test_features)[:,1] / kf.n_splits

    # Fill only this fold's validation rows. Predicting every training row
    # with every fold's model would leak the labels into the "OOF" file.
    val_pred = model.predict_proba(val_x)[:,1]
    oof_preds[val_idx] = val_pred

    roc.append(roc_auc_score(val_y , val_pred))
    print(fold , roc[-1])

1 0.8976450496237536
2 0.8963186744096598
3 0.8954935295161888
4 0.8985807857121724
5 0.8995563305214851
6 0.9014764463869501
7 0.9003504502957043
8 0.8971815733803654
9 0.8934800814625119
10 0.898205388963343
11 0.8959691927185198
12 0.8947750654643947
13 0.9024335433236897
14 0.8950173521536375
15 0.8975365072067966
16 0.8950415397396133
17 0.8976524124754742
18 0.8984357615974982
19 0.8991995963189472
20 0.8955153801977825
21 0.9005652962370743
22 0.901867677060704
23 0.8969120463884005
24 0.8955881997241384
25 0.8971689688780745
26 0.8942803240561359
27 0.8983971436258342
28 0.8963174734150956
29 0.8984040250197212
30 0.8977287754061446


In [16]:
# Overwrite the `target` column of the sample-submission frame with the
# fold-averaged test predictions, then write it out.
submission_file = 'slgbmsubmission.csv'
sub['target'] = preds
sub.to_csv(submission_file , index = False)

In [17]:
# Pair each training id with its entry in `oof_preds` and save the result.
output = train[['id']].assign(target = oof_preds)
output.to_csv('slgbmoof_predictions.csv' , index = False)