### Library

In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random

### Data Load & Preprocessing

In [2]:
def _read_split(csv_path):
    """Read one CSV split, drop its 'index' column and fill missing cells with the string 'NAN'."""
    return pd.read_csv(csv_path).drop(['index'], axis=1).fillna('NAN')


# Train and test go through exactly the same loading steps.
train = _read_split('dataset/train.csv')
test = _read_split('dataset/test.csv')

# The sample submission is read as-is; its columns are overwritten at inference time.
submit = pd.read_csv('dataset/sample_submission.csv')

In [3]:
# Names of every column in `train` whose dtype is object; these are one-hot encoded next.
object_col = [name for name in train.columns if train[name].dtype == 'object']

In [4]:
# Fit the one-hot encoder on the object-typed training columns only.
# handle_unknown='ignore' makes a category that appears only in the test set encode
# as all zeros instead of raising when `enc.transform` is later applied to `test`.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train.loc[:, object_col])

# `get_feature_names` was removed from newer scikit-learn releases in favour of
# `get_feature_names_out`; use whichever the installed version provides.
if hasattr(enc, 'get_feature_names_out'):
    onehot_names = enc.get_feature_names_out(object_col)
else:
    onehot_names = enc.get_feature_names(object_col)

# Reuse train's index so the column-wise concat below aligns row for row even if
# `train` does not carry a default RangeIndex.
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:, object_col]).toarray(),
                               columns=onehot_names,
                               index=train.index)

# NOTE: this rebinds `train`; re-running the cell without reloading the data fails
# because the object columns are no longer present.
train = pd.concat([train.drop(object_col, axis=1), train_onehot_df], axis=1)

In [5]:
# Encode the test set with the encoder fitted on the training data so both frames
# end up with the same one-hot columns.
# `get_feature_names` was removed from newer scikit-learn releases in favour of
# `get_feature_names_out`; use whichever the installed version provides.
if hasattr(enc, 'get_feature_names_out'):
    test_onehot_names = enc.get_feature_names_out(object_col)
else:
    test_onehot_names = enc.get_feature_names(object_col)

# Reuse test's index so the column-wise concat aligns row for row.
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:, object_col]).toarray(),
                              columns=test_onehot_names,
                              index=test.index)

# NOTE: this rebinds `test`; re-running the cell without reloading the data fails
# because the object columns are no longer present.
test = pd.concat([test.drop(object_col, axis=1), test_onehot_df], axis=1)

### Training

In [6]:
# Five shuffled folds, stratified on the 'credit' target, with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Materialise the (train positions, validation positions) pairs so later cells can index them.
folds = [(fit_pos, holdout_pos) for fit_pos, holdout_pos in skf.split(train, train['credit'])]

In [7]:
# Train one LightGBM classifier per cross-validation fold and keep them in
# `lgb_models`, keyed by fold number (0-based).
# NOTE(review): random.seed only seeds Python's `random` module; if the model itself
# must be seeded, pass random_state to LGBMClassifier -- confirm whether that is wanted.
random.seed(42)
lgb_models = {}

# Split features and target once instead of re-dropping 'credit' twice per fold.
features_all = train.drop(['credit'], axis=1)
target_all = train['credit']

# Iterate over the folds that were actually built rather than a hard-coded count.
for fold, (train_idx, valid_idx) in enumerate(folds):
    print(
        f'===================================={fold+1}============================================'
    )
    # The fold arrays hold row positions, so select with .iloc for both features and
    # target; plain [] on the target would do a label lookup and only matches when
    # the index is the default RangeIndex.
    X_train = features_all.iloc[train_idx].values
    X_valid = features_all.iloc[valid_idx].values
    y_train = target_all.iloc[train_idx].values
    y_valid = target_all.iloc[valid_idx].values

    lgb = LGBMClassifier(n_estimators=1000)
    # Evaluate on both the training and validation parts; stop once the validation
    # score has not improved for 50 rounds, logging every 100 rounds.
    # NOTE(review): newer LightGBM releases take early stopping and logging through
    # `callbacks=` instead of these keyword arguments -- confirm the pinned version.
    lgb.fit(X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            early_stopping_rounds=50,
            verbose=100)
    lgb_models[fold] = lgb
    print(
        f'================================================================================\n\n'
    )

Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.649825	valid_1's multi_logloss: 0.754342
[200]	training's multi_logloss: 0.565196	valid_1's multi_logloss: 0.738368
[300]	training's multi_logloss: 0.503465	valid_1's multi_logloss: 0.731678
[400]	training's multi_logloss: 0.451777	valid_1's multi_logloss: 0.730029
Early stopping, best iteration is:
[429]	training's multi_logloss: 0.438088	valid_1's multi_logloss: 0.729175


Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.646021	valid_1's multi_logloss: 0.764365
[200]	training's multi_logloss: 0.560632	valid_1's multi_logloss: 0.751211
[300]	training's multi_logloss: 0.497513	valid_1's multi_logloss: 0.748437
Early stopping, best iteration is:
[330]	training's multi_logloss: 0.480923	valid_1's multi_logloss: 0.747248


Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.650293	valid_1's multi_logloss: 0.

### Test inference

In [8]:
# The models were fitted on bare arrays taken from `train` without 'credit', so put
# the test columns in that same order before predicting (raises KeyError if a
# training feature is missing from `test` instead of silently misaligning).
feature_cols = train.drop(['credit'], axis=1).columns
test_features = test[feature_cols]

# Average the class probabilities over every trained fold model; the divisor follows
# the number of models instead of a hard-coded 5.
fold_probas = [lgb_models[fold].predict_proba(test_features) for fold in sorted(lgb_models)]
mean_proba = np.mean(fold_probas, axis=0)

# Replace every column after the first with the averaged probabilities. Assigning whole
# columns avoids relying on in-place dtype upcasting of the sample-submission values.
submit[submit.columns[1:]] = mean_proba

In [9]:
# Write the averaged predictions without the DataFrame's row index.
submission_path = './submit/submit_20210407_01.csv'
submit.to_csv(submission_path, index=False)

In [10]:
# Show the first 20 rows of the submission as a quick sanity check.
submit.iloc[:20]

Unnamed: 0,index,0,1,2
0,26457,0.047811,0.105655,0.846533
1,26458,0.246816,0.141146,0.612038
2,26459,0.038839,0.104809,0.856351
3,26460,0.103354,0.131802,0.764844
4,26461,0.099892,0.174582,0.725526
5,26462,0.050025,0.128024,0.821951
6,26463,0.485439,0.514189,0.000372
7,26464,0.117118,0.144094,0.738789
8,26465,0.073126,0.1298,0.797073
9,26466,0.061997,0.264591,0.673412
