# 라이브러리 호출

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random

In [2]:
# Read the competition files from the working directory.
# Every missing cell in train/test is replaced by the literal string 'NAN'
# (applied to all columns, whatever their dtype).
train = pd.read_csv("train.csv").fillna('NAN')
test = pd.read_csv("test.csv").fillna('NAN')
sample_submission = pd.read_csv("sample_submission.csv")

# 전처리

In [3]:
# Names of the train columns whose dtype is 'object', in original column order.
object_col = [name for name in train.columns if train[name].dtype == 'object']

In [4]:
# One-hot encode every object-dtype column; the encoder is fitted on train only.
# handle_unknown='ignore' makes a category that was not present at fit time
# encode as an all-zero row at transform time instead of raising.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train.loc[:, object_col])

# Newer scikit-learn exposes get_feature_names_out and drops get_feature_names;
# use whichever one this installation provides so the cell runs on both.
onehot_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names

train_onehot_df = pd.DataFrame(
    enc.transform(train.loc[:, object_col]).toarray(),
    columns=onehot_names(object_col),
    index=train.index,  # keep rows aligned with train for the concat below
)
# Replace the raw categorical columns with their one-hot expansion.
train = pd.concat([train.drop(columns=object_col), train_onehot_df], axis=1)

In [5]:
# Encode the test set with the encoder fitted on train so both frames end up
# with the same one-hot columns in the same order.
# Newer scikit-learn exposes get_feature_names_out and drops get_feature_names;
# use whichever one this installation provides.
onehot_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names

test_onehot_df = pd.DataFrame(
    enc.transform(test.loc[:, object_col]).toarray(),
    columns=onehot_names(object_col),
    index=test.index,  # keep rows aligned with test for the concat below
)
# Replace the raw categorical columns with their one-hot expansion.
test = pd.concat([test.drop(columns=object_col), test_onehot_df], axis=1)

# 모델링

In [6]:
# 5-fold stratified split on the 'credit' target, shuffled with a fixed seed.
# Each entry of `folds` is a (train_idx, valid_idx) pair of positional indices.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(skf.split(train, train['credit']))

In [7]:
# Seeds Python's stdlib `random` module only; the LGBMClassifier below is
# constructed without an explicit random_state.
random.seed(42)

# Split features and target once instead of re-dropping 'credit' twice per fold.
features = train.drop(['credit'], axis=1)
labels = train['credit']

# Train one LightGBM classifier per CV fold; models are keyed by fold number (0-based).
lgb_models = {}
for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    # The fold arrays hold positional indices, so rows are selected with .iloc
    # for both features and labels (label-based [] lookup only matches when the
    # frame still carries a default 0..n-1 index).
    X_train, X_valid = features.iloc[train_idx].values, features.iloc[valid_idx].values
    y_train, y_valid = labels.iloc[train_idx].values, labels.iloc[valid_idx].values

    lgb = LGBMClassifier(n_estimators=1000)
    # Early stopping watches the last eval_set entry (the validation split).
    # NOTE(review): LightGBM >= 4.0 no longer accepts early_stopping_rounds /
    # verbose in fit() and expects callbacks instead — confirm the pinned version.
    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            early_stopping_rounds=30,
            verbose=100)
    lgb_models[fold] = lgb
    print('================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.649383	valid_1's multi_logloss: 0.764417
[200]	training's multi_logloss: 0.555442	valid_1's multi_logloss: 0.755226
Early stopping, best iteration is:
[245]	training's multi_logloss: 0.521443	valid_1's multi_logloss: 0.753747


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.645172	valid_1's multi_logloss: 0.775667
[200]	training's multi_logloss: 0.551025	valid_1's multi_logloss: 0.765431
[300]	training's multi_logloss: 0.477799	valid_1's multi_logloss: 0.762982
Early stopping, best iteration is:
[299]	training's multi_logloss: 0.478409	valid_1's multi_logloss: 0.762903


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.64773	valid_1's multi_logloss: 0.765103
[200]	training's multi_logloss: 0.551917	valid_1's multi_logloss: 0.758596
Early stopping, best iteration is:
[205]	training's multi_logloss: 0.

In [8]:
# Average the class probabilities predicted by every trained fold model.
# The divisor follows the number of models actually trained rather than a
# hard-coded 5, so changing the fold count cannot silently skew the average.
fold_models = list(lgb_models.values())
avg_proba = sum(model.predict_proba(test) / len(fold_models) for model in fold_models)

# Write the result in a single column assignment (every column after the
# first) instead of accumulating floats in place into columns that were just
# set to integer 0.
sample_submission[sample_submission.columns[1:]] = avg_proba

In [9]:
# Display the submission frame holding the fold-averaged class probabilities.
sample_submission

Unnamed: 0,index,0,1,2
0,26457,0.008515,0.223399,0.768086
1,26458,0.069156,0.114852,0.815992
2,26459,0.032709,0.195448,0.771844
3,26460,0.067637,0.188465,0.743897
4,26461,0.078516,0.250676,0.670808
...,...,...,...,...
9995,36452,0.042416,0.312763,0.644822
9996,36453,0.034296,0.381149,0.584556
9997,36454,0.039116,0.066311,0.894573
9998,36455,0.031437,0.243165,0.725398


In [10]:
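# Export is disabled so re-running the notebook does not overwrite an existing file;
# uncomment the next line to write the submission CSV.
# sample_submission.to_csv("0503_LGBM_ver1.csv", index = False)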