In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random
import os
from sklearn.metrics import log_loss

In [2]:
# Directory holding the competition CSVs. The raw string keeps the Windows
# backslashes literal; without it "\k" and "\c" are invalid escape sequences.
d = r"C:\kaggle_data\credit_card"
lst = os.listdir(d)
print(lst)
# Load every file by name: os.listdir() returns entries in arbitrary order, so
# positional picks such as lst[3] silently load the wrong file as soon as the
# directory contents or the platform change.
train = pd.read_csv(os.path.join(d, 'train.csv'))
test = pd.read_csv(os.path.join(d, 'test.csv'))
ss = pd.read_csv(os.path.join(d, 'sample_submission.csv'))
# Drop the 'index' column from both frames and replace missing values with
# the literal string 'NAN'.
train = train.drop(['index'], axis=1)
train.fillna('NAN', inplace=True)
test = test.drop(['index'], axis=1)
test.fillna('NAN', inplace=True)

['.ipynb_checkpoints', 'sample_submission.csv', 'test.csv', 'train.csv', 'Untitled.ipynb']


# 전처리

In [3]:
# Add income_per_size = log(income_total / family_size) to both frames.
for frame in (train, test):
    income_ratio = frame['income_total'] / frame['family_size']
    frame['income_per_size'] = np.log(income_ratio)

In [4]:
# New feature log_income: the natural log of income_total.
# The original income_total column is dropped afterwards.
train['log_income'] = np.log(train['income_total'])
# Use the `columns=` keyword: passing the axis positionally (drop(label, 1))
# was deprecated in pandas 1.3 and is rejected by pandas 2.0.
train = train.drop(columns='income_total')
test['log_income'] = np.log(test['income_total'])
test = test.drop(columns='income_total')

In [5]:
# New column careality: the 'car' and 'reality' values added together
# (string concatenation if both columns hold strings — TODO confirm dtype).
# The two source columns are dropped afterwards.
train['careality'] = train['car'] + train['reality']
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
train = train.drop(columns=['car', 'reality'])

test['careality'] = test['car'] + test['reality']
test = test.drop(columns=['car', 'reality'])

In [6]:
# Names of every train column whose dtype is 'object', in column order.
object_col = [name for name in train.columns if train[name].dtype == 'object']

In [7]:
# Fit the one-hot encoder on the object-typed training columns.
enc = OneHotEncoder()
enc.fit(train.loc[:, object_col])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back to the old name on releases
# that predate it.
if hasattr(enc, 'get_feature_names_out'):
    train_onehot_columns = enc.get_feature_names_out(object_col)
else:
    train_onehot_columns = enc.get_feature_names(object_col)

# Carry over train's index so the axis=1 concat below pairs rows one-to-one
# even when the index is not the default 0..n-1 range.
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:, object_col]).toarray(),
                               columns=train_onehot_columns,
                               index=train.index)
# Replace the raw object columns with their one-hot expansion.
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [8]:
# Encode the test categoricals with the encoder already fitted as `enc`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back to the old name on releases
# that predate it.
if hasattr(enc, 'get_feature_names_out'):
    test_onehot_columns = enc.get_feature_names_out(object_col)
else:
    test_onehot_columns = enc.get_feature_names(object_col)

# Carry over test's index so the axis=1 concat below pairs rows one-to-one
# even when the index is not the default 0..n-1 range.
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:, object_col]).toarray(),
                              columns=test_onehot_columns,
                              index=test.index)
# Replace the raw object columns with their one-hot expansion.
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

# 모델링, 학습

In [9]:
# Stratified, shuffled 5-fold cross-validation on the 'credit' target.
# The (train, valid) index pairs are materialised once into `folds`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(skf.split(train, train['credit']))
losses = []
random.seed(42)
lgb_models = {}

# Loop-invariant inputs: build the feature matrix and label vector once
# instead of re-dropping 'credit' twice inside every fold.
features = train.drop(['credit'], axis=1).values
labels = train['credit'].values

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    # split() yields positional indices, so slice the NumPy arrays directly
    # rather than doing a label-based lookup on the Series.
    X_train, X_valid = features[train_idx], features[valid_idx]
    y_train, y_valid = labels[train_idx], labels[valid_idx]
    lgb = LGBMClassifier(n_estimators=1000)
    # NOTE(review): early_stopping_rounds / verbose as fit() keywords belong to
    # the older LightGBM scikit-learn API; newer releases expect callbacks
    # instead — confirm against the installed LightGBM version.
    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            early_stopping_rounds=30,
            verbose=100)
    # Keep the fitted model per fold and record its validation log loss.
    lgb_models[fold] = lgb
    losses.append(log_loss(y_valid, lgb.predict_proba(X_valid)))
    print('================================================================================\n\n')
# Mean validation log loss over the folds that actually ran (no hard-coded 5).
print(sum(losses) / len(losses))

Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.647434	valid_1's multi_logloss: 0.749897
[200]	training's multi_logloss: 0.558814	valid_1's multi_logloss: 0.732487
[300]	training's multi_logloss: 0.49624	valid_1's multi_logloss: 0.724517
Early stopping, best iteration is:
[324]	training's multi_logloss: 0.483225	valid_1's multi_logloss: 0.724221


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.643482	valid_1's multi_logloss: 0.762857
[200]	training's multi_logloss: 0.556453	valid_1's multi_logloss: 0.74847
[300]	training's multi_logloss: 0.491826	valid_1's multi_logloss: 0.743238
Early stopping, best iteration is:
[297]	training's multi_logloss: 0.493313	valid_1's multi_logloss: 0.742947


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.644599	valid_1's multi_logloss: 0.755309
[200]	training's multi_logloss: 0.554875	valid_1's multi_logloss: 0.74

In [13]:
# Average the class probabilities predicted by every stored fold model and
# write them into the submission's prediction columns (all but the first).
# The mean is taken over however many models lgb_models holds rather than a
# hard-coded 5, and the finished float array is assigned in one step instead
# of accumulating floats in place into columns that were just set to integer 0.
fold_probas = [model.predict_proba(test) for model in lgb_models.values()]
ss[ss.columns[1:]] = np.mean(fold_probas, axis=0)

In [14]:
# Write the submission frame to CSV without the row index.
submission_path = 'baseline_income_per_size.csv'
ss.to_csv(submission_path, index=False)

In [15]:
# Score obtained when the log_income feature is used.
from sklearn.model_selection import cross_val_score

# NOTE(review): no `scoring` is passed, so the estimator's default scorer is
# used — presumably accuracy rather than log loss; confirm this is intended.
cv_features = train.drop(['credit'], axis=1)
cv_scores = cross_val_score(lgb_models[0], cv_features, train['credit'], cv=skf)
print(cv_scores.mean())

0.7069584926727785


# LOOCV

In [13]:
from sklearn.model_selection import LeaveOneOut

In [14]:
# Number of rows in the training frame.
n = train.shape[0]

In [None]:
from sklearn.model_selection import KFold
# With n_splits equal to the number of rows, every fold validates on exactly
# one row, i.e. leave-one-out cross-validation.
kf = KFold(n_splits=n)
losses = []
prediction = []
random.seed(42)

# Loop-invariant inputs: build the feature matrix and label vector once
# instead of re-dropping 'credit' twice inside each of the n iterations.
features = train.drop(['credit'], axis=1).values
labels = train['credit'].values

# Iterate the splitter lazily. Materialising all n (train_idx, valid_idx)
# pairs up front needs memory quadratic in the number of rows. Fitted models
# are not retained either: one 1000-tree model per row can exhaust memory on
# a large training set, and only the out-of-fold predictions are needed here.
for fold, (train_idx, valid_idx) in enumerate(kf.split(train, train['credit'])):
    print(f'===================================={fold+1}============================================')
    # split() yields positional indices, so slice the NumPy arrays directly
    # rather than doing a label-based lookup on the Series.
    X_train, X_valid = features[train_idx], features[valid_idx]
    y_train, y_valid = labels[train_idx], labels[valid_idx]
    lgb = LGBMClassifier(n_estimators=1000)
    lgb.fit(X_train, y_train, verbose=100)
    fold_proba = lgb.predict_proba(X_valid)
    prediction.append(fold_proba)
    # A single held-out row contains only one class, so the class list must be
    # passed explicitly for log_loss to line up the probability columns.
    losses.append(log_loss(y_valid, fold_proba, labels=lgb.classes_))
    print('================================================================================\n\n')
# Mean leave-one-out log loss; `losses` was previously never filled, which is
# why this report had been commented out.
print(sum(losses) / len(losses))



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































