In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random
import os
from sklearn.metrics import log_loss

In [2]:
d = "C:\kaggle_data\credit_card"
lst = os.listdir(d)
print(lst)
train = pd.read_csv(d + '\\' +lst[3])
test = pd.read_csv(d + '\\' +lst[2])
ss = pd.read_csv(d + '\\' +lst[1])
train = train.drop(['index'], axis=1)
train.fillna('NAN', inplace=True) 
test = test.drop(['index'], axis=1)
test.fillna('NAN', inplace=True)

['.ipynb_checkpoints', 'sample_submission.csv', 'test.csv', 'train.csv', 'Untitled.ipynb']


# 전처리

### family_type
- 'Married', 'Civil marriage'
- 'Separated',  'Widow'
- 'Single / not married'

In [3]:
# Married, Civil marriage
train['income_per_size'] = np.log(train['income_total']/train['family_size'])
test['income_per_size'] = np.log(test['income_total']/test['family_size'])
train.loc[(train['family_type']=='Married')|(train['family_type']=='Civil marriage'),'income_per_size']\
= train['income_per_size'] * 2

test.loc[(test['family_type']=='Married')|(test['family_type']=='Civil marriage'),'income_per_size']\
= test['income_per_size'] * 2

In [4]:
def simple_marry(x):
    if x == 'Married' or x =='Civil marriage':
        return '0'
    elif x == 'Separated' or x == 'Widow':
        return '1'
    else:
        return '2'

In [5]:
# 안 덮어쓰기
for df in [train,test]:
    df['family_bins'] = df['family_type'].apply(simple_marry)

In [6]:
# income_total을 로그변환 한 새로운 feature log_income
# 기존 칼럼 삭제
train['log_income'] = np.log(train['income_total'])
train = train.drop('income_total',1)
test['log_income'] = np.log(test['income_total'])
test = test.drop('income_total',1)

In [7]:
# car와 reality를 합친 새로운 칼럼 careality
train['car'] =train['car'].apply(lambda x: int(x=='Y'))
train['reality'] =train['reality'].apply(lambda x: int(x=='Y'))
test['car'] =test['car'].apply(lambda x: int(x=='Y'))
test['reality'] =test['reality'].apply(lambda x: int(x=='Y'))

train['careality'] = train['car'] + train['reality']
train = train.drop(['car', 'reality'],1)

test['careality'] = test['car'] + test['reality']
test = test.drop(['car', 'reality'],1)

In [8]:
object_col = []
for col in train.columns:
    if train[col].dtype == 'object':
        object_col.append(col)

In [9]:
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])


train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [10]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

### 
- income_total
- begin_month
- DAYS_BIRTH

In [11]:
# c = 'income_total'
# k = 2.2
# mean = train[c].mean()
# std = train[c].std()
# idxs = train.loc[(train[c]>= mean + k*std)|\
#                 (train[c]<= mean - k*std)].index
# train = train.drop(idxs).reset_index(drop=True)

In [15]:
# out: 아웃라이어
out_train = train.loc[train.DAYS_EMPLOYED>0]
in_train = train.loc[train.DAYS_EMPLOYED<=0]


out_test = test.loc[test.DAYS_EMPLOYED>0]
in_test = test.loc[test.DAYS_EMPLOYED<=0]

<!-- ###
- DAYS_EMPLOYED -->

# 모델링, 학습

In [16]:
out_train = out_train.reset_index()
in_train = in_train.reset_index()

out_test = out_test.reset_index()
in_test = in_test.reset_index()

idx_out_train = out_train.pop('index')
idx_out_test = out_test.pop('index')

idx_in_train = in_train.pop('index')
idx_in_test = in_test.pop('index')

out_train.drop('DAYS_EMPLOYED',inplace=True,axis=1)
out_test.drop('DAYS_EMPLOYED',inplace=True,axis=1)

In [17]:
n = 10

In [18]:
def tr(train):
    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=42)
    folds=[]
    losses=[]
    for train_idx, valid_idx in skf.split(train, train['credit']):
        folds.append((train_idx, valid_idx))
    random.seed(42)
    lgb_models={}
    for fold in range(n):
        train_idx, valid_idx = folds[fold]
        X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                             train['credit'][train_idx].values, train['credit'][valid_idx].values 
        lgb = LGBMClassifier(n_estimators=1000)
        lgb.fit(X_train, y_train, 
                eval_set=[(X_train, y_train), (X_valid, y_valid)], 
                early_stopping_rounds=30)
        lgb_models[fold]=lgb
        losses.append(log_loss(y_valid, lgb.predict_proba(X_valid)))
    ls = sum(losses)/n
    print('평균{ls}')
    return ls, lgb_models

In [19]:
out_loss, out_models = tr(out_train)

[1]	training's multi_logloss: 0.856892	valid_1's multi_logloss: 0.859591
Training until validation scores don't improve for 30 rounds
[2]	training's multi_logloss: 0.832345	valid_1's multi_logloss: 0.838393
[3]	training's multi_logloss: 0.812914	valid_1's multi_logloss: 0.826085
[4]	training's multi_logloss: 0.796487	valid_1's multi_logloss: 0.814487
[5]	training's multi_logloss: 0.782601	valid_1's multi_logloss: 0.806546
[6]	training's multi_logloss: 0.769915	valid_1's multi_logloss: 0.799696
[7]	training's multi_logloss: 0.758574	valid_1's multi_logloss: 0.793876
[8]	training's multi_logloss: 0.748874	valid_1's multi_logloss: 0.790049
[9]	training's multi_logloss: 0.738978	valid_1's multi_logloss: 0.783482
[10]	training's multi_logloss: 0.728735	valid_1's multi_logloss: 0.776063
[11]	training's multi_logloss: 0.719633	valid_1's multi_logloss: 0.771396
[12]	training's multi_logloss: 0.710839	valid_1's multi_logloss: 0.76608
[13]	training's multi_logloss: 0.704135	valid_1's multi_loglo

[35]	training's multi_logloss: 0.592755	valid_1's multi_logloss: 0.76072
[36]	training's multi_logloss: 0.589724	valid_1's multi_logloss: 0.760664
[37]	training's multi_logloss: 0.586094	valid_1's multi_logloss: 0.760071
[38]	training's multi_logloss: 0.582691	valid_1's multi_logloss: 0.759397
[39]	training's multi_logloss: 0.57979	valid_1's multi_logloss: 0.760172
[40]	training's multi_logloss: 0.576253	valid_1's multi_logloss: 0.759815
[41]	training's multi_logloss: 0.572711	valid_1's multi_logloss: 0.759635
[42]	training's multi_logloss: 0.569595	valid_1's multi_logloss: 0.759945
[43]	training's multi_logloss: 0.566677	valid_1's multi_logloss: 0.759337
[44]	training's multi_logloss: 0.563428	valid_1's multi_logloss: 0.760051
[45]	training's multi_logloss: 0.560441	valid_1's multi_logloss: 0.760567
[46]	training's multi_logloss: 0.556924	valid_1's multi_logloss: 0.761277
[47]	training's multi_logloss: 0.554396	valid_1's multi_logloss: 0.761072
[48]	training's multi_logloss: 0.551899	

[72]	training's multi_logloss: 0.484658	valid_1's multi_logloss: 0.729145
[73]	training's multi_logloss: 0.482593	valid_1's multi_logloss: 0.72739
[74]	training's multi_logloss: 0.480254	valid_1's multi_logloss: 0.727203
[75]	training's multi_logloss: 0.478174	valid_1's multi_logloss: 0.727684
[76]	training's multi_logloss: 0.475936	valid_1's multi_logloss: 0.727989
[77]	training's multi_logloss: 0.473585	valid_1's multi_logloss: 0.72703
[78]	training's multi_logloss: 0.471547	valid_1's multi_logloss: 0.726947
[79]	training's multi_logloss: 0.46931	valid_1's multi_logloss: 0.727082
[80]	training's multi_logloss: 0.46705	valid_1's multi_logloss: 0.726155
[81]	training's multi_logloss: 0.464494	valid_1's multi_logloss: 0.725621
[82]	training's multi_logloss: 0.462006	valid_1's multi_logloss: 0.725047
[83]	training's multi_logloss: 0.459828	valid_1's multi_logloss: 0.72678
[84]	training's multi_logloss: 0.457647	valid_1's multi_logloss: 0.727
[85]	training's multi_logloss: 0.45542	valid_1

[2]	training's multi_logloss: 0.832814	valid_1's multi_logloss: 0.844246
[3]	training's multi_logloss: 0.813603	valid_1's multi_logloss: 0.832335
[4]	training's multi_logloss: 0.797233	valid_1's multi_logloss: 0.821042
[5]	training's multi_logloss: 0.782527	valid_1's multi_logloss: 0.811279
[6]	training's multi_logloss: 0.769854	valid_1's multi_logloss: 0.803037
[7]	training's multi_logloss: 0.757045	valid_1's multi_logloss: 0.796903
[8]	training's multi_logloss: 0.745824	valid_1's multi_logloss: 0.792028
[9]	training's multi_logloss: 0.735465	valid_1's multi_logloss: 0.78738
[10]	training's multi_logloss: 0.726026	valid_1's multi_logloss: 0.784709
[11]	training's multi_logloss: 0.716915	valid_1's multi_logloss: 0.781272
[12]	training's multi_logloss: 0.707899	valid_1's multi_logloss: 0.776588
[13]	training's multi_logloss: 0.70069	valid_1's multi_logloss: 0.773091
[14]	training's multi_logloss: 0.693631	valid_1's multi_logloss: 0.77099
[15]	training's multi_logloss: 0.686086	valid_1's

[10]	training's multi_logloss: 0.721375	valid_1's multi_logloss: 0.810012
[11]	training's multi_logloss: 0.71261	valid_1's multi_logloss: 0.80824
[12]	training's multi_logloss: 0.704916	valid_1's multi_logloss: 0.805875
[13]	training's multi_logloss: 0.696898	valid_1's multi_logloss: 0.801725
[14]	training's multi_logloss: 0.689554	valid_1's multi_logloss: 0.798247
[15]	training's multi_logloss: 0.683249	valid_1's multi_logloss: 0.795942
[16]	training's multi_logloss: 0.67665	valid_1's multi_logloss: 0.794022
[17]	training's multi_logloss: 0.670218	valid_1's multi_logloss: 0.791469
[18]	training's multi_logloss: 0.663907	valid_1's multi_logloss: 0.790191
[19]	training's multi_logloss: 0.659052	valid_1's multi_logloss: 0.789466
[20]	training's multi_logloss: 0.653527	valid_1's multi_logloss: 0.787666
[21]	training's multi_logloss: 0.648538	valid_1's multi_logloss: 0.787617
[22]	training's multi_logloss: 0.643498	valid_1's multi_logloss: 0.785117
[23]	training's multi_logloss: 0.639134	v

[33]	training's multi_logloss: 0.602399	valid_1's multi_logloss: 0.738586
[34]	training's multi_logloss: 0.59833	valid_1's multi_logloss: 0.735843
[35]	training's multi_logloss: 0.593799	valid_1's multi_logloss: 0.735775
[36]	training's multi_logloss: 0.590132	valid_1's multi_logloss: 0.736099
[37]	training's multi_logloss: 0.586532	valid_1's multi_logloss: 0.736244
[38]	training's multi_logloss: 0.582194	valid_1's multi_logloss: 0.737594
[39]	training's multi_logloss: 0.578927	valid_1's multi_logloss: 0.736826
[40]	training's multi_logloss: 0.574719	valid_1's multi_logloss: 0.736032
[41]	training's multi_logloss: 0.571529	valid_1's multi_logloss: 0.735165
[42]	training's multi_logloss: 0.568768	valid_1's multi_logloss: 0.735277
[43]	training's multi_logloss: 0.565082	valid_1's multi_logloss: 0.734914
[44]	training's multi_logloss: 0.561564	valid_1's multi_logloss: 0.73531
[45]	training's multi_logloss: 0.559286	valid_1's multi_logloss: 0.734856
[46]	training's multi_logloss: 0.556299	

[51]	training's multi_logloss: 0.545168	valid_1's multi_logloss: 0.69382
[52]	training's multi_logloss: 0.541948	valid_1's multi_logloss: 0.692405
[53]	training's multi_logloss: 0.538977	valid_1's multi_logloss: 0.691502
[54]	training's multi_logloss: 0.535837	valid_1's multi_logloss: 0.690486
[55]	training's multi_logloss: 0.533095	valid_1's multi_logloss: 0.690057
[56]	training's multi_logloss: 0.530144	valid_1's multi_logloss: 0.689893
[57]	training's multi_logloss: 0.52757	valid_1's multi_logloss: 0.691355
[58]	training's multi_logloss: 0.524507	valid_1's multi_logloss: 0.689698
[59]	training's multi_logloss: 0.521574	valid_1's multi_logloss: 0.688706
[60]	training's multi_logloss: 0.519074	valid_1's multi_logloss: 0.687736
[61]	training's multi_logloss: 0.516113	valid_1's multi_logloss: 0.685123
[62]	training's multi_logloss: 0.513829	valid_1's multi_logloss: 0.6845
[63]	training's multi_logloss: 0.511441	valid_1's multi_logloss: 0.683976
[64]	training's multi_logloss: 0.508894	va

[31]	training's multi_logloss: 0.608127	valid_1's multi_logloss: 0.731352
[32]	training's multi_logloss: 0.603562	valid_1's multi_logloss: 0.72999
[33]	training's multi_logloss: 0.599238	valid_1's multi_logloss: 0.73143
[34]	training's multi_logloss: 0.595907	valid_1's multi_logloss: 0.729863
[35]	training's multi_logloss: 0.592088	valid_1's multi_logloss: 0.729403
[36]	training's multi_logloss: 0.587769	valid_1's multi_logloss: 0.72786
[37]	training's multi_logloss: 0.584095	valid_1's multi_logloss: 0.728027
[38]	training's multi_logloss: 0.581093	valid_1's multi_logloss: 0.728043
[39]	training's multi_logloss: 0.577702	valid_1's multi_logloss: 0.726971
[40]	training's multi_logloss: 0.573524	valid_1's multi_logloss: 0.726093
[41]	training's multi_logloss: 0.570704	valid_1's multi_logloss: 0.725162
[42]	training's multi_logloss: 0.567327	valid_1's multi_logloss: 0.724579
[43]	training's multi_logloss: 0.56451	valid_1's multi_logloss: 0.724392
[44]	training's multi_logloss: 0.561144	va

[71]	training's multi_logloss: 0.490804	valid_1's multi_logloss: 0.734931
[72]	training's multi_logloss: 0.488056	valid_1's multi_logloss: 0.734835
[73]	training's multi_logloss: 0.485327	valid_1's multi_logloss: 0.734598
[74]	training's multi_logloss: 0.482423	valid_1's multi_logloss: 0.734566
[75]	training's multi_logloss: 0.480258	valid_1's multi_logloss: 0.734643
[76]	training's multi_logloss: 0.477906	valid_1's multi_logloss: 0.734253
[77]	training's multi_logloss: 0.475269	valid_1's multi_logloss: 0.734081
[78]	training's multi_logloss: 0.472967	valid_1's multi_logloss: 0.733375
[79]	training's multi_logloss: 0.470861	valid_1's multi_logloss: 0.73406
[80]	training's multi_logloss: 0.468765	valid_1's multi_logloss: 0.735068
[81]	training's multi_logloss: 0.466536	valid_1's multi_logloss: 0.734389
[82]	training's multi_logloss: 0.464284	valid_1's multi_logloss: 0.733651
[83]	training's multi_logloss: 0.46254	valid_1's multi_logloss: 0.734316
[84]	training's multi_logloss: 0.460546	

In [20]:
in_loss, in_models = tr(in_train)

Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.62038	valid_1's multi_logloss: 0.744209
[200]	training's multi_logloss: 0.521963	valid_1's multi_logloss: 0.729235
Early stopping, best iteration is:
[266]	training's multi_logloss: 0.47498	valid_1's multi_logloss: 0.725871


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.625355	valid_1's multi_logloss: 0.737295
[200]	training's multi_logloss: 0.526116	valid_1's multi_logloss: 0.72286
Early stopping, best iteration is:
[218]	training's multi_logloss: 0.511477	valid_1's multi_logloss: 0.721449


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.615785	valid_1's multi_logloss: 0.755351
[200]	training's multi_logloss: 0.517087	valid_1's multi_logloss: 0.743465
Early stopping, best iteration is:
[261]	training's multi_logloss: 0.471513	valid_1's multi_logloss: 0.741379


Training until validation scores d

In [142]:
in_zeros = np.zeros([len(in_test),3])
for fold in range(5):
    in_zeros += in_models[fold].predict_proba(in_test)/5
in_output = pd.DataFrame(in_zeros)
in_output = in_output.reindex(index=pd.Index(idx_in_test))

In [186]:
in2 = np.concatenate((in_zeros,np.array(idx_in_test).reshape(-1,1)),axis=1)
out2 = np.concatenate((out_zeros,np.array(idx_out_test).reshape(-1,1)),axis=1)

In [187]:
output = pd.DataFrame(np.concatenate((in2,out2),axis=0),columns=[0,1,2,'index'])

In [188]:
output['index'] = output['index'].astype('int')

In [196]:
ss = pd.read_csv(d + '\\' +lst[1])

In [199]:
output

Unnamed: 0,0,1,2,index
0,0.254328,0.176459,0.569213,1
1,0.037702,0.077119,0.885179,2
2,0.150319,0.105418,0.744263,3
3,0.074441,0.168475,0.757084,4
4,0.057947,0.112467,0.829586,5
...,...,...,...,...
9995,0.045256,0.062820,0.891924,9968
9996,0.113289,0.091170,0.795541,9981
9997,0.162660,0.185026,0.652315,9988
9998,0.231756,0.133885,0.634359,9992


In [200]:
for i in range(len(output)):
    row = output.loc[output['index']==i]
    ss.iloc[i,1:] = row.iloc[:,:3]

In [204]:
ss.to_csv('two_models.csv', index=False)

In [205]:
output.head()

Unnamed: 0,0,1,2,index
0,0.254328,0.176459,0.569213,1
1,0.037702,0.077119,0.885179,2
2,0.150319,0.105418,0.744263,3
3,0.074441,0.168475,0.757084,4
4,0.057947,0.112467,0.829586,5


In [206]:
output.tail()

Unnamed: 0,0,1,2,index
9995,0.045256,0.06282,0.891924,9968
9996,0.113289,0.09117,0.795541,9981
9997,0.16266,0.185026,0.652315,9988
9998,0.231756,0.133885,0.634359,9992
9999,0.083838,0.910228,0.005934,9994
