In [1]:
import pandas as pd
import numpy as np
import pickle
import time
import operator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss, f1_score, accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load the new data: train/test feature tables and the pickled target labels.

trn = pd.read_csv('./input/train_append_lb_lag.csv').fillna(0)
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load target.pkl when it comes from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open('./input/target.pkl', 'rb') as target_file:
    target = pd.DataFrame(pickle.load(target_file), columns=['target'])
tst = pd.read_csv('./input/test_append_lb_lag.csv').fillna(0)
print(trn.shape, target.shape, tst.shape)

(45619, 246) (45619, 1) (929615, 246)


In [3]:
# Remove low-frequency targets up front (reason: they cannot be used for
# cross-validation, and they are too rare to be meaningful).
rem_targets = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 21, 22, 23]  # 18 classes

# Compute the row mask once so features and labels are filtered identically.
keep_mask = target['target'].isin(rem_targets)
# .copy() makes trn an independent frame, so later column assignments do not
# write into a slice of the originally loaded table.
trn = trn[keep_mask].copy()
# Pass a 1-D Series (not a one-column DataFrame) to the encoder; the result is
# a 1-D integer array with the kept targets re-numbered in sorted order.
target = LabelEncoder().fit_transform(target.loc[keep_mask, 'target'])

# Show how many training rows remain for every encoded class.
for t, n_rows in zip(*np.unique(target, return_counts=True)):
    print(t, n_rows)

  y = column_or_1d(y, warn=True)


0 9452
1 1934
2 55
3 349
4 222
5 154
6 503
7 33
8 1085
9 1219
10 246
11 21
12 2942
13 4733
14 159
15 5151
16 8218
17 9119


In [4]:
cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
        'ind_cder_fin_ult1', 'ind_cno_fin_ult1',  'ind_ctju_fin_ult1',
        'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
        'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
        'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
        'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
        'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
        'ind_nomina_ult1',   'ind_nom_pens_ult1', 'ind_recibo_ult1']
lags = ['_lag_one','_lag_two','_lag_thr','_lag_fou','_lag_fiv']


def add_lag_sum_features(df, products, lag_suffixes):
    """Return a copy of ``df`` with the summed lag features appended.

    Two groups of columns are added (existing columns of the same name are
    overwritten, so re-running the cell is safe):

    * ``<product>_sum`` -- per product, the sum over all lag columns.
    * ``sum<lag>``      -- per lag, the sum over all product columns.

    Args:
        df: DataFrame holding a ``<product><lag>`` column for every
            combination of ``products`` and ``lag_suffixes``.
        products: list of product column base names.
        lag_suffixes: list of lag suffixes appended to the product names.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    new_cols = {}
    # Per-target cumulative sum
    for product in products:
        new_cols[product + '_sum'] = df[[product + lag for lag in lag_suffixes]].sum(axis=1)
    # Per-month cumulative sum
    for lag in lag_suffixes:
        new_cols['sum' + lag] = df[[product + lag for product in products]].sum(axis=1)
    # assign() builds a new frame instead of writing into a possibly sliced one.
    return df.assign(**new_cols)


print(trn.shape, tst.shape)

# Apply the identical feature engineering to train and test.
trn = add_lag_sum_features(trn, cols, lags)
tst = add_lag_sum_features(tst, cols, lags)

print(trn.shape, tst.shape)

(45595, 246) (929615, 246)
(45595, 275) (929615, 275)


In [None]:
import xgboost as xgb
# XGBoost training configuration
num_round, early_stop = 500, 10  # max boosting rounds / early-stopping rounds

xgb_params = dict(
    booster='gbtree',

    # --- Model complexity ---
    max_depth=2,  # higher -> more complex
    # gamma=3,             # lower -> more complex
    # min_child_weight=5,  # lower -> more complex

    # --- Regularization through random sampling ---
    # colsample_bylevel=0.7,
    # colsample_bytree=1,
    # subsample=0.8,

    # --- Regularization ---
    # reg_alpha=2,
    # reg_lambda=3,

    # --- Learning rate ---
    # learning_rate=0.03,

    # --- Basic settings ---
    nthread=4,
    num_class=18,
    objective='multi:softprob',
    silent=1,
    eval_metric='mlogloss',
    seed=777,
)

def evaluate_xgb(x, y):
    """Estimate train/validation multi-class log loss of the XGBoost setup.

    Runs 3 stratified shuffle splits (10% held out, random_state=777). For
    each split a booster is trained with the module-level ``xgb_params``,
    ``num_round`` and ``early_stop`` settings and scored on both parts.

    Args:
        x: pandas DataFrame of features; rows are selected positionally.
        y: 1-D array-like of integer class labels, one per row of ``x``.
            Labels are expected to be encoded as 0 .. num_class - 1.

    Returns:
        Tuple ``(trn_scores, vld_scores)``; each is a dict mapping
        ``'log loss'`` to a list holding one score per split.
    """
    # Work on a plain 1-D array so the positional fold indices below never get
    # interpreted as pandas index labels (e.g. when y is a filtered Series).
    y = np.asarray(y).ravel()
    # Explicit label set: keeps log_loss well-defined even when a fold happens
    # to contain no sample of some class.
    class_labels = np.arange(xgb_params['num_class'])

    trn_scores = {'log loss': []}
    vld_scores = {'log loss': []}
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=777)
    for t_ind, v_ind in sss.split(x, y):
        # split data
        x_trn, x_vld = x.iloc[t_ind], x.iloc[v_ind]
        y_trn, y_vld = y[t_ind], y[v_ind]

        dtrn = xgb.DMatrix(x_trn, label=y_trn)
        dvld = xgb.DMatrix(x_vld, label=y_vld)
        # The last entry of the watch list is the one used for early stopping.
        watch_list = [(dtrn, 'train'), (dvld, 'eval')]

        # fit xgb
        bst = xgb.train(xgb_params, dtrn, num_round, watch_list,
                        early_stopping_rounds=early_stop, verbose_eval=True)

        # evaluate on the training part
        trn_scores['log loss'].append(
            log_loss(y_trn, bst.predict(dtrn), labels=class_labels))

        # evaluate on the held-out part
        vld_scores['log loss'].append(
            log_loss(y_vld, bst.predict(dvld), labels=class_labels))
    return trn_scores, vld_scores

def print_scores(trn_scores, vld_scores):
    """Print the mean and the per-split values of every tracked metric.

    Args:
        trn_scores: dict mapping metric name to a list of training scores.
        vld_scores: dict mapping metric name to a list of validation scores.
    """
    indent = ' ' * 8
    heavy_rule, light_rule = '=' * 50, '-' * 50
    metric_names = ('log loss',)

    # Same report layout for both score sets, training first.
    for title, scores in (('TRAIN EVAL', trn_scores), ('VALID EVAL', vld_scores)):
        print(heavy_rule)
        print(title)
        for metric in metric_names:
            values = scores[metric]
            print(light_rule)
            print('# {}'.format(metric))
            print('# {} Mean : {}'.format(indent, np.mean(values)))
            print('# {} Raw  : {}'.format(indent, values))

def print_time(end, start):
    """Print a separator line, then the time between ``start`` and ``end``
    rounded to whole seconds."""
    separator = '=' * 50
    print(separator)
    print('{} secs'.format(round(end - start)))
    
def fit_and_eval(trn, target, model=None):
    """Run the XGBoost evaluation, then print the scores and the elapsed time.

    Args:
        trn: feature DataFrame forwarded to ``evaluate_xgb``.
        target: class labels forwarded to ``evaluate_xgb``.
        model: unused; kept only so existing three-argument calls keep working.
    """
    # Start the timer locally; the original read an undefined global `st`.
    st = time.time()
    # The original called the non-existent `evaluate`; the evaluation routine
    # defined in this notebook is `evaluate_xgb`.
    trn_scores, vld_scores = evaluate_xgb(trn, target)
    print_scores(trn_scores, vld_scores)
    print_time(time.time(), st)

# Run the 3-split evaluation on the engineered training data. As the last
# expression of the cell, the returned (train scores, validation scores)
# tuple is displayed as the cell output.
evaluate_xgb(trn,target)

[0]	train-mlogloss:1.92287	eval-mlogloss:1.92563
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.6923	eval-mlogloss:1.69613
[2]	train-mlogloss:1.54877	eval-mlogloss:1.55233
[3]	train-mlogloss:1.4416	eval-mlogloss:1.4449
[4]	train-mlogloss:1.36517	eval-mlogloss:1.36938
[5]	train-mlogloss:1.30456	eval-mlogloss:1.30978
[6]	train-mlogloss:1.25836	eval-mlogloss:1.26402
[7]	train-mlogloss:1.21343	eval-mlogloss:1.21655
[8]	train-mlogloss:1.18181	eval-mlogloss:1.18392
[9]	train-mlogloss:1.15589	eval-mlogloss:1.15929
[10]	train-mlogloss:1.13339	eval-mlogloss:1.13554
[11]	train-mlogloss:1.11469	eval-mlogloss:1.11696
[12]	train-mlogloss:1.09895	eval-mlogloss:1.10173
[13]	train-mlogloss:1.08592	eval-mlogloss:1.0881
[14]	train-mlogloss:1.07444	eval-mlogloss:1.07763
[15]	train-mlogloss:1.06405	eval-mlogloss:1.06658
[16]	train-mlogloss:1.05526	eval-mlogloss:1.05797
[17]	train-ml

({'log loss': [0.90535043611532529, 0.91830840965971972, 0.89676387589100792]},
 {'log loss': [0.95190776783483833, 0.96732326325262985, 0.96716711426461666]})

In [None]:
# Build the submission file from an XGBoost model
from datetime import datetime
import os

TOP_K = 7                # products written per customer (the original loop stopped after index 6)
OUTPUT_DIR = './output'  # directory that receives the result file

print('='*50)
print('# Test shape : {}'.format(tst.shape))

# Define the final model and train it on all training rows
dtrn = xgb.DMatrix(trn, label= target)
num_round = 115 # best num_round value found with the evaluation function
bst = xgb.train(xgb_params, dtrn, num_round, verbose_eval=True)

dtst = xgb.DMatrix(tst)
preds = bst.predict(dtst)
# Per row: class indices ordered from most to least probable
preds = np.fliplr(np.argsort(preds, axis=1))

cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
        'ind_cder_fin_ult1', 'ind_cno_fin_ult1',  'ind_ctju_fin_ult1',
        'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
        'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
        'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
        'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
        'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
        'ind_nomina_ult1',   'ind_nom_pens_ult1', 'ind_recibo_ult1']
# Names of the kept products in ascending index order, i.e. position k holds
# the product name of encoded class k.
target_cols = [col for i, col in enumerate(cols) if i in rem_targets]

# Map the TOP_K best class indices of every row to product names in one
# vectorized lookup (object dtype keeps plain Python strings and avoids a
# large fixed-width string array), then join each row into one string.
product_names = np.array(target_cols, dtype=object)
final_preds = [' '.join(row) for row in product_names[preds[:, :TOP_K]]]

# Only the customer id column is needed from the cleaned test file.
temp = pd.read_csv('./input/test_clean.csv', usecols=['ncodpers'])
test_id = temp['ncodpers']
# NOTE(review): this assumes test_clean.csv has the same row order as the test
# feature table; only the row count can be verified here.
assert len(test_id) == len(final_preds), 'test ids and predictions differ in length'

out_df = pd.DataFrame({'ncodpers':test_id, 'added_products':final_preds})
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
# Create the output directory on a fresh checkout instead of failing on write.
os.makedirs(OUTPUT_DIR, exist_ok=True)
out_df.to_csv(os.path.join(OUTPUT_DIR, file_name), index=False)

# Test shape : (929615, 275)
