In [639]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random
from IPython.core.display import display, HTML
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# Inject CSS that sets the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width: 100% !important; }</style>"))

In [640]:
# Load the input files. The 'index' column is dropped from train/test and
# every missing value is replaced by the literal string 'NAN'.
train = pd.read_csv('./open/train.csv').drop(columns=['index']).fillna('NAN')


test = pd.read_csv('./open/test.csv').drop(columns=['index']).fillna('NAN')

submit = pd.read_csv('./open/sample_submission.csv')

In [641]:
# Stack train on top of test (row-wise) and show the combined shape.
data = pd.concat(objs=[train, test], axis='index')
data.shape

(36457, 19)

In [642]:
# Remove the FLAG_MOBIL column from both frames.
train = train.drop(columns=['FLAG_MOBIL'])
test = test.drop(columns=['FLAG_MOBIL'])

In [643]:
# Names of all training columns whose dtype is 'object'.
object_col = [name for name, dtype in train.dtypes.items() if dtype == 'object']
#     elif col in ['phone',  'email', 'work_phone', 'FLAG_MOBIL']:
#         object_col.append(col)

In [644]:
# Fit a one-hot encoder on the object-typed training columns.
enc = OneHotEncoder()
categorical_train = train[object_col]
enc.fit(categorical_train)

OneHotEncoder()

In [645]:
# One-hot encode the categorical training columns and replace the originals
# with the encoded columns.
# `OneHotEncoder.get_feature_names` was removed in scikit-learn 1.2, so use
# `get_feature_names_out` when the installed version provides it and fall
# back to the old name otherwise.
_onehot_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:, object_col]).toarray(),
                               columns=_onehot_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [646]:
# Apply the encoder fitted on train to the categorical test columns and
# replace the originals with the encoded columns.
# `OneHotEncoder.get_feature_names` was removed in scikit-learn 1.2, so use
# `get_feature_names_out` when the installed version provides it and fall
# back to the old name otherwise.
_onehot_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:, object_col]).toarray(),
                              columns=_onehot_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

In [647]:
def minmax(df):
    """Min-max scale `df` as a single column.

    The input is copied into a NumPy array, reshaped to one column, and
    passed through a freshly constructed MinMaxScaler (default settings).
    The scaled array with shape (n, 1) is returned.
    """
    column = np.array(df).reshape(-1, 1)
    return MinMaxScaler().fit_transform(column)

In [648]:
def days_to_age(x):
    """Flip the sign of a day count and express it in 365-day years.

    For example, -730 days becomes 2.0.
    """
    flipped = x * -1
    return flipped / 365

In [649]:
# Convert negative values (flip the sign).
def minus(x):
    """Return `x` multiplied by -1."""
    negated = x * -1
    return negated

In [650]:
# Income features (train): rescale to units of 10,000, then add the squared
# deviation from the column mean and the natural log.
# NOTE: 'income_total' is overwritten here, so re-running this cell divides
# by 10,000 a second time.
train_income = train['income_total'] / 10000
train['income_total'] = train_income
train['income_total_dev'] = (train_income - train_income.mean()) ** 2
train['income_total_log'] = train_income.apply(np.log)
#train['income_total_minmax'] = train['income_total'].apply(minmax)

In [651]:
# Employment features (train).
train_days_employed = train['DAYS_EMPLOYED']
# career: 1 where DAYS_EMPLOYED is negative, otherwise 0.
train['career'] = train_days_employed.apply(lambda days: 1 if days < 0 else 0)
# DAYS_EMPLOYED_log: log1p of the sign-flipped value; non-negative inputs
# are replaced by 0 first.
train_days_capped = train_days_employed.map(lambda days: days if days < 0 else 0)
train['DAYS_EMPLOYED_log'] = train_days_capped.apply(lambda days: np.log1p(days * -1))
# Zero out non-negative values, then convert the column with days_to_age.
train.loc[train['DAYS_EMPLOYED'].ge(0), 'DAYS_EMPLOYED'] = 0
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].map(days_to_age)
# train['DAYS_EMPLOYED_one'] = train['DAYS_EMPLOYED']
# train.loc[train['DAYS_EMPLOYED_one'] > 0,'DAYS_EMPLOYED']=0
#train['DAYS_EMPLOYED_dev'] = (train['DAYS_EMPLOYED'] - train['DAYS_EMPLOYED'].mean())**2

In [652]:
# Flip the sign of begin_month (train).
train['begin_month'] = train['begin_month'].map(minus)
#train['begin_month'] = train['begin_month']//12
# train.loc[train['begin_month'] < 0,'begin_month']=0
# train.loc[train['begin_month'] > 0,'begin_month']=1

In [653]:
# Convert DAYS_BIRTH (train) with days_to_age.
train['DAYS_BIRTH'] = train['DAYS_BIRTH'].map(days_to_age)

In [654]:
# Household features (train).
# diff_child starts as family_size - child_num and is then collapsed to a
# flag: negative differences become 0, positive differences become 1.
train['diff_child'] = train['family_size'].sub(train['child_num'])
train.loc[train['diff_child'].lt(0), 'diff_child'] = 0
train.loc[train['diff_child'].gt(0), 'diff_child'] = 1
# Cap child_num at 2.
train.loc[train['child_num'].ge(2), 'child_num'] = 2
# NOTE(review): the condition is on family_size but the assignment targets
# child_num - possibly 'family_size' was intended. The test-set cell does the
# same, so the behaviour is kept as is; confirm before changing either.
train.loc[train['family_size'].ge(5), 'child_num'] = 5

In [655]:
#train['total_phone'] = train['work_phone'] + train['phone']

In [656]:
# Income features (test): rescale to units of 10,000, then add the squared
# deviation from the column mean and the natural log.
# NOTE(review): the mean used here is the test set's own mean, not the
# training mean - confirm this is intended.
# NOTE: 'income_total' is overwritten here, so re-running this cell divides
# by 10,000 a second time.
test_income = test['income_total'] / 10000
test['income_total'] = test_income
test['income_total_dev'] = (test_income - test_income.mean()) ** 2
test['income_total_log'] = test_income.apply(np.log)
#test['income_total_minmax'] = test['income_total'].apply(minmax)

In [657]:
# Employment features (test).
test_days_employed = test['DAYS_EMPLOYED']
# career: 1 where DAYS_EMPLOYED is negative, otherwise 0.
test['career'] = test_days_employed.apply(lambda days: 1 if days < 0 else 0)
# DAYS_EMPLOYED_log: log1p of the sign-flipped value; non-negative inputs
# are replaced by 0 first.
test_days_capped = test_days_employed.map(lambda days: days if days < 0 else 0)
test['DAYS_EMPLOYED_log'] = test_days_capped.apply(lambda days: np.log1p(days * -1))
# Zero out non-negative values, then convert the column with days_to_age.
test.loc[test['DAYS_EMPLOYED'].ge(0), 'DAYS_EMPLOYED'] = 0
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].map(days_to_age)
# test['DAYS_EMPLOYED_one'] = test['DAYS_EMPLOYED']
# test.loc[test['DAYS_EMPLOYED_one'] > 0,'DAYS_EMPLOYED']=0
# test.loc[test['DAYS_EMPLOYED_one'] < 0,'DAYS_EMPLOYED']=1
#test.loc[test['DAYS_EMPLOYED'] >= 0,'DAYS_EMPLOYED']=0
#test['DAYS_EMPLOYED_dev'] = (test['DAYS_EMPLOYED'] - test['DAYS_EMPLOYED'].mean())**2

In [658]:
# Convert DAYS_BIRTH (test) with days_to_age.
test['DAYS_BIRTH'] = test['DAYS_BIRTH'].map(days_to_age)

In [659]:
# Household features (test).
# diff_child starts as family_size - child_num and is then collapsed to a
# flag: negative differences become 0, positive differences become 1.
test['diff_child'] = test['family_size'].sub(test['child_num'])
test.loc[test['diff_child'].lt(0), 'diff_child'] = 0
test.loc[test['diff_child'].gt(0), 'diff_child'] = 1
# Cap child_num at 2.
test.loc[test['child_num'].ge(2), 'child_num'] = 2
# NOTE(review): the condition is on family_size but the assignment targets
# child_num - possibly 'family_size' was intended. The training-set cell does
# the same, so the behaviour is kept as is; confirm before changing either.
test.loc[test['family_size'].ge(5), 'child_num'] = 5

In [660]:
# Flip the sign of begin_month (test).
test['begin_month'] = test['begin_month'].map(minus)
#test['begin_month'] = test['begin_month']//12
# test.loc[train['begin_month'] < 0,'begin_month']=0
# test.loc[train['begin_month'] > 0,'begin_month']=1

In [661]:
#test['total_phone'] = test['work_phone'] + test['phone']

In [662]:
# Apply after the sign has been flipped.
# Binning helper.
def make_bin(df, variable, n):
    """Add a bin-label column for `variable` to `df`, modifying `df` in place.

    np.histogram splits the value range of df[variable] into `n` bins; pd.cut
    then labels each row '0' .. str(n - 1) using those bin edges (the lowest
    edge is included). The labels are stored in a new column named
    '<variable>_bin'. Nothing is returned.
    """
    _, edges = np.histogram(df[variable], bins=n)
    labels = list(map(str, range(n)))
    df['%s_bin' % variable] = pd.cut(
        x=df[variable],
        bins=edges,
        labels=labels,
        include_lowest=True,
    )
    

In [663]:
#make_bin(train, 'income_total', n=7)
# Bin the continuous training features: (column, number of bins).
for bin_column, bin_count in (('income_total_log', 25),
                              ('DAYS_BIRTH', 10),
                              ('DAYS_EMPLOYED_log', 20)):
    make_bin(train, bin_column, n=bin_count)
#make_bin(train, 'DAYS_EMPLOYED', n=2)
#make_bin(train, 'child_num', n=2)

In [664]:
#make_bin(test, 'income_total', n=7)
# Bin the continuous test features: (column, number of bins).
# NOTE(review): make_bin derives the bin edges from the frame it is given, so
# the edges here come from the test data and a given bin label may cover a
# different value range than in train - confirm this is acceptable.
for bin_column, bin_count in (('income_total_log', 25),
                              ('DAYS_BIRTH', 10),
                              ('DAYS_EMPLOYED_log', 20)):
    make_bin(test, bin_column, n=bin_count)
#make_bin(train, 'DAYS_EMPLOYED', n=2)
#make_bin(train, 'child_num', n=2)

In [665]:
# Second encoder: one-hot the three bin-label columns (fitted on train).
object_col = ['income_total_log_bin', 'DAYS_BIRTH_bin',  'DAYS_EMPLOYED_log_bin']
enc = OneHotEncoder()
binned_train = train[object_col]
enc.fit(binned_train)

OneHotEncoder()

In [666]:
# One-hot encode the bin-label columns of train, replace the originals with
# the encoded columns, and drop the rescaled 'income_total' column.
# `OneHotEncoder.get_feature_names` was removed in scikit-learn 1.2, so use
# `get_feature_names_out` when the installed version provides it and fall
# back to the old name otherwise.
_onehot_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:, object_col]).toarray(),
                               columns=_onehot_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)
train = train.drop(['income_total'], axis=1)

In [667]:
# One-hot encode the bin-label columns of test with the encoder fitted on
# train, replace the originals with the encoded columns, and drop the
# rescaled 'income_total' column.
# `OneHotEncoder.get_feature_names` was removed in scikit-learn 1.2, so use
# `get_feature_names_out` when the installed version provides it and fall
# back to the old name otherwise.
_onehot_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:, object_col]).toarray(),
                              columns=_onehot_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)
test = test.drop(['income_total'], axis=1)

In [668]:
# Display the shapes of both frames after feature engineering.
train.shape, test.shape

((26457, 109), (10000, 108))

In [669]:
from bayes_opt import BayesianOptimization
import xgboost as xgb
import catboost as cat

In [670]:
# Full training DMatrix: every column except 'credit' as features, 'credit'
# as the label.
dtrain = xgb.DMatrix(data=train.drop(columns=['credit']), label=train['credit'])

In [671]:
def xgb_evaluate(max_depth, subsample, colsample_bytree, learning_rate):
    """Objective for the Bayesian optimiser: negated cross-validated mlogloss.

    Runs 5-fold `xgb.cv` on the module-level `dtrain` with the given
    hyper-parameters (`max_depth` is truncated to an int and `learning_rate`
    is passed as 'eta') and returns -1.0 times the last row of
    'test-mlogloss-mean'. The parameter set requests GPU training
    ('gpu_hist' / 'gpu_predictor' on gpu_id 0).
    """
    fixed_params = {
        'eval_metric': 'mlogloss',
        'objective': 'multi:softprob',
        'gpu_id': 0,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'num_class': 3,
    }
    tuned_params = {
        'max_depth': int(max_depth),
        'subsample': subsample,
        'eta': learning_rate,
        'colsample_bytree': colsample_bytree,
    }
    params = {**fixed_params, **tuned_params}

    # Used around 1000 boosting rounds in the full model
    cv_result = xgb.cv(
        params,
        dtrain,
        num_boost_round=200,
        nfold=5,
        early_stopping_rounds=50,
    )
    # Bayesian optimization only knows how to maximize, not minimize, so
    # return the negated mlogloss.
    final_test_logloss = cv_result['test-mlogloss-mean'].iloc[-1]
    return -1.0 * final_test_logloss

In [122]:
# Search space for the hyper-parameters tuned by xgb_evaluate.
xgb_pbounds = {
    'max_depth': (4, 12),
    'subsample': (0.4, 1.0),
    'colsample_bytree': (0.4, 1.0),
    'learning_rate': (0.01, 0.2),
}
# The seed must be given to the constructor for the sampled points to be
# reproducible. Extra keyword arguments to `maximize` are only forwarded to
# the Gaussian-process regressor (in the bayes_opt releases that accept
# `acq` / `xi` here), so `random_state` there does not seed the search.
xgb_bo = BayesianOptimization(xgb_evaluate, xgb_pbounds, random_state=409)
# Use the expected improvement acquisition function to handle negative numbers
# Optimally needs quite a few more initiation points and number of iterations
xgb_bo.maximize(init_points=5, n_iter=5, acq='ei', xi=0.01, random_state=409)

|   iter    |  target   | colsam... | learni... | max_depth | subsample |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.7692  [0m | [0m 0.4027  [0m | [0m 0.1534  [0m | [0m 4.077   [0m | [0m 0.689   [0m |
| [95m 2       [0m | [95m-0.75    [0m | [95m 0.5543  [0m | [95m 0.1028  [0m | [95m 6.942   [0m | [95m 0.4016  [0m |
| [95m 3       [0m | [95m-0.7367  [0m | [95m 0.8786  [0m | [95m 0.1856  [0m | [95m 7.435   [0m | [95m 0.8552  [0m |
| [0m 4       [0m | [0m-0.7399  [0m | [0m 0.8083  [0m | [0m 0.02837 [0m | [0m 10.22   [0m | [0m 0.6205  [0m |
| [0m 5       [0m | [0m-0.7378  [0m | [0m 0.4955  [0m | [0m 0.08542 [0m | [0m 7.395   [0m | [0m 0.5738  [0m |
| [95m 6       [0m | [95m-0.7286  [0m | [95m 0.5128  [0m | [95m 0.07296 [0m | [95m 8.259   [0m | [95m 0.8375  [0m |
| [95m 7       [0m | [95m-0.7167  [0m | [95m 0.4376  [0m | [95m 0.05131 [0m | [95m 12.0    [

In [581]:
# Take the parameter set of the best-scoring optimisation step and display it.
params = xgb_bo.max['params']
params

{'colsample_bytree': 0.437585287361525,
 'learning_rate': 0.051310789411336766,
 'max_depth': 11.99880897321103,
 'subsample': 0.9687212370190348}

In [672]:
# Override the tuned values by hand and add the fixed training settings.
params.update({
    'max_depth': 12,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob',
    'num_class': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.3,
    'min_child_weight': 1.1,
    'learning_rate': 0.04,
    'alpha': 0,
    'gamma': 0,
})
params

{'colsample_bytree': 0.3,
 'learning_rate': 0.04,
 'max_depth': 12,
 'subsample': 0.8,
 'eval_metric': 'mlogloss',
 'objective': 'multi:softprob',
 'num_class': 3,
 'min_child_weight': 1.1,
 'alpha': 0,
 'gamma': 0}

In [None]:
# 5-fold cross-validation with the final parameters: up to 1000 rounds,
# early stopping after 50 rounds, progress reported every 50 rounds.
cv_settings = dict(
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=50,
    verbose_eval=50,
)
model = xgb.cv(params, dtrain, **cv_settings)
model

In [638]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from tensorflow.keras.utils import to_categorical

# Features / target used by the fold loop, and the test set as a DMatrix.
train_y = train['credit']
train_x = train.drop(columns=['credit'])
dtest2 = xgb.DMatrix(data=test)

def run_kfold():
    """Train XGBoost on 10 stratified folds and average the test predictions.

    For each fold a booster is trained for 350 rounds with the module-level
    `params`, the log loss on the held-out part is printed, and the predicted
    probabilities for `dtest2` are accumulated. After the loop the mean fold
    log loss is printed.

    Returns:
        Array of shape (test.shape[0], 3): the test probabilities averaged
        over the folds.
    """
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=55)
    fold_losses = []
    test_prob_sum = np.zeros((test.shape[0], 3))

    for fold_no, (fit_idx, hold_idx) in enumerate(splitter.split(train_x, train_y)):
        fit_matrix = xgb.DMatrix(train_x.iloc[fit_idx], train_y.iloc[fit_idx])
        hold_matrix = xgb.DMatrix(train_x.iloc[hold_idx])
        hold_target = train_y.iloc[hold_idx]

        booster = xgb.train(params, fit_matrix, num_boost_round=350, verbose_eval=50)

        hold_prob = booster.predict(hold_matrix)
        test_prob = booster.predict(dtest2)

        # log_loss is given the one-hot encoded held-out labels.
        fold_loss = log_loss(to_categorical(hold_target), hold_prob)
        fold_losses.append(fold_loss)
        print(f"FOLD {fold_no} : logloss:{fold_loss}")

        test_prob_sum += test_prob

    print("Mean:{}".format(np.mean(fold_losses)))
    return test_prob_sum / splitter.n_splits

# Run the fold loop; the result is the fold-averaged test probability array.
my_submission = run_kfold()

FOLD 0 : logloss:0.7026823425689432
FOLD 1 : logloss:0.6923813601485534
FOLD 2 : logloss:0.6837733398773795
FOLD 3 : logloss:0.7029009338599247
FOLD 4 : logloss:0.7143285191879775
FOLD 5 : logloss:0.67391279505394
FOLD 6 : logloss:0.6757419324773544
FOLD 7 : logloss:0.6850064412354437
FOLD 8 : logloss:0.704275892658726
FOLD 9 : logloss:0.6834853166356922
Mean:0.6918488873703935


In [177]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [178]:
# Base learners for the stacked ensemble: a configured XGBoost classifier
# and a CatBoost classifier with default settings.
xgb_clf_settings = {
    'colsample_bytree': 0.4,
    'subsample': 0.98,
    'learning_rate': 0.09,
    'objective': 'multi:softprob',
    'max_depth': 10,
}
clf1 = xgb.XGBClassifier(**xgb_clf_settings)
clf2 = CatBoostClassifier()

In [179]:
# Combine the models: the base learners' predict_proba outputs (5-fold CV)
# are fed to a logistic-regression final estimator, which is trained on them.
# NOTE(review): the estimator names 'rf' and 'lgbm' do not match the models
# passed in (clf1 is an XGBClassifier, clf2 a CatBoostClassifier).
base_estimators = [('rf', clf1), ('lgbm', clf2)]
clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1,
)


In [55]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from tensorflow.keras.utils import to_categorical
def run_kfold(clf):
    """Fit `clf` on 5 stratified folds and average its test probabilities.

    Note: this redefines the parameterless `run_kfold` from the earlier
    XGBoost cell.

    Args:
        clf: estimator exposing `fit(X, y)` and `predict_proba(X)`.

    Returns:
        Array of shape (test.shape[0], 3): `clf.predict_proba(test)` averaged
        over the folds. The per-fold and mean log loss are printed.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=55)
    outcomes = []
    sub = np.zeros((test.shape[0], 3))
    for n_fold, (train_index, val_index) in enumerate(folds.split(train_x, train_y)):
        X_train, X_val = train_x.iloc[train_index], train_x.iloc[val_index]
        y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

        clf.fit(X_train, y_train)

        predictions = clf.predict_proba(X_val)

        # train_y is defined as the Series train['credit'], so indexing
        # y_val['credit'] fails. Flattening works for a Series and for a
        # single-column DataFrame alike.
        y_val_flat = np.asarray(y_val).ravel()
        logloss = log_loss(to_categorical(y_val_flat), predictions)
        outcomes.append(logloss)
        print(f"FOLD {n_fold} : logloss:{logloss}")

        sub += clf.predict_proba(test)

    mean_outcome = np.mean(outcomes)

    print("Mean:{}".format(mean_outcome))
    return sub / folds.n_splits

# Run the fold loop with the stacked classifier defined above.
my_submission = run_kfold(clf)

FOLD 0 : logloss:0.7365110742085077


KeyboardInterrupt: 

In [183]:
# Display the averaged class-probability array.
my_submission

array([[0.04819474, 0.09246179, 0.85934347],
       [0.2432932 , 0.23357293, 0.52313386],
       [0.05332712, 0.08692874, 0.85974413],
       ...,
       [0.02024993, 0.07179929, 0.90795078],
       [0.13012759, 0.24752395, 0.62234845],
       [0.06288433, 0.21253935, 0.72457634]])

In [184]:
# Fill the probability columns of the sample submission with the averaged
# predictions. Every column after the first is selected by position:
# `.loc[:, 1:]` with an integer slice on string column labels was deprecated
# and raises TypeError in pandas 2.x, so the columns are picked through
# `submission.columns[1:]` instead.
submission = pd.read_csv('./open/sample_submission.csv')
submission[submission.columns[1:]] = my_submission
submission

Unnamed: 0,index,0,1,2
0,26457,0.048195,0.092462,0.859343
1,26458,0.243293,0.233573,0.523134
2,26459,0.053327,0.086929,0.859744
3,26460,0.109893,0.117643,0.772465
4,26461,0.066512,0.127067,0.806421
...,...,...,...,...
9995,36452,0.115962,0.301663,0.582375
9996,36453,0.224034,0.301054,0.474912
9997,36454,0.020250,0.071799,0.907951
9998,36455,0.130128,0.247524,0.622348


In [185]:
# Write the submission without the DataFrame index.
# NOTE(review): the trailing number is presumably a recorded score - confirm.
submission.to_csv('./submit/10fold_xgb_0.6958.csv', index=False) # 0.7272812144

In [186]:
# Preview the first 20 rows of the submission.
submission.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.048195,0.092462,0.859343
1,26458,0.243293,0.233573,0.523134
2,26459,0.053327,0.086929,0.859744
3,26460,0.109893,0.117643,0.772465
4,26461,0.066512,0.127067,0.806421
5,26462,0.045072,0.08272,0.872208
6,26463,0.620859,0.355267,0.023874
7,26464,0.086823,0.109106,0.80407
8,26465,0.051528,0.163646,0.784826
9,26466,0.058361,0.329386,0.612253


In [74]:
# Reload a saved submission file and show the row whose 'index' is 31375.
submission = pd.read_csv('./submit/10fold_xgb_0.6974.csv')  # 0.7272812144
submission.loc[submission['index'].eq(31375)]

Unnamed: 0,index,0,1,2
4918,31375,0.0582,0.072123,0.869677
