In [1]:
import gc
import time
from datetime import datetime, timedelta,date
import warnings
import itertools
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection,johnson_lindenstrauss_min_dim
from sklearn.decomposition import PCA, FastICA,NMF,LatentDirichletAllocation,IncrementalPCA,MiniBatchSparsePCA
from sklearn.decomposition import TruncatedSVD,FactorAnalysis,KernelPCA

from tqdm import tqdm
import seaborn as sns
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.model_selection import StratifiedKFold, KFold

from scipy.stats import ks_2samp
from functools import wraps
import functools
#settings
warnings.filterwarnings('ignore')

from sklearn.metrics import roc_auc_score

In [2]:
def time_decorator(func):
    
    @wraps(func)
    def wrapper(*args, **kwargs):
        print("\nStartTime: ", datetime.now() + timedelta(hours=9))
        start_time = time.time()
        
        df = func(*args, **kwargs)
        
        print("EndTime: ", datetime.now() + timedelta(hours=9))  
        print("TotalTime: ", time.time() - start_time)
        return df
        
    return wrapper

class SklearnWrapper(object):
    def __init__(self, clf, params=None, **kwargs):
        """
        params['random_state'] = kwargs.get('seed', 0)
        self.clf = clf(**params)
        self.is_classification_problem = True
        """
        if 'random_state' in params:
            params['random_state'] = kwargs.get('seed', 0)
        self.clf = clf(**params)
        self.is_classification_problem = True
    @time_decorator
    def train(self, x_train, y_train, x_cross=None, y_cross=None):
        if len(np.unique(y_train)) > 30:
            self.is_classification_problem = False
            
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        if self.is_classification_problem is True:
            return self.clf.predict_proba(x)[:,1]
        else:
            return self.clf.predict(x)
    

class CatboostWrapper(object):
    def __init__(self, params=None, **kwargs):
        """
        seed
        num_rounds
        ealry_stopping
        eval_function
        verbose_eval
        """
        try:
            if params is None:
                raise("Parameter를 입력하세요!!")
            self.param = params
            seed = kwargs.get('seed', None)
            
            if seed is not None:
                self.param['random_seed'] = seed
                
            num_rounds = kwargs.get('num_rounds', None)
            if num_rounds is not None:
                self.param['num_boost_round'] = num_rounds
            
            early_stopping = kwargs.get('ealry_stopping', None)
            if early_stopping is not None:
                self.param['early_stopping_rounds'] = early_stopping
            
            eval_function = kwargs.get('eval_function', None)
            if eval_function is not None:
                self.param['eval_metric'] = eval_function
            
            verbose_eval = kwargs.get('verbose_eval', 100)
            if verbose_eval is not None:
                self.param['verbose'] = verbose_eval
                
            self.best_round = 0
            
            self.is_classification_problem = True
        except BaseException as e:
            print(e)
            
    @time_decorator
    def train(self, x_train, y_train, x_cross=None, y_cross=None):
        """
        x_cross or y_cross is None
        -> model train limted num_rounds
        
        x_cross and y_cross is Not None
        -> model train using validation set
        """
        if isinstance(y_train, pd.DataFrame) is True:
            y_train = y_train[y_train.columns[0]]
            if y_cross is not None:
                y_cross = y_cross[y_cross.columns[0]]

        if x_cross is None:
            train_round = self.clf.tree_count_
            if self.best_round > 0:
                train_round = self.best_round
            
            self.param['iterations'] = train_round
            self.clf = cb.CatBoostClassifier(**self.param)
            self.clf.fit(x_train, y_train, use_best_model=True)
        else:
            self.clf = cb.CatBoostClassifier(**self.param)
            self.clf.fit(x_train, y_train,
                         eval_set=[(x_cross, y_cross)],
                         use_best_model=True)
            self.best_round = max(self.best_round, self.clf.tree_count_)
            
        gc.collect()
    
    def predict(self, x):
        if self.is_classification_problem is True:
            return self.clf.predict_proba(x)[:,1]
        else:
            return self.clf.predict(x)
        
    def get_params(self):
        return self.param
    
    
class XgbWrapper(object):
    def __init__(self, params=None, **kwargs):
        self.param = params
        self.param['seed'] = kwargs.get('seed', 0)
        self.num_rounds = kwargs.get('num_rounds', 1000)
        self.early_stopping = kwargs.get('ealry_stopping', 100)

        self.eval_function = kwargs.get('eval_function', None)
        self.verbose_eval = kwargs.get('verbose_eval', 100)
        self.best_round = 0
    
    @time_decorator
    def train(self, x_train, y_train, x_cross=None, y_cross=None):
        need_cross_validation = True
        
        if isinstance(y_train, pd.DataFrame) is True:
            y_train = y_train[y_train.columns[0]]
            if y_cross is not None:
                y_cross = y_cross[y_cross.columns[0]]
                

        if x_cross is None:
            dtrain = xgb.DMatrix(x_train, label=y_train, silent= True)
            train_round = self.best_round
            if self.best_round == 0:
                train_round = self.num_rounds
            
            print(train_round)
            self.clf = xgb.train(self.param, dtrain, train_round)
            del dtrain
        else:
            dtrain = xgb.DMatrix(x_train, label=y_train, silent=True)
            dvalid = xgb.DMatrix(x_cross, label=y_cross, silent=True)
            watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

            self.clf = xgb.train(self.param, dtrain, self.num_rounds, watchlist, feval=self.eval_function,
                                 early_stopping_rounds=self.early_stopping,
                                 verbose_eval=self.verbose_eval)
            self.best_round = max(self.best_round, self.clf.best_iteration)

    def predict(self, x):
        return self.clf.predict(xgb.DMatrix(x), ntree_limit=self.best_round)

    def get_params(self):
        return self.param
    
    
class LgbmWrapper(object):
    def __init__(self, params=None, **kwargs):
        self.param = params
        self.param['seed'] = kwargs.get('seed', 0)
        self.num_rounds = kwargs.get('num_rounds', 1000)
        self.early_stopping = kwargs.get('ealry_stopping', 100)

        self.eval_function = kwargs.get('eval_function', None)
        self.verbose_eval = kwargs.get('verbose_eval', 100)
        self.best_round = 0
        
    @time_decorator
    def train(self, x_train, y_train, x_cross=None, y_cross=None):
        """
        x_cross or y_cross is None
        -> model train limted num_rounds
        
        x_cross and y_cross is Not None
        -> model train using validation set
        """
        if isinstance(y_train, pd.DataFrame) is True:
            y_train = y_train[y_train.columns[0]]
            if y_cross is not None:
                y_cross = y_cross[y_cross.columns[0]]

        if x_cross is None:
            dtrain = lgb.Dataset(x_train, label=y_train, silent= True)
            train_round = self.best_round
            if self.best_round == 0:
                train_round = self.num_rounds
                
            self.clf = lgb.train(self.param, train_set=dtrain, num_boost_round=train_round)
            del dtrain   
        else:
            dtrain = lgb.Dataset(x_train, label=y_train, silent=True)
            dvalid = lgb.Dataset(x_cross, label=y_cross, silent=True)
            self.clf = lgb.train(self.param, train_set=dtrain, num_boost_round=self.num_rounds, valid_sets=[dtrain, dvalid],
                                  feval=self.eval_function, early_stopping_rounds=self.early_stopping,
                                  verbose_eval=self.verbose_eval)
            self.best_round = max(self.best_round, self.clf.best_iteration)
            del dtrain, dvalid
            
        gc.collect()
    
    def predict(self, x):
        return self.clf.predict(x, num_iteration=self.clf.best_iteration)
    
    def plot_importance(self):
        lgb.plot_importance(self.clf, max_num_features=50, height=0.7, figsize=(10,30))
        plt.show()
        
    def get_params(self):
        return self.param

In [3]:
@time_decorator
def get_oof(clf, x_train, y_train, x_test, eval_func, **kwargs):
    nfolds = kwargs.get('NFOLDS', 5)
    kfold_shuffle = kwargs.get('kfold_shuffle', True)
    kfold_random_state = kwargs.get('kfold_random_state', 0)
    stratified_kfold_ytrain = kwargs.get('stratifed_kfold_y_value', None)
    inner_predict = kwargs.get('inner_predict', False)
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    
    kf_split = None
    if stratified_kfold_ytrain is None:
        kf = KFold(n_splits=nfolds, shuffle=kfold_shuffle, random_state=kfold_random_state)
        kf_split = kf.split(x_train)
    else:
        kf = StratifiedKFold(n_splits=nfolds, shuffle=kfold_shuffle, random_state=kfold_random_state)
        kf_split = kf.split(x_train, stratified_kfold_ytrain)
        
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))

    cv_sum = 0
    
    # before running model, print model param
    # lightgbm model and xgboost model use get_params()
    try:
        if clf.clf is not None:
            print(clf.clf)
    except:
        print(clf)
        print(clf.get_params())

    for i, (train_index, cross_index) in enumerate(kf_split):
        x_tr, x_cr = None, None
        y_tr, y_cr = None, None
        if isinstance(x_train, pd.DataFrame):
            x_tr, x_cr = x_train.iloc[train_index], x_train.iloc[cross_index]
            y_tr, y_cr = y_train.iloc[train_index], y_train.iloc[cross_index]
        else:
            x_tr, x_cr = x_train[train_index], x_train[cross_index]
            y_tr, y_cr = y_train[train_index], y_train[cross_index]

        clf.train(x_tr, y_tr, x_cr, y_cr)
        
        oof_train[cross_index] = clf.predict(x_cr)
        if inner_predict is True:
            oof_test += clf.predict(x_test)
        
        cv_score = eval_func(y_cr, oof_train[cross_index])
        
        print('Fold %d / ' % (i+1), 'CV-Score: %.6f' % cv_score)
        cv_sum = cv_sum + cv_score
        
        del x_tr, x_cr, y_tr, y_cr
        
    gc.collect()
    
    score = cv_sum / nfolds
    print("Average CV-Score: ", score)
    
    if inner_predict is True:
        oof_test = oof_test/nfolds
    else:
        # Using All Dataset, retrain
        clf.train(x_train, y_train)
        oof_test = clf.predict(x_test)

    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1), score

In [4]:
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(200):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
            x1[:,c*2+200] = x1[ids][:,c*2+200]
            con_index = (c*2+1)
            x1[:,con_index+200] = x1[ids][:,con_index+200]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(200):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
            x1[:,c*2+200] = x1[ids][:,c*2+200]
            con_index = (c*2+1)
            x1[:,con_index+200] = x1[ids][:,con_index+200]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

@time_decorator
def get_oof_agumentation(clf, x_train, y_train, x_test, eval_func, **kwargs):
    """
    nfolds = kwargs.get('NFOLDS', 5)
    kfold_shuffle = kwargs.get('kfold_shuffle', True)
    kfold_random_state = kwargs.get('kfold_random_state', 0)
    stratified_kfold_ytrain = kwargs.get('stratifed_kfold_y_value', None)
    inner_predict = kwargs.get('inner_predict', False)
    agumentation_number = kwargs.get('agumentation_number', 5)
    is_bagging_rank = kwargs.get('is_bagging_rank', False)
    """
    nfolds = kwargs.get('NFOLDS', 5)
    kfold_shuffle = kwargs.get('kfold_shuffle', True)
    kfold_random_state = kwargs.get('kfold_random_state', 0)
    stratified_kfold_ytrain = kwargs.get('stratifed_kfold_y_value', None)
    agumentation_number = kwargs.get('agumentation_number', 5)
    is_bagging_rank = kwargs.get('is_bagging_rank', False)
    
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    
    kf_split = None
    if stratified_kfold_ytrain is None:
        kf = KFold(n_splits=nfolds, shuffle=kfold_shuffle, random_state=kfold_random_state)
        kf_split = kf.split(x_train)
    else:
        kf = StratifiedKFold(n_splits=nfolds, shuffle=kfold_shuffle, random_state=kfold_random_state)
        kf_split = kf.split(x_train, stratified_kfold_ytrain)
        
    oof_train = np.zeros((ntrain,))
    
    if is_bagging_rank is True:
        oof_test = pd.DataFrame()
    else:
        oof_test = np.zeros((ntest,))
    
    cv_sum = 0
    
    # before running model, print model param
    # lightgbm model and xgboost model use get_params()
    try:
        if clf.clf is not None:
            print(clf.clf)
    except:
        print(clf)
        print(clf.get_params())

    for i, (train_index, cross_index) in enumerate(kf_split):
        x_tr, x_cr = None, None
        y_tr, y_cr = None, None
        if isinstance(x_train, pd.DataFrame):
            x_tr, x_cr = x_train.iloc[train_index], x_train.iloc[cross_index]
            y_tr, y_cr = y_train.iloc[train_index], y_train.iloc[cross_index]
        else:
            x_tr, x_cr = x_train[train_index], x_train[cross_index]
            y_tr, y_cr = y_train[train_index], y_train[cross_index]
        
        if is_bagging_rank is True:
            aug_valid = pd.DataFrame()
            aug_test = pd.DataFrame()
        else:
            aug_valid, aug_test = 0,0
        for aug_index in range(agumentation_number):
            print("\nAgumentation - Fold {} Aug {} Start!".format(i, aug_index))
            x_tr_aug, y_tr_aug = augment(x_tr.values, y_tr.values)
            clf.train(x_tr_aug, y_tr_aug, x_cr, y_cr)
            
            aug_valid_pred = clf.predict(x_cr)
            aug_test_pred = clf.predict(x_test)
            if is_bagging_rank is True:
                aug_valid[aug_index] = aug_valid_pred
                aug_test[aug_index] = aug_test_pred
            else:
                aug_valid += aug_valid_pred
                aug_test += aug_test_pred
                
            print("\nAgumentation - Fold {} Aug {} CV Score: {:.6f}".format(i, aug_index, roc_auc_score(y_cr, aug_valid_pred)))
        
        if is_bagging_rank is True:
            oof_train[cross_index] = (1 - aug_valid.rank(ascending=False).mean(axis=1)/aug_valid.shape[0])
            oof_test[i] = (1 - aug_test.rank(ascending=False).mean(axis=1)/aug_test.shape[0])
        else:
            oof_train[cross_index] = aug_valid/agumentation_number
            oof_test += (aug_test/agumentation_number)

        cv_score = eval_func(y_cr, oof_train[cross_index])
        
        print('Fold %d / ' % (i+1), 'CV-Score: %.6f' % cv_score)
        cv_sum = cv_sum + cv_score
        
        del x_tr, x_cr, y_tr, y_cr
        
    gc.collect()
    
    score = cv_sum / nfolds
    print("Average CV-Score: ", score)
    print("OOF CV-Score: ", eval_func(y_train, oof_train))
    
    if is_bagging_rank is True:
        test_pred = (1 - oof_test.rank(ascending=False).mean(axis=1)/oof_test.shape[0])
        test_pred = test_pred.values
    else:
        test_pred = oof_test/nfolds
    
    return oof_train.reshape(-1, 1), test_pred.reshape(-1, 1), score


In [5]:
@time_decorator
def kfold_test(clf, x_train, y_train, eval_func, **kwargs):
    nfolds = kwargs.get('NFOLDS', 5)
    kfold_shuffle = kwargs.get('kfold_shuffle', True)
    kfold_random_state = kwargs.get('kfold_random_sate', 0)
    stratified_kfold_ytrain = kwargs.get('stratifed_kfold_y_value', None)
    

    kf_split = None
    if stratified_kfold_ytrain is None:
        kf = KFold(n_splits=nfolds, shuffle=kfold_shuffle, random_state=kfold_random_state)
        kf_split = kf.split(x_train)
    else:
        kf = StratifiedKFold(n_splits=nfolds, shuffle=kfold_shuffle, random_state=kfold_random_state)
        kf_split = kf.split(x_train, stratified_kfold_ytrain)
        
    cv_sum = 0
    try:
        if clf.clf is not None:
            print(clf.clf)
    except:
        print(clf)
        print(clf.get_params())

    best_rounds = []
    ntrain = x_train.shape[0]
    oof_train = np.zeros((ntrain,))
    
    for i, (train_index, cross_index) in enumerate(kf_split):
        x_tr, x_cr = x_train.iloc[train_index], x_train.iloc[cross_index]
        y_tr, y_cr = y_train.iloc[train_index], y_train.iloc[cross_index]

        clf.train(x_tr, y_tr, x_cr, y_cr)
        
        oof_train[cross_index] = clf.predict(x_cr)
        cv_score = eval_func(y_cr, oof_train[cross_index])
        
        print('Fold %d / ' % (i+1), 'CV-Score: %.6f' % cv_score)
        cv_sum = cv_sum + cv_score
        best_rounds.append(clf.clf.best_iteration)

    score = cv_sum / nfolds
    print("Average CV-Score: ", score)
    print(eval_func(y_train, oof_train))
    return score, np.max(best_rounds)

In [6]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [7]:
def train_col(df):
    train_columns = [col for col in df.columns if col not in ['ID_code','target']]
    print(len(train_columns))
    """
    for col in ['var_7', 'var_10', 'var_17', 'var_27', 'var_30', 'var_38',
           'var_39', 'var_41', 'var_96', 'var_100', 'var_103', 'var_126',
           'var_136', 'var_158', 'var_161', 'var_185']:
        train_columns.remove(col)
    """
    print(len(train_columns))
    return train_columns

### Data Preprocessing

In [8]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [9]:
train_columns = train_col(train)

200
200


In [10]:
train[train_columns] = np.round(train[train_columns],4)
test[train_columns] = np.round(test[train_columns],4)

In [11]:
pb_idx = np.load('./data_temp/public_LB.npy')
pv_idx = np.load('./data_temp/private_LB.npy')

In [12]:
test_pb = test.iloc[pb_idx].sort_index().copy()
test_pv = test.iloc[pv_idx].sort_index().copy()

test_real = test_pb.append(test_pv)

In [13]:
data = pd.concat([train, test_real],sort=False)
data = data.reset_index(drop=True)
data[train_columns] = np.round(data[train_columns],4)

In [14]:
unique_df = data[['ID_code']]
con_df = data[['ID_code']]
for col in tqdm(train_columns):
    unique_df[col] = data[col].map(((data[col].value_counts() == 1) * 1).to_dict())
    con_df[col] = data[col].map((~(data[col].value_counts() == 1) * 1).to_dict())
    
for col in tqdm(train_columns):
    data[col + '_unique'] = np.around(data[col] * unique_df[col], 4)
    data[col + '_con'] = np.around(data[col] * con_df[col], 4)

100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [01:13<00:00,  1.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:07<00:00, 26.02it/s]


In [15]:
train_columns = train_col(data)
uniquecol_list = [col for col in train_columns if col.find('unique')!= -1]
for col in uniquecol_list:
    data.loc[data[col]==0,col]=np.nan

600
600


In [16]:
train = data[~data.target.isna()]
test = data[data.target.isna()]

In [17]:
train_columns = train_col(train)

600
600


In [18]:
x_train = train.copy()
y_train = train['target']
x_test = test.copy()

### Model

In [19]:
lgbm_param3 = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    #"lambda_l1" : 5,
    #"lambda_l2" : 5,
    "nthread": 8,
    "bagging_seed" : 42,
    "verbosity" : -1,
    "seed": 42
}

lgbm_model3 = LgbmWrapper(params=lgbm_param3, num_rounds = 50000, ealry_stopping=3500,
                                   verbose_eval=1000)

print(len(train_columns))
lgbm_train_aug3, lgbm_test_aug3, lgbm_cv_score_aug3 = get_oof_agumentation(lgbm_model3, x_train[train_columns], y_train, 
                                                                           x_test[train_columns], roc_auc_score, 
                                                                           NFOLDS=5, kfold_random_state=42, 
                                                                           stratifed_kfold_y_value=y_train, 
                                                                           agumentation_number=5 )

x_train_second_layer = pd.DataFrame(lgbm_train_aug3)
x_test_second_layer = pd.DataFrame(lgbm_test_aug3)
lgb_train = pd.concat([train['ID_code'], pd.DataFrame(x_train_second_layer)], axis=1)
lgb_test = pd.concat([test.reset_index(drop=True)['ID_code'], pd.DataFrame(x_test_second_layer)], axis=1)
lgb_train.to_csv(f'input/train_lgb_wonho_prod_cv_{lgbm_cv_score_aug3}.csv', index=False)
lgb_test.to_csv(f'input/test_lgb_wonho_prod_cv_{lgbm_cv_score_aug3}.csv', index=False)

600

StartTime:  2019-04-07 02:12:25.644318
<__main__.LgbmWrapper object at 0x000001E510F97198>
{'objective': 'binary', 'metric': 'auc', 'boosting': 'gbdt', 'max_depth': -1, 'num_leaves': 13, 'learning_rate': 0.01, 'bagging_freq': 5, 'bagging_fraction': 0.4, 'feature_fraction': 0.05, 'min_data_in_leaf': 80, 'min_sum_heassian_in_leaf': 10, 'tree_learner': 'serial', 'boost_from_average': 'false', 'nthread': 8, 'bagging_seed': 42, 'verbosity': -1, 'seed': 0}

Agumentation - Fold 0 Aug 0 Start!


KeyboardInterrupt: 

# Model History

원호님 unique con

In [None]:
Fold 1 /  CV-Score: 0.918565
Fold 2 /  CV-Score: 0.918630
Fold 3 /  CV-Score: 0.925149
Fold 4 /  CV-Score: 0.918599
Fold 5 /  CV-Score: 0.916176
600
Fold 1 /  CV-Score: 0.919139
Fold 2 /  CV-Score: 0.919246
Fold 3 /  CV-Score: 0.925560
Fold 4 /  CV-Score: 0.919683
Fold 5 /  CV-Score: 0.917102
Average CV-Score:  0.9201459354177967
StartTime:  2019-04-02 10:21:24.576633
<__main__.LgbmWrapper object at 0x000002684F1127F0>
{'objective': 'binary', 'metric': 'auc', 'boosting': 'gbdt', 'max_depth': -1, 'num_leaves': 13, 'learning_rate': 0.01, 'bagging_freq': 5, 'bagging_fraction': 0.4, 'feature_fraction': 0.05, 'min_data_in_leaf': 80, 'min_sum_heassian_in_leaf': 10, 'tree_learner': 'serial', 'boost_from_average': 'false', 'nthread': 44, 'bagging_seed': 42, 'verbosity': -1, 'seed': 0}

StartTime:  2019-04-02 10:21:25.040010
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.907899	valid_1's auc: 0.891966
[2000]	training's auc: 0.925801	valid_1's auc: 0.905426
[3000]	training's auc: 0.936271	valid_1's auc: 0.91154
[4000]	training's auc: 0.943512	valid_1's auc: 0.914939
[5000]	training's auc: 0.949263	valid_1's auc: 0.916695
[6000]	training's auc: 0.954219	valid_1's auc: 0.91764
[7000]	training's auc: 0.958745	valid_1's auc: 0.918279
[8000]	training's auc: 0.962872	valid_1's auc: 0.918643
[9000]	training's auc: 0.966675	valid_1's auc: 0.918858
[10000]	training's auc: 0.97014	valid_1's auc: 0.91895
[11000]	training's auc: 0.973361	valid_1's auc: 0.91902
[12000]	training's auc: 0.976274	valid_1's auc: 0.919015
[13000]	training's auc: 0.978945	valid_1's auc: 0.919097
[14000]	training's auc: 0.981459	valid_1's auc: 0.919062
[15000]	training's auc: 0.983649	valid_1's auc: 0.918996
[16000]	training's auc: 0.985654	valid_1's auc: 0.918898
Early stopping, best iteration is:
[12899]	training's auc: 0.97869	valid_1's auc: 0.919139
EndTime:  2019-04-02 10:25:08.818998
TotalTime:  223.77897906303406
Fold 1 /  CV-Score: 0.919139

StartTime:  2019-04-02 10:25:25.794274
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.907364	valid_1's auc: 0.894187
[2000]	training's auc: 0.925377	valid_1's auc: 0.906872
[3000]	training's auc: 0.935924	valid_1's auc: 0.912815
[4000]	training's auc: 0.943083	valid_1's auc: 0.915741
[5000]	training's auc: 0.948861	valid_1's auc: 0.91722
[6000]	training's auc: 0.953989	valid_1's auc: 0.918044
[7000]	training's auc: 0.958508	valid_1's auc: 0.918597
[8000]	training's auc: 0.962649	valid_1's auc: 0.918907
[9000]	training's auc: 0.966461	valid_1's auc: 0.919094
[10000]	training's auc: 0.970017	valid_1's auc: 0.919069
[11000]	training's auc: 0.973299	valid_1's auc: 0.919179
[12000]	training's auc: 0.976237	valid_1's auc: 0.919176
[13000]	training's auc: 0.978965	valid_1's auc: 0.919146
[14000]	training's auc: 0.981442	valid_1's auc: 0.919074
[15000]	training's auc: 0.983648	valid_1's auc: 0.919001
Early stopping, best iteration is:
[12334]	training's auc: 0.977168	valid_1's auc: 0.919246
EndTime:  2019-04-02 10:28:58.490950
TotalTime:  212.69857168197632
Fold 2 /  CV-Score: 0.919246

StartTime:  2019-04-02 10:29:14.493204
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.906475	valid_1's auc: 0.897607
[2000]	training's auc: 0.924184	valid_1's auc: 0.911543
[3000]	training's auc: 0.93475	valid_1's auc: 0.918184
[4000]	training's auc: 0.942014	valid_1's auc: 0.921466
[5000]	training's auc: 0.947833	valid_1's auc: 0.923241
[6000]	training's auc: 0.952986	valid_1's auc: 0.92411
[7000]	training's auc: 0.957593	valid_1's auc: 0.924833
[8000]	training's auc: 0.961815	valid_1's auc: 0.925197
[9000]	training's auc: 0.965722	valid_1's auc: 0.925269
[10000]	training's auc: 0.969275	valid_1's auc: 0.925507
[11000]	training's auc: 0.972582	valid_1's auc: 0.925415
[12000]	training's auc: 0.975625	valid_1's auc: 0.925333
[13000]	training's auc: 0.978394	valid_1's auc: 0.925303
Early stopping, best iteration is:
[9775]	training's auc: 0.968467	valid_1's auc: 0.92556
EndTime:  2019-04-02 10:32:12.655822
TotalTime:  178.1623387336731
Fold 3 /  CV-Score: 0.925560

StartTime:  2019-04-02 10:32:26.191818
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.907943	valid_1's auc: 0.889386
[2000]	training's auc: 0.925719	valid_1's auc: 0.903155
[3000]	training's auc: 0.936133	valid_1's auc: 0.910187
[4000]	training's auc: 0.943344	valid_1's auc: 0.913659
[5000]	training's auc: 0.949066	valid_1's auc: 0.915768
[6000]	training's auc: 0.954016	valid_1's auc: 0.916778
[7000]	training's auc: 0.9584	valid_1's auc: 0.917794
[8000]	training's auc: 0.962452	valid_1's auc: 0.918278
[9000]	training's auc: 0.966195	valid_1's auc: 0.918751
[10000]	training's auc: 0.96964	valid_1's auc: 0.919113
[11000]	training's auc: 0.972922	valid_1's auc: 0.91937
[12000]	training's auc: 0.975888	valid_1's auc: 0.919537
[13000]	training's auc: 0.978579	valid_1's auc: 0.9196
[14000]	training's auc: 0.981051	valid_1's auc: 0.919614
[15000]	training's auc: 0.983308	valid_1's auc: 0.919654
[16000]	training's auc: 0.98536	valid_1's auc: 0.919641
[17000]	training's auc: 0.987254	valid_1's auc: 0.919593
[18000]	training's auc: 0.988905	valid_1's auc: 0.919664
Early stopping, best iteration is:
[14888]	training's auc: 0.983057	valid_1's auc: 0.919683
EndTime:  2019-04-02 10:36:29.128058
TotalTime:  242.93512678146362
Fold 4 /  CV-Score: 0.919683

StartTime:  2019-04-02 10:36:48.824477
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.908833	valid_1's auc: 0.888716
[2000]	training's auc: 0.926489	valid_1's auc: 0.901878
[3000]	training's auc: 0.936652	valid_1's auc: 0.908652
[4000]	training's auc: 0.943709	valid_1's auc: 0.91198
[5000]	training's auc: 0.949327	valid_1's auc: 0.913992
[6000]	training's auc: 0.954268	valid_1's auc: 0.915179
[7000]	training's auc: 0.958705	valid_1's auc: 0.916008
[8000]	training's auc: 0.962833	valid_1's auc: 0.916351
[9000]	training's auc: 0.966562	valid_1's auc: 0.916696
[10000]	training's auc: 0.969998	valid_1's auc: 0.916923
[11000]	training's auc: 0.973243	valid_1's auc: 0.916969
[12000]	training's auc: 0.976171	valid_1's auc: 0.917066
[13000]	training's auc: 0.978886	valid_1's auc: 0.916995
[14000]	training's auc: 0.981385	valid_1's auc: 0.916894
[15000]	training's auc: 0.983599	valid_1's auc: 0.91693
Early stopping, best iteration is:
[12092]	training's auc: 0.976437	valid_1's auc: 0.917102
EndTime:  2019-04-02 10:40:16.054344
TotalTime:  207.22993779182434
Fold 5 /  CV-Score: 0.917102
Average CV-Score:  0.9201459354177967
EndTime:  2019-04-02 10:40:32.841606
TotalTime:  1148.2649710178375

원호님 피쳐 unique만

In [None]:
[30822]	training's auc: 0.997212	valid_1's auc: 0.912799
EndTime:  2019-03-31 00:59:26.279200
TotalTime:  444.695020198822
Fold 1 /  CV-Score: 0.912799
    
[33555]	training's auc: 0.998189	valid_1's auc: 0.913422
EndTime:  2019-03-31 01:08:06.521572
TotalTime:  479.79382133483887
Fold 2 /  CV-Score: 0.913422
    
[25323]	training's auc: 0.993383	valid_1's auc: 0.919691
EndTime:  2019-03-31 01:15:09.857498
TotalTime:  378.5974488258362
Fold 3 /  CV-Score: 0.919691
    
[34121]	training's auc: 0.99833	valid_1's auc: 0.913545
EndTime:  2019-03-31 01:23:58.103107
TotalTime:  493.46868228912354
Fold 4 /  CV-Score: 0.913545
    
[31306]	training's auc: 0.997418	valid_1's auc: 0.911077
EndTime:  2019-03-31 01:32:15.184116
TotalTime:  451.7067024707794
Fold 5 /  CV-Score: 0.911077
Average CV-Score:  0.9141068663032271

prod 결과

In [None]:
StartTime:  2019-03-30 15:38:32.118556
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.91178	valid_1's auc: 0.90005
[2000]	training's auc: 0.920072	valid_1's auc: 0.904325
[3000]	training's auc: 0.92778	valid_1's auc: 0.907221
[4000]	training's auc: 0.934812	valid_1's auc: 0.909616
[5000]	training's auc: 0.94112	valid_1's auc: 0.91127
[6000]	training's auc: 0.946761	valid_1's auc: 0.912517
[7000]	training's auc: 0.951762	valid_1's auc: 0.913306
[8000]	training's auc: 0.956583	valid_1's auc: 0.913923
[9000]	training's auc: 0.960856	valid_1's auc: 0.914531
[10000]	training's auc: 0.964858	valid_1's auc: 0.914981
[11000]	training's auc: 0.968527	valid_1's auc: 0.915386
[12000]	training's auc: 0.971889	valid_1's auc: 0.915638
[13000]	training's auc: 0.974987	valid_1's auc: 0.915694
[14000]	training's auc: 0.977749	valid_1's auc: 0.915887
[15000]	training's auc: 0.980346	valid_1's auc: 0.915779
[16000]	training's auc: 0.982714	valid_1's auc: 0.91594
[17000]	training's auc: 0.984817	valid_1's auc: 0.91601
[18000]	training's auc: 0.986725	valid_1's auc: 0.916106
[19000]	training's auc: 0.988511	valid_1's auc: 0.916132
[20000]	training's auc: 0.990012	valid_1's auc: 0.916173
[21000]	training's auc: 0.991402	valid_1's auc: 0.916199
[22000]	training's auc: 0.992592	valid_1's auc: 0.916233
[23000]	training's auc: 0.993667	valid_1's auc: 0.916175
[24000]	training's auc: 0.994575	valid_1's auc: 0.916167
[25000]	training's auc: 0.995425	valid_1's auc: 0.916117
Early stopping, best iteration is:
[21862]	training's auc: 0.99242	valid_1's auc: 0.91625
EndTime:  2019-03-30 15:44:10.832150
TotalTime:  338.71357440948486
Fold 1 /  CV-Score: 0.916250
    
[22909]	training's auc: 0.993488	valid_1's auc: 0.916135
EndTime:  2019-03-30 15:50:24.560323
TotalTime:  342.6836714744568
Fold 2 /  CV-Score: 0.916135
    
[21873]	training's auc: 0.991961	valid_1's auc: 0.923999
EndTime:  2019-03-30 15:56:29.454804
TotalTime:  335.19072675704956
Fold 3 /  CV-Score: 0.923999
    
[29033]	training's auc: 0.997673	valid_1's auc: 0.917013
EndTime:  2019-03-30 16:04:03.982766
TotalTime:  425.92881441116333
Fold 4 /  CV-Score: 0.917013
    
[22344]	training's auc: 0.992814	valid_1's auc: 0.913404
EndTime:  2019-03-30 16:10:20.605982
TotalTime:  338.9884412288666
Fold 5 /  CV-Score: 0.913404
Average CV-Score:  0.9173603456204831
EndTime:  2019-03-30 16:10:49.304071
TotalTime:  1937.5096695423126
    
Average CV-Score:  0.9186659561402918

# Submission

In [225]:
x_train_second_layer = pd.DataFrame(lgbm_train_aug3)
x_test_second_layer = pd.DataFrame(lgbm_test_aug3)
lgb_train = pd.concat([train['ID_code'], pd.DataFrame(x_train_second_layer)], axis=1)
lgb_test = pd.concat([test.reset_index(drop=True)['ID_code'], pd.DataFrame(x_test_second_layer)], axis=1)
lgb_train.to_csv(f'input/train_lgb_wonho_prod_cv_{lgbm_cv_score_aug3}.csv', index=False)
lgb_test.to_csv(f'input/test_lgb_wonho_prod_cv_{lgbm_cv_score_aug3}.csv', index=False)

In [226]:
lgb_test = pd.concat([test.reset_index(drop=True)['ID_code'], pd.DataFrame(x_test_second_layer)], axis=1)
lgb_test.columns = ['ID_code','target']

In [227]:
test_temp = pd.read_csv('input/test_lgb_aug_df_v8040q1213_rankbagging_v1_cv_0.901779.csv')
test_temp.columns = ['ID_code','target']

In [228]:
submission = pd.concat([test_temp.iloc[syn_index],lgb_test])

In [229]:
submission = pd.concat([test_temp.iloc[syn_index],lgb_test])
submission['Index'] = submission['ID_code'].apply(lambda x: int(x.split('_')[-1]))
submission = submission.sort_values("Index").reset_index(drop=True)
del submission['Index']
submission

Unnamed: 0,ID_code,target
0,test_0,0.754711
1,test_1,0.883792
2,test_2,0.859432
3,test_3,0.101573
4,test_4,0.555223
5,test_5,0.019554
6,test_6,0.105938
7,test_7,0.088856
8,test_8,0.026151
9,test_9,0.138631


In [230]:
submission.to_csv(f'sub_lgb_wonho_prod_cv_{lgbm_cv_score_aug3}.csv',index=False)