In [1]:
import gc
from datetime import datetime, timedelta,date
import time
import warnings
import itertools
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection,johnson_lindenstrauss_min_dim
from sklearn.decomposition import PCA, FastICA,NMF,LatentDirichletAllocation,IncrementalPCA,MiniBatchSparsePCA
from sklearn.decomposition import TruncatedSVD,FactorAnalysis,KernelPCA

import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.model_selection import StratifiedKFold, KFold

from scipy.stats import ks_2samp
from functools import wraps
import functools
#settings
warnings.filterwarnings('ignore')
np.random.seed(2018)

# Common Utility Func

In [2]:
def get_prefix(group_col, target_col, prefix=None):
    if isinstance(group_col, list) is True:
        g = '_'.join(group_col)
    else:
        g = group_col
    if isinstance(target_col, list) is True:
        t = '_'.join(target_col)
    else:
        t = target_col
    if prefix is not None:
        return prefix + '_' + g + '_' + t
    return g + '_' + t
    
def groupby_helper(df, group_col, target_col, agg_method, prefix_param=None):
    try:
        prefix = get_prefix(group_col, target_col, prefix_param)
        print(group_col, target_col, agg_method)
        group_df = df.groupby(group_col)[target_col].agg(agg_method)
        group_df.columns = ['{}_{}'.format(prefix, m) for m in agg_method]
    except BaseException as e:
        print(e)
    return group_df.reset_index()

In [3]:
def time_decorator(func):
    
    @wraps(func)
    def wrapper(*args, **kwargs):
        print("\nStartTime: ", datetime.now() + timedelta(hours=9))
        start_time = time.time()
        
        df = func(*args, **kwargs)
        
        print("EndTime: ", datetime.now() + timedelta(hours=9))  
        print("TotalTime: ", time.time() - start_time)
        return df
        
    return wrapper

# Stacking Utility

In [39]:
class SklearnWrapper(object):
    def __init__(self, clf, params=None, **kwargs):
        params['random_state'] = kwargs.get('seed', 0)
        self.clf = clf(**params)
        self.is_classification_problem = True
    @time_decorator
    def train(self, x_train, y_train, x_cross=None, y_cross=None):
        if len(np.unique(y_train)) > 30:
            self.is_classification_problem = False
            
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        if self.is_classification_problem is True:
            return self.clf.predict_proba(x)[:,1]
        else:
            return self.clf.predict(x)
    

class CatboostWrapper(object):
    def __init__(self, params=None, **kwargs):
        try:
            if params is None:
                raise("Parameter를 입력하세요!!")
            self.param = params
            seed = kwargs.get('seed', None)
            
            if seed is not None:
                self.param['random_seed'] = seed
                
            num_rounds = kwargs.get('num_rounds', None)
            if num_rounds is not None:
                self.param['num_boost_round'] = num_rounds
            
            early_stopping = kwargs.get('ealry_stopping', None)
            if early_stopping is not None:
                self.param['early_stopping_rounds'] = early_stopping
            
            eval_function = kwargs.get('eval_function', None)
            if eval_function is not None:
                self.param['eval_metric'] = eval_function
            
            verbose_eval = kwargs.get('verbose_eval', 100)
            if verbose_eval is not None:
                self.param['verbose'] = verbose_eval
                
            self.best_round = 0
        except BaseException as e:
            print(e)
         
    @time_decorator
    def train(self, x_train, y_train, x_cross=None, y_cross=None):
        """
        x_cross or y_cross is None
        -> model train limted num_rounds
        
        x_cross and y_cross is Not None
        -> model train using validation set
        """
        if isinstance(y_train, pd.DataFrame) is True:
            y_train = y_train[y_train.columns[0]]
            if y_cross is not None:
                y_cross = y_cross[y_cross.columns[0]]

        if x_cross is None:
            train_round = self.clf.tree_count_
            if self.best_round > 0:
                train_round = self.best_round
            
            self.param['iterations'] = train_round
            self.clf = cb.CatBoostRegressor(**self.param)
            self.clf.fit(x_train, y_train, use_best_model=True)
        else:
            self.clf = cb.CatBoostRegressor(**self.param)
            self.clf.fit(x_train, y_train,
                         eval_set=[(x_train, y_train),(x_cross, y_cross)],
                         use_best_model=True)
            self.best_round = max(self.best_round, self.clf.tree_count_)
            
        gc.collect()
    
    def predict(self, x):
        return self.clf.predict(x)
        
    def get_params(self):
        return self.param
    
    
class XgbWrapper(object):
    def __init__(self, params=None, **kwargs):
        self.param = params
        self.param['seed'] = kwargs.get('seed', 0)
        self.num_rounds = kwargs.get('num_rounds', 1000)
        self.early_stopping = kwargs.get('ealry_stopping', 100)

        self.eval_function = kwargs.get('eval_function', None)
        self.verbose_eval = kwargs.get('verbose_eval', 100)
        self.best_round = 0
    
    @time_decorator
    def train(self, x_train, y_train, x_cross=None, y_cross=None):
        need_cross_validation = True
       
        if isinstance(y_train, pd.DataFrame) is True:
            y_train = y_train[y_train.columns[0]]
            if y_cross is not None:
                y_cross = y_cross[y_cross.columns[0]]
                

        if x_cross is None:
            dtrain = xgb.DMatrix(x_train, label=y_train, silent= True)
            train_round = self.best_round
            if self.best_round == 0:
                train_round = self.num_rounds
            
            print(train_round)
            self.clf = xgb.train(self.param, dtrain, train_round)
            del dtrain
        else:
            dtrain = xgb.DMatrix(x_train, label=y_train, silent=True)
            dvalid = xgb.DMatrix(x_cross, label=y_cross, silent=True)
            watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

            self.clf = xgb.train(self.param, dtrain, self.num_rounds, watchlist, feval=self.eval_function,
                                 early_stopping_rounds=self.early_stopping,
                                 verbose_eval=self.verbose_eval)
            self.best_round = max(self.best_round, self.clf.best_iteration)

    def predict(self, x):
        return self.clf.predict(xgb.DMatrix(x), ntree_limit=self.best_round)

    def get_params(self):
        return self.param
    
    
class LgbmWrapper(object):
    def __init__(self, params=None, **kwargs):
        self.param = params
        self.param['seed'] = kwargs.get('seed', 0)
        self.num_rounds = kwargs.get('num_rounds', 1000)
        self.early_stopping = kwargs.get('ealry_stopping', 100)

        self.eval_function = kwargs.get('eval_function', None)
        self.verbose_eval = kwargs.get('verbose_eval', 100)
        self.best_round = 0
        
    @time_decorator
    def train(self, x_train, y_train, x_cross=None, y_cross=None):
        """
        x_cross or y_cross is None
        -> model train limted num_rounds
        
        x_cross and y_cross is Not None
        -> model train using validation set
        """
        if isinstance(y_train, pd.DataFrame) is True:
            y_train = y_train[y_train.columns[0]]
            if y_cross is not None:
                y_cross = y_cross[y_cross.columns[0]]

        if x_cross is None:
            dtrain = lgb.Dataset(x_train, label=y_train, silent= True)
            train_round = self.best_round
            if self.best_round == 0:
                train_round = self.num_rounds
                
            self.clf = lgb.train(self.param, train_set=dtrain, num_boost_round=train_round)
            del dtrain   
        else:
            dtrain = lgb.Dataset(x_train, label=y_train, silent=True)
            dvalid = lgb.Dataset(x_cross, label=y_cross, silent=True)
            self.clf = lgb.train(self.param, train_set=dtrain, num_boost_round=self.num_rounds, valid_sets=[dtrain, dvalid],
                                  feval=self.eval_function, early_stopping_rounds=self.early_stopping,
                                  verbose_eval=self.verbose_eval)
            self.best_round = max(self.best_round, self.clf.best_iteration)
            del dtrain, dvalid
            
        gc.collect()
    
    def predict(self, x):
        return self.clf.predict(x, num_iteration=self.clf.best_iteration)
    
    def plot_importance(self):
        lgb.plot_importance(self.clf, max_num_features=50, height=0.7, figsize=(10,30))
        plt.show()
        
    def get_params(self):
        return self.param

In [7]:
@time_decorator
def get_oof(clf, x_train, y_train, x_test, eval_func, **kwargs):
    nfolds = kwargs.get('NFOLDS', 5)
    kfold_shuffle = kwargs.get('kfold_shuffle', True)
    kfold_random_state = kwargs.get('kfold_random_state', 0)
    stratified_kfold_ytrain = kwargs.get('stratifed_kfold_y_value', None)
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]

    kf = StratifiedKFold(n_splits=nfolds, shuffle=kfold_shuffle, random_state=kfold_random_state)

    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))

    cv_sum = 0
    
    if stratified_kfold_ytrain is None:
        stratified_kfold_ytrain = y_train
    
    # before running model, print model param
    # lightgbm model and xgboost model use get_params()
    try:
        if clf.clf is not None:
            print(clf.clf)
    except:
        print(clf)
        print(clf.get_params())

    for i, (train_index, cross_index) in enumerate(kf.split(x_train, stratified_kfold_ytrain)):
        x_tr, x_cr = None, None
        y_tr, y_cr = None, None
        if isinstance(x_train, pd.DataFrame):
            x_tr, x_cr = x_train.iloc[train_index], x_train.iloc[cross_index]
            y_tr, y_cr = y_train.iloc[train_index], y_train.iloc[cross_index]
        else:
            x_tr, x_cr = x_train[train_index], x_train[cross_index]
            y_tr, y_cr = y_train[train_index], y_train[cross_index]

        clf.train(x_tr, y_tr, x_cr, y_cr)
        
        oof_train[cross_index] = clf.predict(x_cr)

        cv_score = eval_func(y_cr, oof_train[cross_index])
        
        print('Fold %d / ' % (i+1), 'CV-Score: %.6f' % cv_score)
        cv_sum = cv_sum + cv_score
        
        del x_tr, x_cr, y_tr, y_cr
        
    gc.collect()
    
    score = cv_sum / nfolds
    print("Average CV-Score: ", score)

    # Using All Dataset, retrain
    clf.train(x_train, y_train)
    oof_test = clf.predict(x_test)

    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1), score

In [None]:
@time_decorator
def kfold_test(clf, x_train, y_train, eval_func, **kwargs):
    nfolds = kwargs.get('NFOLDS', 5)
    kfold_shuffle = kwargs.get('kfold_shuffle', True)
    kfold_random_state = kwargs.get('kfold_random_sate', 0)

    ntrain = x_train.shape[0]

    kf = KFold(ntrain, n_folds=nfolds, shuffle=kfold_shuffle, random_state=kfold_random_state)
    
    cv_sum = 0
    try:
        if clf.clf is not None:
            print(clf.clf)
    except:
        print(clf)
        print(clf.get_params())

    best_rounds = []
    for i, (train_index, cross_index) in enumerate(kf):
        x_tr, x_cr = x_train.iloc[train_index], x_train.iloc[cross_index]
        y_tr, y_cr = y_train.iloc[train_index], y_train.iloc[cross_index]

        clf.train(x_tr, y_tr, x_cr, y_cr)

        cv_score = eval_func(y_cr, clf.predict(x_cr))

        print('Fold %d / ' % (i+1), 'CV-Score: %.6f' % cv_score)
        cv_sum = cv_sum + cv_score
        best_rounds.append(clf.clf.best_iteration)

    score = cv_sum / nfolds
    print("Average CV-Score: ", score)

    return score, np.max(best_rounds)

# Stacking Pipeline

>* LGBM Deep, Middle, Shallow
>* LGBM Dart
>* Catboost, Xgboost Deep, Middle
>* Random Forest, ExtraTree
>* Lasso, Ridge
>* NN Deep, Middle, Shallow -> 이건 아직<br>
오늘은 위에것만

# Data Load

In [8]:
train_df = pd.read_csv('input/train_v1.csv')
test_df = pd.read_csv('input/test_v1.csv')

In [9]:
#train_df = all_df.loc[all_df['target'].notnull()]
#test_df = all_df.loc[all_df['target'].isnull()]

train_df['outliers'] = 0
train_df.loc[train_df['target'] < -30, 'outliers'] = 1
train_df['outliers'].value_counts()

for f in ['feature_1','feature_2','feature_3']:
    order_label = train_df.groupby([f])['outliers'].mean()
    train_df[f] = train_df[f].map(order_label)
    test_df[f] = test_df[f].map(order_label)

group_df = groupby_helper(train_df,['dayofyear'], 'outliers',['mean'])
train_df = train_df.merge(group_df, on=['dayofyear'], how='left')
test_df = test_df.merge(group_df, on=['dayofyear'], how='left')

group_df = groupby_helper(train_df,['elapsed_time'], 'outliers',['mean'])
train_df = train_df.merge(group_df, on=['elapsed_time'], how='left')
test_df = test_df.merge(group_df, on=['elapsed_time'], how='left')

train_columns = [c for c in train_df.columns if c not in ['card_id', 'first_active_month','target','outliers']]
train_columns

['dayofyear'] outliers ['mean']
['elapsed_time'] outliers ['mean']


['feature_1',
 'feature_2',
 'feature_3',
 'card_id_card_id_month_lag_purchase_amount_count_sum',
 'card_id_card_id_month_lag_purchase_amount_count_mean',
 'card_id_card_id_month_lag_purchase_amount_count_std',
 'card_id_card_id_month_lag_purchase_amount_mean_sum',
 'card_id_card_id_month_lag_purchase_amount_mean_mean',
 'card_id_card_id_month_lag_purchase_amount_mean_std',
 'auth_0_card_id_month_nunique',
 'auth_0_card_id_month_max',
 'auth_0_card_id_month_min',
 'auth_0_card_id_month_mean',
 'auth_0_card_id_month_std',
 'auth_1_card_id_month_nunique',
 'auth_1_card_id_month_max',
 'auth_1_card_id_month_min',
 'auth_1_card_id_month_mean',
 'auth_1_card_id_month_std',
 'new_hist_card_id_month_nunique',
 'new_hist_card_id_month_max',
 'new_hist_card_id_month_min',
 'new_hist_card_id_month_mean',
 'card_id_merchant_id_nunique',
 'auth_0_card_id_merchant_category_id_nunique',
 'auth_1_card_id_merchant_category_id_nunique',
 'auth_0_card_id_subsector_id_nunique',
 'auth_1_card_id_subsector

In [10]:
x_train = train_df.copy()
x_test = test_df.copy()
y_train = train_df['target']

# Sklearn

ExtraTreeRegressior

In [20]:
def rmse(y_true, y_predict):
    return np.sqrt(mean_squared_error(y_true, y_predict))

In [115]:
et_parmas = {
    'criterion':'mse', 'max_leaf_nodes':-1, 'n_estimators':800, 'min_impurity_split':0.0000001,
    'max_features':0.6, 'max_depth':9, 'min_samples_leaf':30, 'min_samples_split':2,
    'min_weight_fraction_leaf':0.0, 'bootstrap':True,'n_jobs':-1,'warm_start':False,
    'random_state':6, 'verbose':True
}

rf_params = {
    'criterion':'mse', 'max_leaf_nodes':-1, 'n_estimators':500, 'min_impurity_split':0.0000001,
    'max_features':0.6, 'max_depth':9, 'min_samples_leaf':30, 'min_samples_split':2,
    'min_weight_fraction_leaf':0.0, 'bootstrap':True,'n_jobs':-1,
    'random_state':6, 'verbose':True
}

lasso_params={
    'alpha':0.003,
    'normalize':True,
    'max_iter':200,'fit_intercept':True,'tol':0.007,
    'warm_start':True
}

ridge_params={
    'alpha':0.2,
    'normalize':True,
    'max_iter':200,'fit_intercept':False,'solver':'auto'
}

lgbm_param1 = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.015,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 24,
         "seed": 6}

lgbm_param2 = {'num_leaves': 22,
         'min_data_in_leaf': 70, 
         'objective':'regression',
         'max_depth': 7,
         'learning_rate': 0.015,
         "min_child_samples": 50,
         "boosting": "gbdt",
         "subsample": 0.6,
         "colsample": 0.8,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 24,
         "seed": 6}

lgbm_param3 = {'num_leaves': 10,
         'min_data_in_leaf': 10, 
         'objective':'regression',
         'max_depth': 5,
         'learning_rate': 0.01,
         "min_child_samples": 10,
         "boosting": "gbdt",
         "subsample": 0.8,
         "colsample": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 24,
         "seed": 6}

lgbm_param4 = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.02,
         "min_child_samples": 20,
         "boosting": "dart",
         'drop_rate':0.3, 
         'skip_drop':0.4,
         'max_drop':50,
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 24,
         "seed": 6}

xgb_params1 = {
    'booster':'gbtree', 'objective':'reg:linear', 'max_leaves':0, 'eta':0.02, 'gamma':1,
    'max_depth':4, 'colsample_bylevel':1.0, 'min_child_weight':4.0, 'max_delta_step':0.0, 'subsample':0.8, 
    'colsample_bytree':0.5,'alpha':1.0, 'lambda':5.0, 'seed':6,'eval_metric':'rmse'
}

cat_param = {'iterations':3000,'rsm':0.8,'depth':5, 'learning_rate':0.037, 'bootstrap_type':'Bernoulli',
             'eval_metric':'RMSE','l2_leaf_reg':40,'od_wait':100,'loss_function':'RMSE','od_type':'Iter',
            'random_seed':6,'subsample':0.7,'verbose':100}

In [116]:
et_model = SklearnWrapper(clf = ExtraTreesRegressor,params=et_parmas)
rf_model = SklearnWrapper(clf = RandomForestRegressor,params=rf_params)

lgbm_model1 = LgbmWrapper(params=lgbm_param1, num_rounds = 10000, ealry_stopping=100,
                                   verbose_eval=100, base_score=True, maximize=False,
                                   y_value_log=False)
lgbm_model2 = LgbmWrapper(params=lgbm_param2, num_rounds = 10000, ealry_stopping=100,
                                   verbose_eval=100, base_score=True, maximize=False,
                                   y_value_log=False)
lgbm_model3 = LgbmWrapper(params=lgbm_param3, num_rounds = 10000, ealry_stopping=100,
                                   verbose_eval=100, base_score=True, maximize=False,
                                   y_value_log=False)
lgbm_model4 = LgbmWrapper(params=lgbm_param4, num_rounds = 10000, ealry_stopping=100,
                                   verbose_eval=100, base_score=True, maximize=False,
                                   y_value_log=False)

xgb_model1 = XgbWrapper(params=xgb_params1, num_rounds = 10000, ealry_stopping=100,
                                   verbose_eval=100, base_score=True, maximize=False,
                                   y_value_log=False)

cat_model = CatboostWrapper(params=cat_param,verbose_eval=100)

In [None]:
et_train, et_test, et_cv_score = get_oof(et_model, x_train[train_columns].fillna(-1), y_train, x_test[train_columns].fillna(-1), 
                            rmse, NFOLDS=9, kfold_random_state=4950, stratifed_kfold_y_value=x_train['outliers'].values)

rf_train, rf_test, rf_cv_score = get_oof(rf_model, x_train[train_columns].fillna(-1), y_train, x_test[train_columns].fillna(-1), 
                            rmse, NFOLDS=9, kfold_random_state=4950, stratifed_kfold_y_value=x_train['outliers'].values)

lgbm1_train, lgbm1_test, lgbm1_cv_score = get_oof(lgbm_model1, x_train[train_columns], y_train, x_test[train_columns], 
                            rmse, NFOLDS=9, kfold_random_state=4950, stratifed_kfold_y_value=x_train['outliers'].values)
lgbm2_train, lgbm2_test, lgbm2_cv_score = get_oof(lgbm_model2, x_train[train_columns], y_train, x_test[train_columns], 
                            rmse, NFOLDS=9, kfold_random_state=4950, stratifed_kfold_y_value=x_train['outliers'].values)
lgbm3_train, lgbm3_test, lgbm3_cv_score = get_oof(lgbm_model3, x_train[train_columns], y_train, x_test[train_columns], 
                            rmse, NFOLDS=9, kfold_random_state=4950, stratifed_kfold_y_value=x_train['outliers'].values)
lgbm4_train, lgbm4_test, lgbm4_cv_score = get_oof(lgbm_model4, x_train[train_columns], y_train, x_test[train_columns], 
                            rmse, NFOLDS=9, kfold_random_state=4950, stratifed_kfold_y_value=x_train['outliers'].values)

xgb_train, xgb_test, xgb_cv_score = get_oof(xgb_model1, x_train[train_columns], y_train, x_test[train_columns], 
                            rmse, NFOLDS=9, kfold_random_state=4950, stratifed_kfold_y_value=x_train['outliers'].values)

cat_train, cat_test, cat_cv_score = get_oof(cat_model, x_train[train_columns], y_train, x_test[train_columns], 
                            rmse, NFOLDS=9, kfold_random_state=4950, stratifed_kfold_y_value=x_train['outliers'].values)


StartTime:  2019-01-25 01:29:53.066259
ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=9,
          max_features=0.6, max_leaf_nodes=-1, min_impurity_decrease=0.0,
          min_impurity_split=1e-07, min_samples_leaf=30,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=800, n_jobs=-1, oob_score=False, random_state=0,
          verbose=True, warm_start=False)

StartTime:  2019-01-25 01:29:53.315035


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 752 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.2min finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s


EndTime:  2019-01-25 01:31:04.011213
TotalTime:  70.69621539115906


[Parallel(n_jobs=24)]: Done 402 tasks      | elapsed:    0.2s
[Parallel(n_jobs=24)]: Done 752 tasks      | elapsed:    0.3s
[Parallel(n_jobs=24)]: Done 800 out of 800 | elapsed:    0.4s finished


Fold 1 /  CV-Score: 3.677008

StartTime:  2019-01-25 01:31:04.628566


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 752 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.2min finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s


EndTime:  2019-01-25 01:32:16.043426
TotalTime:  71.41486382484436


[Parallel(n_jobs=24)]: Done 402 tasks      | elapsed:    0.2s
[Parallel(n_jobs=24)]: Done 752 tasks      | elapsed:    0.3s
[Parallel(n_jobs=24)]: Done 800 out of 800 | elapsed:    0.4s finished


Fold 2 /  CV-Score: 3.692573

StartTime:  2019-01-25 01:32:16.640145


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done 752 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.2min finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s


EndTime:  2019-01-25 01:33:28.456097
TotalTime:  71.81598258018494


[Parallel(n_jobs=24)]: Done 402 tasks      | elapsed:    0.2s
[Parallel(n_jobs=24)]: Done 752 tasks      | elapsed:    0.4s
[Parallel(n_jobs=24)]: Done 800 out of 800 | elapsed:    0.4s finished


Fold 3 /  CV-Score: 3.682249

StartTime:  2019-01-25 01:33:29.050039


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 752 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.2min finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s


EndTime:  2019-01-25 01:34:40.666760
TotalTime:  71.61674880981445


[Parallel(n_jobs=24)]: Done 402 tasks      | elapsed:    0.2s
[Parallel(n_jobs=24)]: Done 752 tasks      | elapsed:    0.4s
[Parallel(n_jobs=24)]: Done 800 out of 800 | elapsed:    0.4s finished


Fold 4 /  CV-Score: 3.660616

StartTime:  2019-01-25 01:34:41.261594


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   35.6s


In [None]:
x_train_second_layer = np.concatenate((et_train, rf_train, lgbm1_train,
                         lgbm2_train, lgbm3_train, lgbm4_train,
                         xgb_train,cat_train), axis=1)

x_test_second_layer = np.concatenate((et_test, rf_test, lgbm1_test,
                         lgbm2_test, lgbm3_test, lgbm4_test,
                         xgb_test,cat_test), axis=1)
x_train = pd.DataFrame(x_train_second_layer)
x_test = pd.DataFrame(x_test_second_layer)

In [None]:
lgbm_ex_no = 31
lgbm_meta_params = {
    'boosting':'gbdt', 'num_leaves':28, 'learning_rate':0.03, 'min_sum_hessian_in_leaf':0.1,
    'max_depth':7, 'feature_fraction':0.6, 'min_data_in_leaf':70, 'poission_max_delta_step':0.7,
    'bagging_fraction':0.8, 'min_gain_to_split':0, 'scale_pos_weight':1.0,
    'lambda_l2':0.1, 'lambda_l1':0.1, 'fair_c':1.0, 'bagging_freq':1,
    'objective':'fair', 'seed':1, 'categorical_feature':0, 'xgboost_dart_mode':False,
    'drop_rate':0.1, 'skip_drop':0.5, 'max_drop':50, 'top_rate':0.1, 'other_rate':0.1,
    'max_bin':255, 'min_data_in_bin':50, 'bin_construct_sample_cnt':1000000,
    'two_round':False, 'uniform_drop':False,'metric': 'mae'
}

lgbm_meta_model = LgbmWrapper(params=lgbm_meta_params, num_rounds = 2000, ealry_stopping=100,
               verbose_eval=False, base_score=True, maximize=False, y_value_log=False)

lgbm_cv_score, best_round = kfold_test(lgbm_meta_model, x_train, y_train, rmse, NFOLDS=9, kfold_random_sate=4950 )


d_train_all = lgbm.Dataset(x_train, label=y_train)
bst = lgbm.train(lgbm_meta_params, d_train_all ,best_round)
#predictions = bst.predict(pd.DataFrame(x_test_second_layer))