## Modelling Related

In [495]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

### Cross Validation Split

In [None]:
class MyGroupKFold:
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state
    
    def split(self, X, y=None, groups=None):
        groups = pd.Series(groups)
        unique_groups = np.unique(groups)
        kf = KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state)
        for tr_group_idx, va_group_idx in kf.split(unique_groups):
            tr_groups, va_groups = unique_groups[tr_group_idx], unique_groups[va_group_idx]
            tr_indices = groups[groups.isin(tr_groups)].index.to_list()
            va_indices = groups[groups.isin(va_groups)].index.to_list()
            yield tr_indices, va_indices
            
class StratifiedGroupKFold:
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state
        
    # Implementation based on this kaggle kernel:
    #    https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation
    def split(self, X, y=None, groups=None):
        k = self.n_splits
        rnd = check_random_state(self.random_state)
            
        # labels_num: zero-origin number of label
        # ex) unique = [0,1,2,3] -> labels_num = 4
        labels_num = np.max(y) + 1
        
        # y_counts_per_group: in-group label distribution
        # y_distr: whole label distribution
        y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
        y_distr = Counter()
        for label, g in zip(y, groups):
            y_counts_per_group[g][label] += 1
            y_distr[label] += 1

        # y_counts_per_fold: in-fold label distribution
        y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
        groups_per_fold = defaultdict(set)
        
        # return mean std of per label counts when y_counts is in fold
        def eval_y_counts_per_fold(y_counts, fold):
            y_counts_per_fold[fold] += y_counts
            std_per_label = []
            for label in range(labels_num):
                label_std = np.std(
                    [y_counts_per_fold[i][label] / y_distr[label] for i in range(k)]
                )
                std_per_label.append(label_std)
            y_counts_per_fold[fold] -= y_counts
            return np.mean(std_per_label)
        
        # list of [group, y_counts]
        # if shuffle: fold changes in same np.std(y_counts)
        # ascending groups by degree of label variance
        groups_and_y_counts = list(y_counts_per_group.items())
        if self.shuffle:
            rnd.shuffle(groups_and_y_counts)
        groups_and_y_counts = sorted(groups_and_y_counts, key=lambda x: -np.std(x[1]))

        # set fold for each group such that label distirbution will be uniform
        for g, y_counts in groups_and_y_counts:
            best_fold = None
            min_eval = None
            for i in range(k):
                fold_eval = eval_y_counts_per_fold(y_counts, i)
                if min_eval is None or fold_eval < min_eval:
                    min_eval = fold_eval
                    best_fold = i
            y_counts_per_fold[best_fold] += y_counts
            groups_per_fold[best_fold].add(g)

        all_groups = set(groups)
        for i in range(k):
            train_groups = all_groups - groups_per_fold[i]
            test_groups = groups_per_fold[i]

            train_indices = [i for i, g in enumerate(groups) if g in train_groups]
            test_indices = [i for i, g in enumerate(groups) if g in test_groups]

            yield train_indices, test_indices

In [496]:
def build_cv_spliter(
    X_train,
    y_train,
    strategy='stratified',
    n_splits=5,
    group=None,
    shuffle=True,
    seed=8982,
    return_indices=False,
):
    if strategy == 'kfold':
        kf = KFold(n_splits=n_splits, random_state=seed, shuffle=shuffle)
        cv = kf.split(X_train)
    elif strategy == 'stratified':
        kf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=shuffle)
        cv = kf.split(X_train, y_train)
    elif strategy == 'group':
        kf = MyGroupKFold(n_splits=n_splits, random_state=seed, shuffle=shuffle)
        cv = kf.split(X_train, y_train, group)
    elif strategy == 'stratified-group':
        kf = StratifiedGroupKFold(n_splits=n_splits, random_state=seed, shuffle=shuffle)
        cv = kf.split(X_train, y_train, group)
    else:
        raise NotImplementedError(f'strategy {strategy} not implemented.')

    if not return_indices:
        cv_spliter = []
        for dev_idx, val_idx in cv:
            cv_spliter.append([dev_idx, val_idx])
        return cv_spliter
    else:
        fold_indices = np.zeros(len(X), dtype=np.int64)
        for fold, (_, val_idx) in enumerate(cv):
            fold_indices[val_idx] = int(fold)
        return fold_indices

### LightGBM (Binary Classification: max auc)

In [500]:
import lightgbm as lgb
# from sklearn.model_selection import StratifiedKFold

In [498]:
def lgb_kfold_clf(X_train, y_train, category_cols, split, bayes_opt=True,
                  learning_rate=0.05, num_leaves=31, max_depth=-1,
                  bagging_fraction=0.9, feature_fraction=0.9,
                  min_child_weight=1e-3, min_data_in_leaf=20,
                  lambda_l1=0.0, lambda_l2=0.0):
    metric='auc'
    params = {'objective': 'binary',
              'metric': metric,
              'boosting': 'gbdt',
              'seed': 8982,
              'learning_rate': learning_rate,
              'num_leaves': int(num_leaves),
              'max_depth': int(max_depth),
              'bagging_freq': int(5),
              'bagging_fraction': bagging_fraction,
              'feature_fraction': feature_fraction,
              'min_child_weight': min_child_weight,   
              'min_data_in_leaf': int(min_data_in_leaf),
              'lambda_l1': lambda_l1,
              'lambda_l2': lambda_l2}
              #'verbosity': int(-1)}
             
    #cat_features = [X_train.columns.get_loc(cat_col) for cat_col in category_cols]
    #print(cat_features)
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    models = []; learning_curves = []; best_scores = []; valid_score = []
    feature_importance_df = pd.DataFrame()
    
    print(f'========== LightGBM Classifier training on : {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        d_train = lgb.Dataset(X_train.iloc[train_idx,:], label=y_train[train_idx], categorical_feature=category_cols)
        d_valid = lgb.Dataset(X_train.iloc[valid_idx,:], label=y_train[valid_idx], categorical_feature=category_cols)
        
        print(f'========== LightGBM Classifier training: {i+1}/{n_splits} fold ==========')
        learning_curve = {}
        model = lgb.train(params,
                          train_set=d_train,
                          valid_sets=[d_train, d_valid],
                          num_boost_round=5000,
                          early_stopping_rounds=20,
                          evals_result=learning_curve,
                          verbose_eval=200#False,
                          )
        best_score = {f'train_{metric}': model.best_score['training'][f'{metric}'],
                      f'valid_{metric}': model.best_score['valid_1'][f'{metric}']}
        print()
        oofs[valid_idx] = model.predict(X_train.iloc[valid_idx,:], num_iteration=model.best_iteration)
        models.append(model)
        learning_curves.append(learning_curve)
        valid_score.append(best_score[f'valid_{metric}'])
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = common_cols + category_cols
        fold_importance_df['importance'] = model.feature_importance()
        fold_importance_df['fold'] = i+1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
        del d_train, d_valid, fold_importance_df
        gc.collect()
        
    valid_std_score = np.std(valid_score)
    valid_avg_score = np.mean(valid_score)
    print('====================')
    print(f'CV AVG: {metric} - {valid_avg_score}')
    print(f'CV STD: {metric} - {valid_std_score}')
    print('====================')

    if bayes_opt:
        return valid_avg_score
    else:
        return oofs, models, feature_importance_df #best_scores, learning_curves

def lgb_clf_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'learning_rate': (0.001, 0.3),
              'num_leaves': (20, 500), 
              #'max_depth': (-1, 250),
              'bagging_fraction' : (0.1, 1),
              'feature_fraction' : (0.1, 1),
              'min_child_weight': (0.001, 0.99),   
              'min_data_in_leaf': (3, 700),
              'lambda_l1': (0.1, 300), 
              'lambda_l2': (0.1, 300)}
    
    optimizer = BayesianOptimization(f=lgb_kfold_clf, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = optimizer.max['target']
    return param, cv

In [None]:
def lgb_pred_clf(X_test, models, threshold):
    y_test_total = np.zeros(X_test.shape[0])
    for i, model in enumerate(models):
        print(f'========== LightGBM Predicting with {i+1}-th model ==========')
        y_test = model.predict(X_test, num_iteration=model.best_iteration)
        y_test_total += y_test
    y_test_total /= len(models)
    y_test = np.where(y_test_total > threshold, 1, 0)
    gc.collect()
    return y_test

### LightGBM (Regression: min rmse)

In [125]:
import lightgbm as lgb
# from sklearn.model_selection import KFold

In [147]:
def lgb_kfold_reg(X_train, y_train, category_cols, split, bayes_opt=True,
                  learning_rate=0.05, num_leaves=31, max_depth=-1,
                  bagging_fraction=0.9, feature_fraction=0.9,
                  min_child_weight=1e-3, min_data_in_leaf=20,
                  lambda_l1=0.0, lambda_l2=0.0):
    metric='rmse'
    params = {'objective': 'regression',
              'metric': metric,
              'boosting': 'gbdt',
              'seed': 8982,
              'learning_rate': learning_rate,
              'num_leaves': int(num_leaves),
              'max_depth': int(max_depth),
              'bagging_freq': int(5),
              'bagging_fraction': bagging_fraction,
              'feature_fraction': feature_fraction,
              'min_child_weight': min_child_weight,   
              'min_data_in_leaf': int(min_data_in_leaf),
              'lambda_l1': lambda_l1,
              'lambda_l2': lambda_l2}
              #'verbosity': int(-1)}
             
    #cat_features = [X_train.columns.get_loc(cat_col) for cat_col in category_cols]
    #print(cat_features)
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    models = []; learning_curves = []; valid_scores = []
    feature_importance_df = pd.DataFrame()
    
    print(f'========== LightGBM Regressor training on: {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        d_train = lgb.Dataset(X_train.iloc[train_idx,:], label=y_train[train_idx], categorical_feature=category_cols)
        d_valid = lgb.Dataset(X_train.iloc[valid_idx,:], label=y_train[valid_idx], categorical_feature=category_cols)
        
        print(f'========== LightGBM Regressor training: {i+1}/{n_splits} fold ==========')
        learning_curve = {}
        model = lgb.train(params,
                          train_set=d_train,
                          valid_sets=[d_train, d_valid],
                          num_boost_round=5000,
                          early_stopping_rounds=20,
                          evals_result=learning_curve,
                          verbose_eval=200#False,
                          )
        print()
        oofs[valid_idx] = model.predict(X_train.iloc[valid_idx,:], num_iteration=model.best_iteration)
        models.append(model)
        learning_curves.append(learning_curve)
        valid_scores.append(model.best_score['valid_1'][f'{metric}'])
          
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = common_cols + category_cols
        fold_importance_df['importance'] = model.feature_importance()
        fold_importance_df['fold'] = i+1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)  
        
        del d_train, d_valid, fold_importance_df
        gc.collect()


    valid_std_score = np.std(valid_scores)
    valid_avg_score = np.mean(valid_scores)
    print('====================')
    print(f'CV AVG: {metric} - {valid_avg_score}')
    print(f'CV STD: {metric} - {valid_std_score}')
    print('====================')
    
    if bayes_opt:
        return -valid_avg_score
    else:
        return oofs, models, feature_importance_df #best_scores, learning_curves

def lgb_reg_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'learning_rate': (0.001, 0.3),
              'num_leaves': (20, 500),
              #'max_depth': (-1, 250),
              'bagging_fraction' : (0.1, 1),
              'feature_fraction' : (0.1, 1),
              'min_child_weight': (0.001, 0.99),   
              'min_data_in_leaf': (3, 700),
              'lambda_l1': (0.1, 300), 
              'lambda_l2': (0.1, 300)}
    
    optimizer = BayesianOptimization(f=lgb_kfold_reg, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', -optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = -optimizer.max['target']
    return param, cv

In [129]:
def lgb_pred_reg(X_test, models):
    y_test_pred_total = np.zeros(X_test.shape[0])
    for i, model in enumerate(models):
        print(f'========== LightGBM Predicting with {i+1}-th model ==========')
        y_pred_test = model.predict(X_test, num_iteration=model.best_iteration)
        y_test_pred_total += y_pred_test
    y_test_pred_total /= len(models)
    return y_test_pred_total

### CatBoost (Binary Classification: max AUC)

In [1]:
import catboost as cb

ModuleNotFoundError: No module named 'catboost'

In [152]:
def cb_kfold_clf(X_train, y_train, category_cols, split, bayes_opt=True,
                learning_rate=0.03, num_leaves=31, max_depth=6,
                subsample=0.8, bagging_temperature=1.0, colsample_bylevel=1.0,
                min_data_in_leaf=1, l2_leaf_reg=3.0, random_strength=1.0):
    loss = 'Logloss'
    metric = 'AUC'
    params = {'loss_function': loss,
              'eval_metric': metric,
              'boosting_type': 'Plain',
              'random_seed': 8982,
              'num_boost_round': 5000,
              'early_stopping_rounds': 20,
              'use_best_model': True,
              # 'grow_policy': 'SymmetricTree','Depthwise','Lossguide',
              'nan_mode': 'Max',
              'od_type': 'Iter',
              'verbose': 200,
              
              'learning_rate': learning_rate,
              'num_leaves': int(num_leaves),
              'max_depth': int(max_depth),
              'subsample': subsample, #bf?
              'bagging_temperature': bagging_temperature, #bf?
              'colsample_bylevel': colsample_bylevel, #ff
              'min_data_in_leaf': int(min_data_in_leaf),
              'l2_leaf_reg': l2_leaf_reg,
              'random_strength': random_strength}
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    feature_importance_df = pd.DataFrame()
    
    models = []; learning_curves = []; valid_losses = []; valid_metrics = []
    print(f'========== CatBoost Classifier training on: {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        print(f'========== CatBoost Classifier training: {i+1}/{n_splits} ==========')
        train_d = cb.Pool(data=X_train.loc[train_idx],
                          label=y_train[train_idx],
                          cat_features=category_cols)
        valid_d = cb.Pool(data=X_train.loc[valid_idx],
                          label=y_train[valid_idx],
                          cat_features=category_cols)
        
        model = cb.CatBoostClassifier(**params)
        model.fit(train_d, eval_set=valid_d)
        
        oofs[valid_idx] = model.predict_proba(X_train.loc[valid_idx])[:,1]
        models.append(model)
        valid_losses.append(model.best_score_['validation'][f'{loss}'])
        valid_metrics.append(model.best_score_['validation'][f'{metric}'])
          
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = common_cols + category_cols
        fold_importance_df['importance'] = model.get_feature_importance()
        fold_importance_df['fold'] = i+1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
          
        del train_d, valid_d, model, fold_importance_df
        gc.collect()
        
    print('====================')
    print(f'CV AVG:\n{loss} - {np.mean(valid_losses)}\n{metric} - {np.mean(valid_metrics)}')
    print(f'CV STD:\n{loss} - {np.std(valid_losses)}\n{metric} - {np.std(valid_metrics)}')
    print('====================')
    
    if bayes_opt:
        return np.mean(valid_metrics)
    else:
        return oofs, models, feature_importance_df

def cb_clf_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'learning_rate': (0.001, 0.3),
              'num_leaves': (16, 288), 
              'max_depth': (3, 16),
              'subsample' : (0.1, 1),
              'bagging_temperature' : (0, 100),
              'colsample_bylevel': (0.001, 1),   
              'min_data_in_leaf': (3, 700),
              'l2_leaf_reg': (0.1, 300), 
              'random_strength': (0, 100)}
    
    optimizer = BayesianOptimization(f=cb_kfold_clf, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = optimizer.max['target']
    return param, cv

In [168]:
def cb_pred_clf(X_test, models, threshold):
    y_test_total = np.zeros(X_test.shape[0])
    for i, model in enumerate(models):
        print(f'========== CatBoost Predicting with {i+1}-th model ==========')
        y_test = model.predict_proba(X_test)[:,1]
        y_test_total += y_test
    y_test_total /= len(models)
    y_test = np.where(y_test_total > threshold, 1, 0)
    gc.collect()
    return y_test

def lgb_pred_clf(X_test, models, threshold):
    y_test_total = np.zeros(X_test.shape[0])
    for i, model in enumerate(models):
        print(f'========== LightGBM Predicting with {i+1}-th model ==========')
        y_test = model.predict(X_test, num_iteration=model.best_iteration)
        y_test_total += y_test
    y_test_total /= len(models)
    y_test = np.where(y_test_total > threshold, 1, 0)
    return y_test

### CatBoost (Regression: min RMSE)

In [145]:
def cb_kfold_reg(X_train, y_train, category_cols, split, bayes_opt=True,
                learning_rate=0.03, num_leaves=31, max_depth=6,
                subsample=0.8, bagging_temperature=1.0, colsample_bylevel=1.0,
                min_data_in_leaf=1, l2_leaf_reg=3.0, random_strength=1.0):
    loss = 'RMSE'
    metric = 'RMSE'
    params = {'loss_function': loss,
              'eval_metric': metric,
              'boosting_type': 'Plain',
              'random_seed': 8982,
              'num_boost_round': 5000,
              'early_stopping_rounds': 20,
              'use_best_model': True,
              # 'grow_policy': 'SymmetricTree','Depthwise','Lossguide',
              'nan_mode': 'Max',
              'od_type': 'Iter',
              'verbose': 200,
              
              'learning_rate': learning_rate,
              'num_leaves': int(num_leaves),
              'max_depth': int(max_depth),
              'subsample': subsample, #bf?
              'bagging_temperature': bagging_temperature, #bf?
              'colsample_bylevel': colsample_bylevel, #ff
              'min_data_in_leaf': int(min_data_in_leaf),
              'l2_leaf_reg': l2_leaf_reg,
              'random_strength': random_strength}
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    feature_importance_df = pd.DataFrame()
    
    models = []; valid_losses = []; valid_metrics = []
    print(f'========== CatBoost Regressor training on: {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        print(f'========== CatBoost Regressor training: {i+1}/{n_splits} ==========')
        train_d = cb.Pool(data=X_train.loc[train_idx],
                          label=y_train[train_idx],
                          cat_features=category_cols)
        valid_d = cb.Pool(data=X_train.loc[valid_idx],
                          label=y_train[valid_idx],
                          cat_features=category_cols)
        
        model = cb.CatBoostRegressor(**params)
        model.fit(train_d, eval_set=valid_d)
        
        oofs[valid_idx] = model.predict(X_train.loc[valid_idx])
        models.append(model)
        # valid_losses.append(model.best_score_['validation'][f'{loss}'])
        valid_metrics.append(model.best_score_['validation'][f'{metric}'])
          
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = common_cols + category_cols
        fold_importance_df['importance'] = model.get_feature_importance()
        fold_importance_df['fold'] = i+1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
          
        del train_d, valid_d, model, fold_importance_df
        gc.collect()
    
    assert loss == metric
    print('====================')
    print(f'CV AVG: {metric} - {np.mean(valid_metrics)}')
    print(f'CV STD: {metric} - {np.std(valid_metrics)}')
    print('====================')
    
    if bayes_opt:
        return -np.mean(valid_metrics)
    else:
        return oofs, models, feature_importance_df

def cb_reg_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'learning_rate': (0.001, 0.3),
              'num_leaves': (16, 288), 
              'max_depth': (3, 16),
              'subsample' : (0.1, 1),
              'bagging_temperature' : (0, 100),
              'colsample_bylevel': (0.001, 1),   
              'min_data_in_leaf': (3, 700),
              'l2_leaf_reg': (0.1, 300), 
              'random_strength': (0, 100)}
    
    optimizer = BayesianOptimization(f=cb_kfold_clf, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', -optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = -optimizer.max['target']
    return param, cv

In [169]:
def cb_pred_reg(X_test, models):
    y_test_total = np.zeros(X_test.shape[0])
    for i, model in enumerate(models):
        print(f'========== CatBoost Predicting with {i+1}-th model ==========')
        y_test = model.predict(X_test)
        y_test_total += y_test
    y_test_total /= len(models)
    return y_test_total

### XGBoost (Binary Classification: max auc)

In [77]:
import xgboost as xgb
# encode categorical cols beforehand!

In [2]:
def xgb_kfold_clf(X_train, y_train, split, bayes_opt=True,
                  eta=0.3, gamma=0, max_depth=6, min_child_weight=1,
                  subsample=0.7, colsample_bytree=1.0, colsample_bylevel=1.0,
                  colsample_bynode=1.0, reg_lambda=1.0, reg_alpha=0.0):
    metric = 'auc'
    params = {'objective': 'binary:logistic',
              'eval_metric': metric,
              'booster': 'gbtree', # 'dart',
              'seed': 8982,

              'missing': np.nan,
              # when dart: 'rate_drop': (0.0, 1.0),
              # 'grow_policy': 'depthwise','lossguide',
              # 'verbosity': 1, #0
              # 'base_score': 0.5 <- initial leaf prediction
              
              'eta': eta,
              'gamma': gamma, # pruning phase of split in XGBoost unique tree
              'max_depth': int(max_depth),
              'min_child_weight': min_child_weight,
              'subsample': subsample, # 0.5 - randomly sample half of the training data prior to growing trees
              'colsample_bytree': colsample_bytree, # subsample ratio of columns when constructing each tree
              'colsample_bylevel': colsample_bylevel, # subsample ratio of columns for each level
              'colsample_bynode': colsample_bynode, # subsample ratio of columns for each node (split)
              'reg_lambda': reg_lambda,
              'reg_alpha': reg_alpha}
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    
    models = []; learning_curves = []; valid_losses = []; valid_metrics = []
    print(f'========== XGBoost Classifier training on: {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        print(f'========== XGBoost Classifier training: {i+1}/{n_splits} ==========')
        train_d = xgb.DMatrix(data=X_train.loc[train_idx],
                              label=y_train[train_idx],
                              feature_names=common_cols+category_cols)
                              #cat_features=category_cols)
        valid_d = xgb.DMatrix(data=X_train.loc[valid_idx],
                              label=y_train[valid_idx],
                              feature_names=common_cols+category_cols)
                              #cat_features=category_cols)
        learning_curve = {}
        model = xgb.train(params,
                          train_d,
                          evals=[(train_d, 'train'), (valid_d, 'valid')],
                          num_boost_round=5000,
                          early_stopping_rounds=20,
                          verbose_eval=200,#False
                          evals_result=learning_curve)
        oofs[valid_idx] = model.predict(valid_d, ntree_limit=model.best_ntree_limit)
        models.append(model)
        learning_curves.append(learning_curve)
        valid_metrics.append(model.best_score)
          
        del train_d, valid_d, model
        gc.collect()
        
    print('====================')
    print(f'CV AVG: {metric} - {np.mean(valid_metrics)}')
    print(f'CV STD: {metric} - {np.std(valid_metrics)}')
    print('====================')
    
    if bayes_opt:
        return np.mean(valid_metrics)
    else:
        return oofs, models #, learning_curves

def cb_clf_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'eta': (0.001, 0.3),
              'gamma': (0, 10),
              'max_depth': (3, 250),
              'min_child_weight': (0, 100),
              'subsample': (0.1, 1),
              'colsample_bytree': (0.1, 1),
              'colsample_bylevel': (0.1, 1),
              'colsample_bynode': (0.1, 1),
              'reg_lambda': (0, 300),
              'reg_alpha': (0, 300)}
    
    optimizer = BayesianOptimization(f=cb_kfold_clf, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = optimizer.max['target']
    return param, cv

In [3]:
def xgb_pred_reg(X_test, models, threshold):
    y_test_total = np.zeros(X_test.shape[0])
    test_d = xgb.DMatrix(X_test)
    for i, model in enumerate(models):
        print(f'========== XGBoost Predicting with {i+1}-th model ==========')
        y_test = model.predict(test_d, ntree_limit=model.best_ntree_limit)
        y_test_total += y_test
    y_test_total /= len(models)
    y_test = np.where(y_test_total > threshold, 1, 0)
    return y_test

### XGBoost (Regression: max rmse)

In [192]:
def xgb_kfold_reg(X_train, y_train, split, bayes_opt=True,
                  eta=0.3, gamma=0, max_depth=6, min_child_weight=1,
                  subsample=0.7, colsample_bytree=1.0, colsample_bylevel=1.0,
                  colsample_bynode=1.0, reg_lambda=1.0, reg_alpha=0.0):
    metric = 'rmse'
    params = {'objective': 'reg:squarederror',
              'eval_metric': metric,
              'booster': 'gbtree', # 'dart',
              'seed': 8982,

              'missing': np.nan,
              # when dart: 'rate_drop': (0.0, 1.0),
              # 'grow_policy': 'depthwise','lossguide',
              # 'verbosity': 1, #0
              # 'base_score': 0.5 <- initial leaf prediction
              
              'eta': eta,
              'gamma': gamma, # pruning phase of split in XGBoost unique tree
              'max_depth': int(max_depth),
              'min_child_weight': min_child_weight,
              'subsample': subsample, # 0.5 - randomly sample half of the training data prior to growing trees
              'colsample_bytree': colsample_bytree, # subsample ratio of columns when constructing each tree
              'colsample_bylevel': colsample_bylevel, # subsample ratio of columns for each level
              'colsample_bynode': colsample_bynode, # subsample ratio of columns for each node (split)
              'reg_lambda': reg_lambda,
              'reg_alpha': reg_alpha}
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    
    models = []; learning_curves = []; valid_losses = []; valid_metrics = []
    print(f'========== XGBoost Regressor training on: {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        print(f'========== XGBoost Regressor training: {i+1}/{n_splits} ==========')
        train_d = xgb.DMatrix(data=X_train.loc[train_idx],
                              label=y_train[train_idx],
                              feature_names=common_cols+category_cols)
                              #cat_features=category_cols)
        valid_d = xgb.DMatrix(data=X_train.loc[valid_idx],
                              label=y_train[valid_idx],
                              feature_names=common_cols+category_cols)
                              #cat_features=category_cols)
        learning_curve = {}
        model = xgb.train(params,
                          train_d,
                          evals=[(train_d, 'train'), (valid_d, 'valid')],
                          num_boost_round=5000,
                          early_stopping_rounds=20,
                          verbose_eval=200,#False
                          evals_result=learning_curve)
        oofs[valid_idx] = model.predict(valid_d, ntree_limit=model.best_ntree_limit)
        models.append(model)
        learning_curves.append(learning_curve)
        valid_metrics.append(model.best_score)
          
        del train_d, valid_d, model
        gc.collect()
        
    print('====================')
    print(f'CV AVG: {metric} - {np.mean(valid_metrics)}')
    print(f'CV STD: {metric} - {np.std(valid_metrics)}')
    print('====================')
    
    if bayes_opt:
        return -np.mean(valid_metrics)
    else:
        return oofs, models #, learning_curves

def xgb_clf_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'eta': (0.001, 0.3),
              'gamma': (0, 10),
              'max_depth': (3, 250),
              'min_child_weight': (0, 100),
              'subsample': (0.1, 1),
              'colsample_bytree': (0.1, 1),
              'colsample_bylevel': (0.1, 1),
              'colsample_bynode': (0.1, 1),
              'reg_lambda': (0, 300),
              'reg_alpha': (0, 300)}
    
    optimizer = BayesianOptimization(f=cb_kfold_clf, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', -optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = -optimizer.max['target']
    return param, cv

In [183]:
def xgb_pred_reg(X_test, models):
    y_test_total = np.zeros(X_test.shape[0])
    test_d = xgb.DMatrix(X_test)
    for i, model in enumerate(models):
        print(f'========== XGBoost Predicting with {i+1}-th model ==========')
        y_test = model.predict(test_d, ntree_limit=model.best_ntree_limit)
        y_test_total += y_test
    y_test_total /= len(models)
    return y_test_total

### Neural Net

In [224]:
import tensorflow as tf
import tensorflow.keras as keras
# import torch

In [257]:
def build_neuralnet(
    recipe,
    loss='mse',
    optimizer='adam',
    lr=1e-3,
    monitor='val_loss',
    es_patience=-1,
    restore_best_weights=True,
    lr_scheduler='none',
    lr_factor=0.1,
    lr_patience=5,
    seed=8982,
    **_,
):
    tf.set_random_seed(seed)
    model = keras.models.model_from_json(recipe)
    
    if loss == 'mse':
        loss = keras.losses.mean_squared_error
    elif loss == 'bce':
        loss = keras.losses.binary_crossentropy
    else:
        raise NotImplementedError
    
    if optimizer == 'adam':
        optimizer = keras.optimizers.Adam(lr)
    else:
        raise NotImplementedError
    
    model.compile(optimizer=optimizer, loss=loss)
    
    callbacks = []
    
    if es_patience >= 0:
        es = keras.callbacks.EarlyStopping(monitor=monitor,
                                           patience=es_patience,
                                           restore_best_weights=restore_best_weights,
                                           verbose=1)
        callbacks.append(es)
    
    if lr_scheduler == 'none':
        pass
    elif lr_scheduler == 'reduce_on_plateau':
        lr_sche = keras.callbacks.ReduceLROnPlateau(monitor=monitor,
                                                    factor=lr_factor,
                                                    patience=lr_patience,
                                                    verbose=1)
        callbacks.append(lr_sche)
    else:
        raise NotImplementedError
    
    return model, callbacks


def train_neuralnet(
    params,
    X_train,
    y_train,
    validation_data=None,
):
    model, callbacks = build_neuralnet(**params)
    model.fit(X_train,
              y_train,
              validation_data=validation_data,
              batch_size=params['batch_size'],
              epochs=params['epochs'],
              callbacks=callbacks)
    return model

def run_kfold_neuralnet(
    params,
    X_train,
    y_train,
    X_test,
    cv,
    features,
    metrics,
):
    oof = np.zeros(len(X_train))
    predictions = np.zeros(len(X_test))
    
    n_splits = len(cv)
    print(f"k={n_splits} folds neuralnet running...")
    print(f"train data/feature shape: {X_train[features].shape}")
    
    for fold, (dev_idx, val_idx) in enumerate(cv):
        validation_data = [X_train.loc[val_idx, features], y_train[val_idx]]
        model = train_neuralnet(params,
                                X_train.loc[dev_idx, features],
                                y_train[dev_idx],
                                validation_data=validation_data)
        
        oof[val_idx] = model.predict(X_train.loc[val_idx, features].values)[:,0]
        predictions += model.predict(X_test[features].values)[:,0] / n_splits
        
        msg = f'fold: {fold}'
        for name, func in metrics.items():
            score = func(y_train[val_idx], oof[val_idx])
            msg += f' - {name}: {score:.5f}'
        print(msg)
    
    msg = f'CV score'
    for name, func in metrics.items():
        score = func(y_train, oof)
        msg += f' - {name}: {score:.5f}'
    print(msg)

    return oof, predictions

In [261]:
features_nn = list(set(x_tr.columns) - {'label'})
x_tr['cat_STD'].fillna(0, inplace=True)

model = keras.Sequential([
    keras.layers.Input(shape=(len(features_nn,))),
    keras.layers.BatchNormalization(),
  
    keras.layers.Dense(256),
    keras.layers.BatchNormalization(),
    keras.layers.PReLU(),
    keras.layers.Dropout(0.1),
  
    keras.layers.Dense(256),
    keras.layers.BatchNormalization(),
    keras.layers.PReLU(),
    keras.layers.Dropout(0.1),
     keras.layers.Dense(1, activation='sigmoid')
])
model.summary()

cat                0
label              0
cat_MEAN           0
cat_SMTH_MEAN_1    0
cat_SMTH_MEAN_5    0
cat_MAX            0
cat_MIN            0
cat_RNG            0
cat_STD            0
cat_Q1             0
cat_Q2             0
cat_Q3             0
cat_IQR            0
dtype: int64
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_15 (Batc (None, 12)                48        
_________________________________________________________________
dense_15 (Dense)             (None, 256)               3328      
_________________________________________________________________
batch_normalization_16 (Batc (None, 256)               1024      
_________________________________________________________________
p_re_lu_10 (PReLU)           (None, 256)               256       
_________________________________________________________________
dropout_10 (Dropout)         (No

In [262]:
SEED = 8982

params = {
    'recipe': model.to_json(),
    'optimizer': 'adam',
    'lr': 1e-3,
    'loss': 'bce',
    'monitor': 'val_loss',
    'es_patience': 20,
    'restore_best_weights': True,
    'lr_scheduler': 'reduce_on_plateau',
    'lr_factor': 0.1,
    'lr_patience': 5,
    'seed': SEED,
 
    'epochs': 100,
    'batch_size': 64,
}
print(features_nn)
X_train_nn = x_tr[features_nn]
y_train_nn = x_tr['label'].values
X_test_nn = x_te[features_nn]

from sklearn.metrics import roc_auc_score
metrics = {'auc_ome3': roc_auc_score}

seed_list = SEED + np.arange(3)
oof_nn, predictions_nn = run_kfold_neuralnet(params,
                                               X_train_nn,
                                               y_train_nn,
                                               X_test_nn,
                                             build_cv_spliter(X_train_nn, y_train_nn, n_splits=2),
                                               features_nn,
                                               metrics)
                                               #seed_list=seed_list)

['cat_SMTH_MEAN_5', 'cat_Q3', 'cat_MIN', 'cat_STD', 'cat_Q2', 'cat_RNG', 'cat_IQR', 'cat_MAX', 'cat_SMTH_MEAN_1', 'cat_MEAN', 'cat', 'cat_Q1']
k=2 folds neuralnet running...
train data/feature shape: (6, 12)
Train on 2 samples, validate on 4 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch