## Env Related

### Reduce DataFrame Memory Usage

In [1]:
def reduce_mem_usage(df, use_float16=False):
    from pandas.api.types import is_datetime64_any_dtype as is_datetime
    from pandas.api.types import is_categorical_dtype
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

### Parallelize DataFrame

In [2]:
def df_parallelize_run(df, func):
    import multiprocessing
    num_partitions, num_cores = psutil.cpu_count(), psutil.cpu_count()
    df_split = np.array_split(df, num_partitions)
    pool = multiprocessing.Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

## Feature Related (Categorical)

### Concatenate Categorical Columns

In [3]:
def category_concat(df, subject_cols, print_option=True):
    na_col = list(df.columns[df.isna().any()])
    for col in na_col:
        df[col].fillna('', inplace=True)
    temp_str = ''
    for col in subject_cols:
        temp_str += '_' + col
    df[temp_str[1:]] = ''
    for col in subject_cols:
        df[temp_str[1:]] += df[col]
    
    if print_option:
        print("Generated features: category_concat")
        print(f"'{temp_str[1:]}',")
        print()
    del na_col, temp_str, col; gc.collect()

### Target Encoder

In [151]:
class apply_target_encode:
    #def __init__(self, target_col, cat_col):
    #    self.target_col = target_col
    #    self.cat_col = cat_col
    
    # (for train/train in oof_te)
    # fit grouped label stats of given df
    def fit_train(df, target_col, cat_col, m=0, statistic=False):
        # df[target_col] = np.log1p(df[target_col])
        df_group = df.groupby(cat_col)[target_col]
        group_mean = df_group.mean().astype(np.float16)
        temp_stat = []

        # ===== smoothing =====
        if m > 0:
            global_mean = df[target_col].mean()
            group_count = df_group.count().astype(np.float16)
            smoother = ((group_count * group_mean) + (m * global_mean)) / (group_count + m)
            temp_mean = (smoother, f'SMTH_MEAN_{m}')

        # ===== no smoothing =====
        elif m == 0:
            temp_mean = (group_mean, 'MEAN')

        # ===== more target statistic =====
        if statistic:
            group_min = df_group.min().astype(np.float16)
            group_max = df_group.max().astype(np.float16)
            group_std = df_group.std().astype(np.float16)
            group_rng = group_max - group_min
            group_Q1 = df_group.quantile(0.25).astype(np.float16)
            group_Q2 = df_group.median().astype(np.float16)
            group_Q3 = df_group.quantile(0.75).astype(np.float16)
            group_IQR = group_Q3 - group_Q1
            temp_stat = [(group_max, 'MAX'), (group_min, 'MIN'),
                         (group_rng, 'RNG'), (group_std, 'STD'),
                         (group_Q1, 'Q1'), (group_Q2, 'Q2'),
                         (group_Q3, 'Q3'), (group_IQR, 'IQR')]

        temp_stat.append(temp_mean)
        return temp_stat
    
    # (for train/valid in oof_te)
    # transform (encode) given df via given grouped label stats from fit_train
    def transform_valid(temp_stat, df, cat_col, print_option=True):
        for mapper, agg_str in temp_stat:
            df[f'{cat_col}_{agg_str}'] = df[f'{cat_col}'].map(mapper)
            if print_option:
                print(f"'{cat_col}_{agg_str}',")
    
    # (for ordinary use or test in oof_te)
    # fit_train and tranform_valid combined
    def fit_transform(df, test_df, target_col, cat_col, m=0, statistic=False, print_option=True):
        temp_stat = apply_target_encode.fit_train(df, target_col, cat_col, m, statistic)
        for mapper, agg_str in temp_stat:
            df[f'{cat_col}_{agg_str}'] = df[f'{cat_col}'].map(mapper)
            test_df[f'{cat_col}_{agg_str}'] = test_df[f'{cat_col}'].map(mapper)
            if print_option:
                print(f"'{cat_col}_{agg_str}',")
    
    # train/train: fit grouped label statistic (with fit_train)
    # train/valid: encode via the fitted (with transform_valid)
    # test: fit with entire train, encode to test
    # note: smoothing is universally applied.
    # for diff m, utilize each function flexibly

def oof_te(df, test_df, target_col, cat_col, split, m=0, statistic=False, print_option=True):
    # train oof target encode
    for train_idx, valid_idx in split:
        temp_stat = apply_target_encode.fit_train(df=df.loc[train_idx, :],
                                                  target_col=target_col,
                                                  cat_col=cat_col,
                                                  m=m,
                                                  statistic=statistic)
        apply_target_encode.transform_valid(temp_stat=temp_stat,
                                            df=df.loc[valid_idx, :],
                                            cat_col=cat_col,
                                            print_option=False)

    # test oof (=train) target encode
    apply_target_encode.fit_transform(df=df,
                                      test_df=test_df,
                                      target_col=target_col,
                                      cat_col=cat_col,
                                      m=m,
                                      statistic=statistic,
                                      print_option=print_option)
    return df
import pandas as pd
import numpy as np
x_tr = pd.DataFrame(np.array([['a','a','a','a','b','b','b','b','c','c','c','c',],[1,1,1,0,1,1,0,0,1,0,0,0,]]).transpose(), columns=['cat', 'label'])
x_tr['label'] = x_tr.label.astype(np.int8)
x_te = pd.DataFrame(['a', 'a', 'b', 'b', 'c'], columns=['cat'])
cv = [[[0,1,4,5,8,9],[2,3,7,8,10]],[[2,3,7,8,10],[0,1,4,5,8,9]]]
oof_te(x_tr, x_te, 'label', 'cat', split=cv, m=0, statistic=False, print_option=True)


"""
for fold, (train_idx, valid_idx) in enumerate(cv):
    print(fold)
    print(x_tr.loc[train_idx,:])
    print(x_tr.loc[valid_idx,:])

    temp_stat = apply_target_encode.fit_train(df=x_tr.loc[train_idx, :],
                                                      target_col='label',
                                                      cat_col='cat',
                                                      m=0,
                                                      statistic=False)
    print(temp_stat)
    apply_target_encode.transform_valid(temp_stat=temp_stat,
                                        df=x_tr.loc[valid_idx, :],
                                        cat_col='cat',
                                        print_option=True)
    assert 'cat_MEAN' in x_tr
    print(fold)
    print(x_tr.loc[train_idx,:])
    print(x_tr.loc[valid_idx,:])"""
x_tr

'cat_MEAN',


Unnamed: 0,cat,label,cat_MEAN
0,a,1,0.75
1,a,1,0.75
2,a,1,0.75
3,a,0,0.75
4,b,1,0.5
5,b,1,0.5
6,b,0,0.5
7,b,0,0.5
8,c,1,0.25
9,c,0,0.25


### Label Encode (not ordinal)

In [5]:
def apply_label_encode(df, subject_cols):
    for str_col in subject_cols:
        # ===== assumes Series of string =====
        temp_dict = {value: i for i, value in enumerate(df[str_col].unique())}
        df[str_col] = (df[str_col].map(temp_dict)).astype(np.int32)
    del temp_dict, str_col; gc.collect()

### Ratio Encode

In [6]:
def apply_freq_encode(df, str_col, print_option=True):
    temp_dict = {sample: df.loc[df[str_col]==sample].shape[0] for sample in df[str_col].unique()}
    df[f'{str_col}_COUNT'] = df[str_col].map(temp_dict)
    df[f'{str_col}_RATIO'] = df[str_col].map(temp_dict) / df[str_col].shape[0]
    
    if print_option:
        print(f"'{str_col}_COUNT',")
        print(f"'{str_col}_RATIO',")
        print()
    del temp_dict; gc.collect()

## Feature Related (Numerical)

In [7]:
# aggs = {
#     'uid': ['count'],
#     'is_manual': ['sum', 'mean'],
#     'elapsed_days_succeeded_created': ['mean', 'std', 'max', 'min'],
#     'elapsed_days_created_premium': ['mean', 'std', 'max', 'min'],
#     'elapsed_days_created': ['mean', 'std', 'max', 'min'],
#     'elapsed_days_succeeded_premium': ['mean', 'std', 'max', 'min'],
#     'elapsed_days_succeeded': ['mean', 'std', 'max', 'min'],
#     'created_before_premium': ['sum', 'mean'],
#     'created_after_premium': ['sum', 'mean'],
#     'succeeded_before_premium': ['sum', 'mean'],
#     'succeeded_before_premium': ['sum', 'mean'],
# }
# aggs.update({col: ['sum', 'mean'] for col in service_category_id_cols})

# group_account_df = account_df.groupby(ID).agg(aggs)
# group_account_df.columns = [f'{k}_{v.upper()}' for k, vs in aggs.items() for v in vs]
# group_account_df = group_account_df.reset_index()

### Cyclical Encode

In [8]:
def apply_cyclical(df, str_col):
    # e.g. df['hr'] = df.timestamp.dt.hour; apply_cyclical(df, 'hr')
    # ===== assumes integer array =====
    # ===== assumes min and max exists in array =====
    temp = pd.DataFrame()
    temp['unique_sorted'] = (df[str_col] - df[str_col].min()).sort_values().unique()
    int_max = temp.unique_sorted.max()
    temp['sin'] = np.sin(2 * np.pi * temp.unique_sorted / int_max)
    temp['cos'] = np.cos(2 * np.pi * temp.unique_sorted / int_max)
    temp = temp.set_index('unique_sorted')
    df[f'{str_col}_sin'] = (df[str_col] - df[str_col].min()).map(temp.sin)
    df[f'{str_col}_cos'] = (df[str_col] - df[str_col].min()).map(temp.cos)
    del temp, int_max; gc.collect()

### Rolling Statistic

In [9]:
def apply_mov_stat(df, str_col, list_windows, fix=False, print_option=True):

    # ===== assumes timestamp is aligned =====
    for win in list_windows:
        rolled = df[str_col].rolling(window=win, min_periods=0)
        mov_avg = rolled.mean().reset_index() #.astype(np.float16)
        mov_max = rolled.max().reset_index() #.astype(np.float16)
        mov_min = rolled.min().reset_index() #.astype(np.float16)
        mov_std = rolled.std().reset_index() #.astype(np.float16)
        if win >= 4:
            mov_Q1 = rolled.quantile(0.25).reset_index() #.astype(np.float16)
            mov_Q2 = rolled.quantile(0.5).reset_index() #.astype(np.float16)
            mov_Q3 = rolled.quantile(0.75).reset_index() #.astype(np.float16)

        if fix:
            formula = int((win/2) - win)
            df[f'{str_col}_movavg_{win}'] = mov_avg[f'{str_col}'].shift(formula)
            df[f'{str_col}_movmax_{win}'] = mov_max[f'{str_col}'].shift(formula)
            df[f'{str_col}_movmin_{win}'] = mov_min[f'{str_col}'].shift(formula)
            df[f'{str_col}_movstd_{win}'] = mov_std[f'{str_col}'].shift(formula)
            if win >= 4:
                df[f'{str_col}_movQ1_{win}'] = mov_Q1[f'{str_col}'].shift(formula)
                df[f'{str_col}_movQ2_{win}'] = mov_Q2[f'{str_col}'].shift(formula)
                df[f'{str_col}_movQ3_{win}'] = mov_Q3[f'{str_col}'].shift(formula)
            print()
            del formula
        else:
            df[f'{str_col}_movavg_{win}'] = mov_avg[f'{str_col}']
            df[f'{str_col}_movmax_{win}'] = mov_max[f'{str_col}']
            df[f'{str_col}_movmin_{win}'] = mov_min[f'{str_col}']
            df[f'{str_col}_movstd_{win}'] = mov_std[f'{str_col}']
            if win >= 4:
                df[f'{str_col}_movQ1_{win}'] = mov_Q1[f'{str_col}']
                df[f'{str_col}_movQ2_{win}'] = mov_Q2[f'{str_col}']
                df[f'{str_col}_movQ3_{win}'] = mov_Q3[f'{str_col}']
            print()
        
        if print_option:
            print('Generated features: apply_mov_stat')
            print(f"'{str_col}_movavg_{win}',")
            print(f"'{str_col}_movmax_{win}',")
            print(f"'{str_col}_movmin_{win}',")
            print(f"'{str_col}_movstd_{win}',")
            if win >= 4:
                print(f"'{str_col}_movQ1_{win}',")
                print(f"'{str_col}_movQ2_{win}',")
                print(f"'{str_col}_movQ3_{win}',")
            print()
            
    del win, rolled, mov_avg, mov_max, mov_min, mov_std; gc.collect()
    if any([val for val in list_windows if val >= 4]):
        del mov_Q1, mov_Q2, mov_Q3; gc.collect()

### Nonlinear (log1p) Transformation

In [10]:
def apply_nonlinear(df, subject_cols):
    for col in subject_cols:
        temp_count = df[f'{col}'].isna().sum()
        df[f'{col}'] = np.log1p(df[f'{col}'])
        if df[f'{col}'].isna().sum() > temp_count:
            print(f"New nan in '{col}' via apply_nonlinear")
    del col, temp_count; gc.collect()

### Shift Features

In [11]:
def apply_shift_feature(df, subject_cols, list_shift, print_option=True):
    for col in subject_cols:
        for step in list_shift:
            df[f'{col}_shift_{step}'] = df[col].shift(int(step))
            
    if print_option:
        print('Generated features: apply_shift_feature')
        for col in subject_cols:
            for step in list_shift:
                print(f"'{col}_shift_{step}',")
        print()
        
    del col, step; gc.collect()

### Oneth Feature

In [12]:
def apply_oneth_feature(df, str_col, print_option=True):
    import math
    modify = np.vectorize(math.modf)
    oneth, tenth = modify(df[str_col] / 10)
    df[f'{str_col}_oneth'] = oneth * 10
    
    if print_option:
        print('Generated features: apply_oneth_feature')
        print(f"'{str_col}_oneth',")
        print()
        
    del tenth; gc.collect()

### nan Binary Features

In [13]:
def apply_isna_feature(df, subject_cols, print_option=True):
    binary_isna = [col+"_isnan" for col in subject_cols]
    df[binary_isna] = df[subject_cols].isna().astype(int)
    
    if print_option:
        print('Generated features: apply_oneth_feature')
        for col in binary_isna:
            print(f"'{col}',")
        print()
        
    del binary_isna; gc.collect()

### Row nan Count Feature

In [14]:
def apply_row_nan(df, print_option=True):
    df['row_nan'] = df.isna().sum(axis=1).astype(np.int8)
    
    if print_option:
        print('Generated features: apply_row_nan')
        print("'row_nan',")
        print()

### Bruteforce Combination

In [15]:
def bruteforce_combination(df, subject_cols, choose=2, print_option=True):
    from itertools import combinations
    comb = combinations(subject_cols, choose)
    for feat_1, feat_2 in comb:
        df[f'{feat_1}_.+_{feat_2}'] = df[f'{feat_1}'] + df[f'{feat_1}']
        df[f'{feat_1}_.-_{feat_2}'] = df[f'{feat_1}'] - df[f'{feat_1}']
        df[f'{feat_1}_.*_{feat_2}'] = df[f'{feat_1}'] * df[f'{feat_1}']
        df[f'{feat_1}_./_{feat_2}'] = df[f'{feat_1}'] / df[f'{feat_1}']
            
    if print_option:
        print('Generated features: bruteforce_feature_combination')
        for feat_1, feat_2 in comb:
            print(f"'{feat_1}_.+_{feat_2}',")
            print(f"'{feat_1}_.-_{feat_2}',")
            print(f"'{feat_1}_.*_{feat_2}',")
            print(f"'{feat_1}_./_{feat_2}',")
        print()
            
    del comb, feat_1, feat_2; gc.collect()

## Data Related

### Clipping

In [16]:
def apply_clip(df, str_col, pct_lower, pct_upper):
    LB, UB = np.percentile(df[str_col], [pct_lower, pct_upper])
    df[str_col] = np.clip(df[str_col], LB, UB)
    del LB, UB; gc.collect()

### Interpolation

In [17]:
def apply_interpolation(df, subject_cols, int_order, supp_median_fill=False):
    lin = lambda var: var.interpolate(method='linear', limit_direction='both')
    pol = lambda var: var.interpolate(method='polynomial', order=int_order, limit_direction='both')
    
    # ===== in ASHRAE, grouping was done via site_id =====
    # linear = df.groupby(grouping_col).apply(lin)
    # polyno = df.groupby(grouping_col).apply(pol)
    
    linear = df[subject_cols].apply(lin)
    polyno = df[subject_cols].apply(pol)
    df[subject_cols] = (linear[subject_cols] + polyno[subject_cols]) * 0.5
    
    # ===== if missing value remains: =====
    if supp_median_fill:
        #[col for col in cols if temp[col].isna().sum() > 0]
        for col in subject_cols:
            df[col].fillna(df[col].median(), inplace=True)
            del col
    del lin, pol, linear, polyno; gc.collect()

### Adversarial Validation with LightGBM

In [18]:
def advarsarial_validation_lightgbm(
    params,
    X_train,
    X_test,
    features,
    categorical=[],
    n_splits=5,
    shuffle=True,
    seed=42,
):
    X_train_adv = X_train.copy()
    X_test_adv = X_test.copy()
    
    X_train_adv['test'] = 0
    X_test_adv['test'] = 1
    
    X_train_adv = pd.concat([X_train_adv, X_test_adv], axis=0).reset_index(drop=True)
    y_train_adv = X_train_adv['test']
    X_train_adv = X_train_adv.drop('test', axis=1)
    
    printl(f'{X_train_adv.shape}, {y_train_adv.shape}, {len(features)}')
    
    cv = build_cv_spliter(X_train_adv,
                          y_train_adv,
                          strategy='stratified',
                          n_splits=n_splits,
                          shuffle=shuffle,
                          random_seed=seed)
    
    adv_metrics = {'AUC': roc_auc_score}
    _, adv, feature_importance_df = run_kfold_lightgbm(params,
                                                       X_train_adv,
                                                       y_train_adv,
                                                       X_train,
                                                       cv,
                                                       features,
                                                       adv_metrics,
                                                       categorical=cat_features)
    
    return adv, feature_importance_df

## Modelling Related

In [10]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

### Cross Validation Split

In [None]:
class MyGroupKFold:
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state
    
    def split(self, X, y=None, groups=None):
        groups = pd.Series(groups)
        unique_groups = np.unique(groups)
        kf = KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state)
        for tr_group_idx, va_group_idx in kf.split(unique_groups):
            tr_groups, va_groups = unique_groups[tr_group_idx], unique_groups[va_group_idx]
            tr_indices = groups[groups.isin(tr_groups)].index.to_list()
            va_indices = groups[groups.isin(va_groups)].index.to_list()
            yield tr_indices, va_indices
            
class StratifiedGroupKFold:
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state
        
    # Implementation based on this kaggle kernel:
    #    https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation
    def split(self, X, y=None, groups=None):
        k = self.n_splits
        rnd = check_random_state(self.random_state)
            
        # labels_num: zero-origin number of label
        # ex) unique = [0,1,2,3] -> labels_num = 4
        labels_num = np.max(y) + 1
        
        # y_counts_per_group: in-group label distribution
        # y_distr: whole label distribution
        y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
        y_distr = Counter()
        for label, g in zip(y, groups):
            y_counts_per_group[g][label] += 1
            y_distr[label] += 1

        # y_counts_per_fold: in-fold label distribution
        y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
        groups_per_fold = defaultdict(set)
        
        # return mean std of per label counts when y_counts is in fold
        def eval_y_counts_per_fold(y_counts, fold):
            y_counts_per_fold[fold] += y_counts
            std_per_label = []
            for label in range(labels_num):
                label_std = np.std(
                    [y_counts_per_fold[i][label] / y_distr[label] for i in range(k)]
                )
                std_per_label.append(label_std)
            y_counts_per_fold[fold] -= y_counts
            return np.mean(std_per_label)
        
        # list of [group, y_counts]
        # if shuffle: fold changes in same np.std(y_counts)
        # ascending groups by degree of label variance
        groups_and_y_counts = list(y_counts_per_group.items())
        if self.shuffle:
            rnd.shuffle(groups_and_y_counts)
        groups_and_y_counts = sorted(groups_and_y_counts, key=lambda x: -np.std(x[1]))

        # set fold for each group such that label distirbution will be uniform
        for g, y_counts in groups_and_y_counts:
            best_fold = None
            min_eval = None
            for i in range(k):
                fold_eval = eval_y_counts_per_fold(y_counts, i)
                if min_eval is None or fold_eval < min_eval:
                    min_eval = fold_eval
                    best_fold = i
            y_counts_per_fold[best_fold] += y_counts
            groups_per_fold[best_fold].add(g)

        all_groups = set(groups)
        for i in range(k):
            train_groups = all_groups - groups_per_fold[i]
            test_groups = groups_per_fold[i]

            train_indices = [i for i, g in enumerate(groups) if g in train_groups]
            test_indices = [i for i, g in enumerate(groups) if g in test_groups]

            yield train_indices, test_indices

In [6]:
def build_cv_spliter(
    X_train,
    y_train,
    strategy='stratified',
    n_splits=5,
    group=None,
    shuffle=True,
    seed=8982,
    return_indices=False,
):
    if strategy == 'kfold':
        kf = KFold(n_splits=n_splits, random_state=seed, shuffle=shuffle)
        cv = kf.split(X_train)
    elif strategy == 'stratified':
        kf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=shuffle)
        cv = kf.split(X_train, y_train)
    elif strategy == 'group':
        kf = MyGroupKFold(n_splits=n_splits, random_state=seed, shuffle=shuffle)
        cv = kf.split(X_train, y_train, group)
    elif strategy == 'stratified-group':
        kf = StratifiedGroupKFold(n_splits=n_splits, random_state=seed, shuffle=shuffle)
        cv = kf.split(X_train, y_train, group)
    else:
        raise NotImplementedError(f'strategy {strategy} not implemented.')

    if not return_indices:
        cv_spliter = []
        for dev_idx, val_idx in cv:
            cv_spliter.append([dev_idx, val_idx])
        return cv_spliter
    else:
        fold_indices = np.zeros(len(X), dtype=np.int64)
        for fold, (_, val_idx) in enumerate(cv):
            fold_indices[val_idx] = int(fold)
        return fold_indices

### LightGBM (Binary Classification: max auc)

In [19]:
import lightgbm as lgb
# from sklearn.model_selection import StratifiedKFold

In [118]:
def lgb_kfold_clf(X_train, y_train, category_cols, split, bayes_opt=True,
                  learning_rate=0.05, num_leaves=31, max_depth=-1,
                  bagging_fraction=0.9, feature_fraction=0.9,
                  min_child_weight=1e-3, min_data_in_leaf=20,
                  lambda_l1=0.0, lambda_l2=0.0):
    metric='auc'
    params = {'objective': 'binary',
              'metric': metric,
              'boosting': 'gbdt',
              'seed': 8982,
              'learning_rate': learning_rate,
              'num_leaves': int(num_leaves),
              'max_depth': int(max_depth),
              'bagging_freq': int(5),
              'bagging_fraction': bagging_fraction,
              'feature_fraction': feature_fraction,
              'min_child_weight': min_child_weight,   
              'min_data_in_leaf': int(min_data_in_leaf),
              'lambda_l1': lambda_l1,
              'lambda_l2': lambda_l2}
              #'verbosity': int(-1)}
             
    #cat_features = [X_train.columns.get_loc(cat_col) for cat_col in category_cols]
    #print(cat_features)
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    models = []; learning_curves = []; best_scores = []; valid_score = []
    feature_importance_df = pd.DataFrame()
    
    print(f'========== LightGBM Classifier training on : {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        d_train = lgb.Dataset(X_train.iloc[train_idx,:], label=y_train[train_idx], categorical_feature=category_cols)
        d_valid = lgb.Dataset(X_train.iloc[valid_idx,:], label=y_train[valid_idx], categorical_feature=category_cols)
        
        print(f'========== LightGBM Classifier training: {i+1}/{n_splits} fold ==========')
        learning_curve = {}
        model = lgb.train(params,
                          train_set=d_train,
                          valid_sets=[d_train, d_valid],
                          num_boost_round=5000,
                          early_stopping_rounds=20,
                          evals_result=learning_curve,
                          verbose_eval=200#False,
                          )
        best_score = {f'train_{metric}': model.best_score['training'][f'{metric}'],
                      f'valid_{metric}': model.best_score['valid_1'][f'{metric}']}
        print()
        oofs[valid_idx] = model.predict(X_train.iloc[valid_idx,:], num_iteration=model.best_iteration)
        models.append(model)
        learning_curves.append(learning_curve)
        valid_score.append(best_score[f'valid_{metric}'])
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = common_cols + category_cols
        fold_importance_df['importance'] = model.feature_importance()
        fold_importance_df['fold'] = i+1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
        del d_train, d_valid, fold_importance_df
        gc.collect()
        
    valid_std_score = np.std(valid_score)
    valid_avg_score = np.mean(valid_score)
    print('====================')
    print(f'CV AVG: {metric} - {valid_avg_score}')
    print(f'CV STD: {metric} - {valid_std_score}')
    print('====================')

    if bayes_opt:
        return valid_avg_score
    else:
        return oofs, models, feature_importance_df #best_scores, learning_curves

def lgb_clf_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'learning_rate': (0.001, 0.3),
              'num_leaves': (20, 500), 
              #'max_depth': (-1, 250),
              'bagging_fraction' : (0.1, 1),
              'feature_fraction' : (0.1, 1),
              'min_child_weight': (0.001, 0.99),   
              'min_data_in_leaf': (3, 700),
              'lambda_l1': (0.1, 300), 
              'lambda_l2': (0.1, 300)}
    
    optimizer = BayesianOptimization(f=lgb_kfold_clf, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = optimizer.max['target']
    return param, cv

### LightGBM (Regression: min rmse)

In [125]:
import lightgbm as lgb
# from sklearn.model_selection import KFold

In [147]:
def lgb_kfold_reg(X_train, y_train, category_cols, split, bayes_opt=True,
                  learning_rate=0.05, num_leaves=31, max_depth=-1,
                  bagging_fraction=0.9, feature_fraction=0.9,
                  min_child_weight=1e-3, min_data_in_leaf=20,
                  lambda_l1=0.0, lambda_l2=0.0):
    metric='rmse'
    params = {'objective': 'regression',
              'metric': metric,
              'boosting': 'gbdt',
              'seed': 8982,
              'learning_rate': learning_rate,
              'num_leaves': int(num_leaves),
              'max_depth': int(max_depth),
              'bagging_freq': int(5),
              'bagging_fraction': bagging_fraction,
              'feature_fraction': feature_fraction,
              'min_child_weight': min_child_weight,   
              'min_data_in_leaf': int(min_data_in_leaf),
              'lambda_l1': lambda_l1,
              'lambda_l2': lambda_l2}
              #'verbosity': int(-1)}
             
    #cat_features = [X_train.columns.get_loc(cat_col) for cat_col in category_cols]
    #print(cat_features)
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    models = []; learning_curves = []; valid_scores = []
    feature_importance_df = pd.DataFrame()
    
    print(f'========== LightGBM Regressor training on: {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        d_train = lgb.Dataset(X_train.iloc[train_idx,:], label=y_train[train_idx], categorical_feature=category_cols)
        d_valid = lgb.Dataset(X_train.iloc[valid_idx,:], label=y_train[valid_idx], categorical_feature=category_cols)
        
        print(f'========== LightGBM Regressor training: {i+1}/{n_splits} fold ==========')
        learning_curve = {}
        model = lgb.train(params,
                          train_set=d_train,
                          valid_sets=[d_train, d_valid],
                          num_boost_round=5000,
                          early_stopping_rounds=20,
                          evals_result=learning_curve,
                          verbose_eval=200#False,
                          )
        print()
        oofs[valid_idx] = model.predict(X_train.iloc[valid_idx,:], num_iteration=model.best_iteration)
        models.append(model)
        learning_curves.append(learning_curve)
        valid_scores.append(model.best_score['valid_1'][f'{metric}'])
          
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = common_cols + category_cols
        fold_importance_df['importance'] = model.feature_importance()
        fold_importance_df['fold'] = i+1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)  
        
        del d_train, d_valid, fold_importance_df
        gc.collect()


    valid_std_score = np.std(valid_scores)
    valid_avg_score = np.mean(valid_scores)
    print('====================')
    print(f'CV AVG: {metric} - {valid_avg_score}')
    print(f'CV STD: {metric} - {valid_std_score}')
    print('====================')
    
    if bayes_opt:
        return -valid_avg_score
    else:
        return oofs, models, feature_importance_df #best_scores, learning_curves

def lgb_reg_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'learning_rate': (0.001, 0.3),
              'num_leaves': (20, 500),
              #'max_depth': (-1, 250),
              'bagging_fraction' : (0.1, 1),
              'feature_fraction' : (0.1, 1),
              'min_child_weight': (0.001, 0.99),   
              'min_data_in_leaf': (3, 700),
              'lambda_l1': (0.1, 300), 
              'lambda_l2': (0.1, 300)}
    
    optimizer = BayesianOptimization(f=lgb_kfold_reg, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', -optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = -optimizer.max['target']
    return param, cv

In [129]:
def lgb_pred(X_test, models):
    y_test_pred_total = np.zeros(X_test.shape[0])
    for i, model in enumerate(models):
        print(f'========== LightGBM Predicting with {i+1}-th model ==========')
        y_pred_test = model.predict(X_test, num_iteration=model.best_iteration)
        y_test_pred_total += y_pred_test
    y_test_pred_total /= len(models)
    return y_test_pred_total

### CatBoost (Binary Classification: max AUC)

In [86]:
import catboost as cb

In [152]:
def cb_kfold_clf(X_train, y_train, category_cols, split, bayes_opt=True,
                learning_rate=0.03, num_leaves=31, max_depth=6,
                subsample=0.8, bagging_temperature=1.0, colsample_bylevel=1.0,
                min_data_in_leaf=1, l2_leaf_reg=3.0, random_strength=1.0):
    loss = 'Logloss'
    metric = 'AUC'
    params = {'loss_function': loss,
              'eval_metric': metric,
              'boosting_type': 'Plain',
              'random_seed': 8982,
              'num_boost_round': 5000,
              'early_stopping_rounds': 20,
              'use_best_model': True,
              # 'grow_policy': 'SymmetricTree','Depthwise','Lossguide',
              'nan_mode': 'Max',
              'od_type': 'Iter',
              'verbose': 200,
              
              'learning_rate': learning_rate,
              'num_leaves': int(num_leaves),
              'max_depth': int(max_depth),
              'subsample': subsample, #bf?
              'bagging_temperature': bagging_temperature, #bf?
              'colsample_bylevel': colsample_bylevel, #ff
              'min_data_in_leaf': int(min_data_in_leaf),
              'l2_leaf_reg': l2_leaf_reg,
              'random_strength': random_strength}
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    feature_importance_df = pd.DataFrame()
    
    models = []; learning_curves = []; valid_losses = []; valid_metrics = []
    print(f'========== CatBoost Classifier training on: {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        print(f'========== CatBoost Classifier training: {i+1}/{n_splits} ==========')
        train_d = cb.Pool(data=X_train.loc[train_idx],
                          label=y_train[train_idx],
                          cat_features=category_cols)
        valid_d = cb.Pool(data=X_train.loc[valid_idx],
                          label=y_train[valid_idx],
                          cat_features=category_cols)
        
        model = cb.CatBoostClassifier(**params)
        model.fit(train_d, eval_set=valid_d)
        
        oofs[valid_idx] = model.predict_proba(X_train.loc[valid_idx])[:,1]
        models.append(model)
        valid_losses.append(model.best_score_['validation'][f'{loss}'])
        valid_metrics.append(model.best_score_['validation'][f'{metric}'])
          
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = common_cols + category_cols
        fold_importance_df['importance'] = model.get_feature_importance()
        fold_importance_df['fold'] = i+1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
          
        del train_d, valid_d, model, fold_importance_df
        gc.collect()
        
    print('====================')
    print(f'CV AVG:\n{loss} - {np.mean(valid_losses)}\n{metric} - {np.mean(valid_metrics)}')
    print(f'CV STD:\n{loss} - {np.std(valid_losses)}\n{metric} - {np.std(valid_metrics)}')
    print('====================')
    
    if bayes_opt:
        return np.mean(valid_metrics)
    else:
        return oofs, models, feature_importance_df

def cb_clf_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'learning_rate': (0.001, 0.3),
              'num_leaves': (16, 288), 
              'max_depth': (3, 16),
              'subsample' : (0.1, 1),
              'bagging_temperature' : (0, 100),
              'colsample_bylevel': (0.001, 1),   
              'min_data_in_leaf': (3, 700),
              'l2_leaf_reg': (0.1, 300), 
              'random_strength': (0, 100)}
    
    optimizer = BayesianOptimization(f=cb_kfold_clf, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = optimizer.max['target']
    return param, cv

In [168]:
def cb_pred_clf(X_test, models):
    y_test_pred_total = np.zeros(X_test.shape[0])
    for i, model in enumerate(models):
        print(f'========== CatBoost Predicting with {i+1}-th model ==========')
        y_pred_test = model.predict_proba(X_test)[:,1]
        y_test_pred_total += y_pred_test
    y_test_pred_total /= len(models)
    return y_test_pred_total

### CatBoost (Regression: min RMSE)

In [145]:
def cb_kfold_reg(X_train, y_train, category_cols, split, bayes_opt=True,
                learning_rate=0.03, num_leaves=31, max_depth=6,
                subsample=0.8, bagging_temperature=1.0, colsample_bylevel=1.0,
                min_data_in_leaf=1, l2_leaf_reg=3.0, random_strength=1.0):
    loss = 'RMSE'
    metric = 'RMSE'
    params = {'loss_function': loss,
              'eval_metric': metric,
              'boosting_type': 'Plain',
              'random_seed': 8982,
              'num_boost_round': 5000,
              'early_stopping_rounds': 20,
              'use_best_model': True,
              # 'grow_policy': 'SymmetricTree','Depthwise','Lossguide',
              'nan_mode': 'Max',
              'od_type': 'Iter',
              'verbose': 200,
              
              'learning_rate': learning_rate,
              'num_leaves': int(num_leaves),
              'max_depth': int(max_depth),
              'subsample': subsample, #bf?
              'bagging_temperature': bagging_temperature, #bf?
              'colsample_bylevel': colsample_bylevel, #ff
              'min_data_in_leaf': int(min_data_in_leaf),
              'l2_leaf_reg': l2_leaf_reg,
              'random_strength': random_strength}
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    feature_importance_df = pd.DataFrame()
    
    models = []; valid_losses = []; valid_metrics = []
    print(f'========== CatBoost Regressor training on: {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        print(f'========== CatBoost Regressor training: {i+1}/{n_splits} ==========')
        train_d = cb.Pool(data=X_train.loc[train_idx],
                          label=y_train[train_idx],
                          cat_features=category_cols)
        valid_d = cb.Pool(data=X_train.loc[valid_idx],
                          label=y_train[valid_idx],
                          cat_features=category_cols)
        
        model = cb.CatBoostRegressor(**params)
        model.fit(train_d, eval_set=valid_d)
        
        oofs[valid_idx] = model.predict(X_train.loc[valid_idx])
        models.append(model)
        # valid_losses.append(model.best_score_['validation'][f'{loss}'])
        valid_metrics.append(model.best_score_['validation'][f'{metric}'])
          
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = common_cols + category_cols
        fold_importance_df['importance'] = model.get_feature_importance()
        fold_importance_df['fold'] = i+1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
          
        del train_d, valid_d, model, fold_importance_df
        gc.collect()
    
    assert loss == metric
    print('====================')
    print(f'CV AVG: {metric} - {np.mean(valid_metrics)}')
    print(f'CV STD: {metric} - {np.std(valid_metrics)}')
    print('====================')
    
    if bayes_opt:
        return -np.mean(valid_metrics)
    else:
        return oofs, models, feature_importance_df

def cb_reg_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'learning_rate': (0.001, 0.3),
              'num_leaves': (16, 288), 
              'max_depth': (3, 16),
              'subsample' : (0.1, 1),
              'bagging_temperature' : (0, 100),
              'colsample_bylevel': (0.001, 1),   
              'min_data_in_leaf': (3, 700),
              'l2_leaf_reg': (0.1, 300), 
              'random_strength': (0, 100)}
    
    optimizer = BayesianOptimization(f=cb_kfold_clf, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', -optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = -optimizer.max['target']
    return param, cv

In [169]:
def cb_pred_reg(X_test, models):
    y_test_pred_total = np.zeros(X_test.shape[0])
    for i, model in enumerate(models):
        print(f'========== CatBoost Predicting with {i+1}-th model ==========')
        y_pred_test = model.predict(X_test)#[:,1]
        y_test_pred_total += y_pred_test
    y_test_pred_total /= len(models)
    return y_test_pred_total

In [None]:
# is there no need to set cb.predict(, best_iteration=?)

### XGBoost (Binary Classification: max auc)

In [77]:
import xgboost as xgb
# encode categorical cols beforehand!

In [179]:
def xgb_kfold_clf(X_train, y_train, split, bayes_opt=True,
                  eta=0.3, gamma=0, max_depth=6, min_child_weight=1,
                  subsample=1, colsample_bytree=1.0, colsample_bylevel=1.0,
                  colsample_bynode=1.0, reg_lambda=1.0, reg_alpha=0.0):
    metric = 'auc'
    params = {'objective': 'binary:logistic',
              'eval_metric': metric,
              'booster': 'gbtree', # 'dart',
              'seed': 8982,

              'missing': np.nan,
              # when dart: 'rate_drop': (0.0, 1.0),
              # 'grow_policy': 'depthwise','lossguide',
              # 'verbosity': 1, #0
              # 'base_score': 0.5 <- initial leaf prediction
              
              'eta': eta,
              # 'gamma': gamma, # pruning phase of split in XGBoost unique tree
              'max_depth': int(max_depth),
              'min_child_weight': min_child_weight,
              'subsample': subsample, # 0.5 - randomly sample half of the training data prior to growing trees
              'colsample_bytree': colsample_bytree, # subsample ratio of columns when constructing each tree
              'colsample_bylevel': colsample_bylevel, # subsample ratio of columns for each level
              'colsample_bynode': colsample_bynode, # subsample ratio of columns for each node (split)
              'reg_lambda': reg_lambda,
              'reg_alpha': reg_alpha}
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    
    models = []; learning_curves = []; valid_losses = []; valid_metrics = []
    print(f'========== XGBoost Classifier training on: {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        print(f'========== XGBoost Classifier training: {i+1}/{n_splits} ==========')
        train_d = xgb.DMatrix(data=X_train.loc[train_idx],
                              label=y_train[train_idx],
                              feature_names=common_cols+category_cols)
                              #cat_features=category_cols)
        valid_d = xgb.DMatrix(data=X_train.loc[valid_idx],
                              label=y_train[valid_idx],
                              feature_names=common_cols+category_cols)
                              #cat_features=category_cols)
        learning_curve = {}
        model = xgb.train(params,
                          train_d,
                          evals=[(train_d, 'train'), (valid_d, 'valid')],
                          num_boost_round=5000,
                          early_stopping_rounds=20,
                          verbose_eval=200,#False
                          evals_result=learning_curve)
        oofs[valid_idx] = model.predict(valid_d, ntree_limit=model.best_ntree_limit)
        models.append(model)
        learning_curves.append(learning_curve)
        valid_metrics.append(model.best_score)
          
        del train_d, valid_d, model
        gc.collect()
        
    print('====================')
    print(f'CV AVG: {metric} - {np.mean(valid_metrics)}')
    print(f'CV STD: {metric} - {np.std(valid_metrics)}')
    print('====================')
    
    if bayes_opt:
        return np.mean(valid_metrics)
    else:
        return oofs, models #, learning_curves

def cb_clf_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'eta': (0.001, 0.3),
              'max_depth': (3, 250),
              'min_child_weight': (0, 100),
              'subsample': (0.1, 1),
              'colsample_bytree': (0.1, 1),
              'colsample_bylevel': (0.1, 1),
              'colsample_bynode': (0.1, 1),
              'reg_lambda': (0, 300),
              'reg_alpha': (0, 300)}
    
    optimizer = BayesianOptimization(f=cb_kfold_clf, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = optimizer.max['target']
    return param, cv

### XGBoost (Regression: max rmse)

In [192]:
def xgb_kfold_reg(X_train, y_train, split, bayes_opt=True,
                  eta=0.3, gamma=0, max_depth=6, min_child_weight=1,
                  subsample=1, colsample_bytree=1.0, colsample_bylevel=1.0,
                  colsample_bynode=1.0, reg_lambda=1.0, reg_alpha=0.0):
    metric = 'rmse'
    params = {'objective': 'reg:squarederror',
              'eval_metric': metric,
              'booster': 'gbtree', # 'dart',
              'seed': 8982,

              'missing': np.nan,
              # when dart: 'rate_drop': (0.0, 1.0),
              # 'grow_policy': 'depthwise','lossguide',
              # 'verbosity': 1, #0
              # 'base_score': 0.5 <- initial leaf prediction
              
              'eta': eta,
              # 'gamma': gamma, # pruning phase of split in XGBoost unique tree
              'max_depth': int(max_depth),
              'min_child_weight': min_child_weight,
              'subsample': subsample, # 0.5 - randomly sample half of the training data prior to growing trees
              'colsample_bytree': colsample_bytree, # subsample ratio of columns when constructing each tree
              'colsample_bylevel': colsample_bylevel, # subsample ratio of columns for each level
              'colsample_bynode': colsample_bynode, # subsample ratio of columns for each node (split)
              'reg_lambda': reg_lambda,
              'reg_alpha': reg_alpha}
    
    n_splits = len(split)
    oofs = np.zeros(X_train.shape[0])
    
    models = []; learning_curves = []; valid_losses = []; valid_metrics = []
    print(f'========== XGBoost Regressor training on: {X_train.shape} ==========')
    for i, (train_idx, valid_idx) in enumerate(split):
        print(f'========== XGBoost Regressor training: {i+1}/{n_splits} ==========')
        train_d = xgb.DMatrix(data=X_train.loc[train_idx],
                              label=y_train[train_idx],
                              feature_names=common_cols+category_cols)
                              #cat_features=category_cols)
        valid_d = xgb.DMatrix(data=X_train.loc[valid_idx],
                              label=y_train[valid_idx],
                              feature_names=common_cols+category_cols)
                              #cat_features=category_cols)
        learning_curve = {}
        model = xgb.train(params,
                          train_d,
                          evals=[(train_d, 'train'), (valid_d, 'valid')],
                          num_boost_round=5000,
                          early_stopping_rounds=20,
                          verbose_eval=200,#False
                          evals_result=learning_curve)
        oofs[valid_idx] = model.predict(valid_d, ntree_limit=model.best_ntree_limit)
        models.append(model)
        learning_curves.append(learning_curve)
        valid_metrics.append(model.best_score)
          
        del train_d, valid_d, model
        gc.collect()
        
    print('====================')
    print(f'CV AVG: {metric} - {np.mean(valid_metrics)}')
    print(f'CV STD: {metric} - {np.std(valid_metrics)}')
    print('====================')
    
    if bayes_opt:
        return -np.mean(valid_metrics)
    else:
        return oofs, models #, learning_curves

def xgb_clf_bayes_opt(init_points=20, n_iteration=80):
    bounds = {'eta': (0.001, 0.3),
              'max_depth': (3, 250),
              'min_child_weight': (0, 100),
              'subsample': (0.1, 1),
              'colsample_bytree': (0.1, 1),
              'colsample_bylevel': (0.1, 1),
              'colsample_bynode': (0.1, 1),
              'reg_lambda': (0, 300),
              'reg_alpha': (0, 300)}
    
    optimizer = BayesianOptimization(f=cb_kfold_clf, pbounds=bounds, random_state=8982)
    optimizer.maximize(init_points=init_points, n_iter=n_iteration)
    
    print('Best score:', -optimizer.max['target'])
    print('Best set of parameters:')
    print(optimizer.max['params'])
    param = optimizer.max['params']; cv = -optimizer.max['target']
    return param, cv

In [183]:
def xgb_pred(X_test, models):
    y_pred_test_total = np.zeros(X_test.shape[0])
    test_d = xgb.DMatrix(X_test)
    for i, model in enumerate(models):
        print(f'========== XGBoost Predicting with {i+1}-th model ==========')
        y_pred_test = model.predict(test_d, ntree_limit=model.best_ntree_limit)
        y_pred_test_total += y_pred_test
    y_pred_test_total /= len(models)
    return y_pred_test_total

### Neural Net

In [25]:
import tensorflow as tf
import tensorflow.keras as keras
# import torch

In [26]:
def build_neuralnet(
    recipe,
    loss='mse',
    optimizer='adam',
    lr=1e-3,
    monitor='val_loss',
    es_patience=-1,
    restore_best_weights=True,
    lr_scheduler='none',
    lr_factor=0.1,
    lr_patience=5,
    seed=42,
    **_,
):
    tf.random.set_seed(seed)
    model = keras.models.model_from_json(recipe)
    
    if loss == 'mse':
        loss = keras.losses.mean_squared_error
    elif loss == 'bce':
        loss = keras.losses.binary_crossentropy
    else:
        raise NotImplementedError
    
    if optimizer == 'adam':
        optimizer = keras.optimizers.Adam(lr)
    else:
        raise NotImplementedError
    
    model.compile(optimizer=optimizer, loss=loss)
    
    callbacks = []
    
    if es_patience >= 0:
        es = keras.callbacks.EarlyStopping(monitor=monitor,
                                           patience=es_patience,
                                           restore_best_weights=restore_best_weights,
                                           verbose=1)
        callbacks.append(es)
    
    if lr_scheduler == 'none':
        pass
    elif lr_scheduler == 'reduce_on_plateau':
        lr_sche = keras.callbacks.ReduceLROnPlateau(monitor=monitor,
                                                    factor=lr_factor,
                                                    patience=lr_patience,
                                                    verbose=1)
        callbacks.append(lr_sche)
    else:
        raise NotImplementedError
    
    return model, callbacks


def train_neuralnet(
    params,
    X_train,
    y_train,
    validation_data=None,
):
    model, callbacks = build_neuralnet(**params)
    model.fit(X_train,
              y_train,
              validation_data=validation_data,
              batch_size=params['batch_size'],
              epochs=params['epochs'],
              callbacks=callbacks)
    return model


def run_kfold_neuralnet(
    params,
    X_train,
    y_train,
    X_test,
    cv,
    features,
    metrics,
):
    oof = np.zeros(len(X_train))
    predictions = np.zeros(len(X_test))
    
    n_splits = len(cv)
    printl(f"k={n_splits} folds neuralnet running...")
    printl(f"train data/feature shape: {X_train[features].shape}")
    
    for fold, (dev_idx, val_idx) in enumerate(cv):
        validation_data = [X_train.loc[val_idx, features], y_train[val_idx]]
        model = train_neuralnet(params,
                                X_train.loc[dev_idx, features],
                                y_train[dev_idx],
                                validation_data=validation_data)
        
        oof[val_idx] = model.predict(X_train.loc[val_idx, features].values)[:,0]
        predictions += model.predict(X_test[features].values)[:,0] / n_splits
        
        msg = f'fold: {fold}'
        for name, func in metrics.items():
            score = func(y_train[val_idx], oof[val_idx])
            msg += f' - {name}: {score:.5f}'
        printl(msg)
    
    msg = f'CV score'
    for name, func in metrics.items():
        score = func(y_train, oof)
        msg += f' - {name}: {score:.5f}'
    printl(msg)

    return oof, predictions

## Feature Selection

### Plot Feature Importance

In [27]:
def plot_feature_importance(
    feature_importance_df,
    feature_name='feature',
    importance_name=['split', 'gain'],
    top_k=50,
    fig_width=16,
    fig_height=8,
    fontsize=14,
):
    if isinstance(importance_name, str):
        importance_name = [importance_name]
    
    num_importance = len(importance_name)
    plt.figure(figsize=(fig_width, fig_height*num_importance))
    gs = gridspec.GridSpec(1, num_importance)
    
    def _fetch_best_features(df, fimp='gain'):
        cols = (df[[feature_name, fimp]]
                .groupby(feature_name)
                .mean()
                .sort_values(by=fimp, ascending=False)
                .index
                .values[:top_k])
        return cols, df.loc[df[feature_name].isin(cols)]
    
    for i, fimp in enumerate(importance_name):
        cols, best_features = _fetch_best_features(feature_importance_df, fimp)
        ax = plt.subplot(gs[0, i])
        sns.barplot(x=fimp, y=feature_name, data=best_features, order=cols, ax=ax)
        title = f'Features {fimp} importance (averaged/folds)'
        plt.title(title, fontweight='bold', fontsize=fontsize)
    
    plt.tight_layout()

# or this
# fold_importance_df.plot.barh(x='feature', y='gain', figsize=(13,20))

### Iterative Feature Elimination by LightGBM

In [28]:
class Iterative_CV:
    def __init__(self, X_train_full, y_train, eval_cols, metric):
        self.X_train_full = X_train_full
        self.y_train = y_train
        self.eval_cols = eval_cols
        self.metric = metric
    
    # =====
    # eliminates/imputes one feature at a time,
    # returns list with options and discards
    # ====
    
    def iter_cv_elim():
        excl_improve = []; excl_worse = []
        if self.metric == 'rmse':
            init_valid_avg_score = -1 * lgb_kfold(self.X_train_full, self.y_train, bayes_opt=True)
            print(f'[Iter_Feature_Elim] Current best score is {init_valid_avg_score}')
            for cols in tqdm(self.eval_cols):
                temp_cols = list(set(self.X_train_full.columns) - {col})
                X_train = self.X_train_full[temp_cols]
                new_valid_avg_score = -1 * lgb_kfold(X_train, self.y_train, bayes_opt=True)
                degree = new_valid_avg_score - init_valid_avg_score
                if degree < 0:
                    pct = 100 * (-1 * degree / init_valid_avg_score)
                    excl_improve.append([col, pct])
                    print(f"[Iter_Feature_Elim] '{col}', exclusion improved (lowered) avg CV by {pct}pct.")
                else:
                    pct = 100 * (degree / init_valid_avg_score)
                    excl_worse.append([col, pct])
                    print(f"[Iter_Feature_Elim] '{col}', exclusion worsened (raised) avg CV by {pct}pct.")
        elif self.metric == 'auc':
            init_valid_avg_score = lgb_skfold(self.X_train_full, self.y_train, bayes_opt=True)
            print(f'[Iter_Feature_Elim] Current best score is {init_valid_avg_score}')
            for col in tqdm(self.eval_cols):
                temp_cols = list(set(self.X_train_full.columns) - {col})
                X_train = self.X_train_full[temp_cols]
                new_valid_avg_score = lgb_skfold(X_train, self.y_train, bayes_opt=True)
                degree = new_valid_avg_score - init_valid_avg_score
                if degree > 0:
                    pct = 100 * (degree / init_valid_avg_score)
                    excl_improve.append([col, pct])
                    print(f"[Iter_Feature_Elim] '{col}', exclusion improved (raised) avg CV by {pct}pct.")
                else:
                    pct = 100 * (-1 * degree / init_valid_avg_score)
                    excl_worse.append([col, pct])
                    print(f"[Iter_Feature_Elim] '{col}', exclusion worsened (lowered) avg CV by {pct}pct.")

        excl_improve.sort(key=lambda lst: lst[1])
        excl_worse.sort(key=lambda lst: lst[1])
        del init_valid_avg_score, cols, temp_cols, X_train, new_valid_avg_score, degree, pct
        gc.collect()
        return excl_improve, excl_worse
    
    def iter_cv_rank():
        impt = []
        if self.metric == 'rmse':
            for col in tqdm(self.eval_cols):
                X_train = self.X_train_full[col]
                assert X_train.shape[1] == 1
                print(f"[Iter_Feature_Rank] '{col}', evaluation ongoing.")
                valid_avg_score = -1 * lgb_kfold(X_train, self.y_train, bayes_opt=True)
                impt.append([col, valid_avg_score])
        elif self.metric == 'auc':
            for col in tqdm(self.eval_cols):
                X_train = self.X_train_full[col]
                assert X_train.shape[1] == 1
                print(f"[Iter_Feature_Rank] '{col}', evaluation ongoing.")
                valid_avg_score = lgb_skfold(X_train, self.y_train, bayes_opt=True)
                impt.append([col, valid_avg_score])
        impt.sort(key=lambda lst: lst[1])
        del col, X_train, valid_avg_score
        gc.collect()
        return impt

### Null Importance Selection

In [29]:
def _get_lgb_fimp(
    params,
    X_train,
    y_train,
    features,
    shuffle,
    seed=42,
    categorical=[]
):
    # Shuffle target if required
    y = y_train.copy()
    if shuffle:
        random.seed(seed)
        np.random.seed(seed)
        y = y_train.copy().sample(frac=1.0)
    
    arg_categorical = categorical if len(categorical) > 0 else 'auto'
    dtrain = lgb.Dataset(X_train[features],
                         label=y.values,
                         categorical_feature=arg_categorical)
    
    # Fit the model
    clf = lgb.train(params, dtrain)

    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df['feature'] = features
    imp_df['split'] = clf.feature_importance(importance_type='split')
    imp_df['gain'] = clf.feature_importance(importance_type='gain')
    
    return imp_df


def null_importance_selection(
    params,
    X_train,
    y_train,
    features,
    seed=42,
    categorical=[],
    num_actual_run=1,
    num_null_run=40,
    eps=1e-10,
    valid_percentile=75,
):
    actual_imp_df = pd.DataFrame()
    
    np.random.seed(seed)
    for i in tqdm(range(num_actual_run)):
        seed = np.random.randint(1000)
        imp_df = _get_lgb_fimp(params,
                               X_train,
                               y_train,
                               features,
                               shuffle=False,
                               seed=seed,
                               categorical=categorical)
        imp_df['run'] = i
        actual_imp_df = pd.concat([actual_imp_df, imp_df], axis=0)
    
    null_imp_df = pd.DataFrame()
    
    np.random.seed(seed)
    for i in tqdm(range(num_null_run)):
        seed = np.random.randint(1000)
        imp_df = _get_lgb_fimp(params,
                               X_train,
                               y_train,
                               features,
                               shuffle=True,
                               seed=seed,
                               categorical=categorical)
        imp_df['run'] = i
        null_imp_df = pd.concat([null_imp_df, imp_df], axis=0)
    
    feature_scores = []
    
    for _f in actual_imp_df['feature'].unique():
        # importance gain of gain
        act_fimp_split = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'split'].mean()
        null_fimp_split = null_imp_df.loc[null_imp_df['feature'] == _f, 'split'].values
        split_score = np.log(eps + act_fimp_split / (1 + np.percentile(null_fimp_split, valid_percentile)))
        
        # importance gain of gain
        act_fimp_gain = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'gain'].mean()
        null_fimp_gain = null_imp_df.loc[null_imp_df['feature'] == _f, 'gain'].values
        gain_score = np.log(eps + act_fimp_gain / (1 + np.percentile(null_fimp_gain, valid_percentile)))

        feature_scores.append((_f, split_score, gain_score))
    
    scores_df = pd.DataFrame(feature_scores, columns=['feature', 'split_score', 'gain_score'])
    return scores_df

### Consider Multicollinearity

In [30]:
def extract_high_corr_columns(df, threshold=0.99, verbose=True):
    df_corr = abs(df.corr())
    delete_columns = []
    
    # diagonal values filled by zero
    for i in range(0, len(df_corr.columns)):
        df_corr.iloc[i, i] = 0
    
    # loop as removing high-correlated columns in df_corr
    while True:
        df_max_column_value = df_corr.max()
        max_corr = df_max_column_value.max()
        query_column = df_max_column_value.idxmax()
        target_column = df_corr[query_column].idxmax()
        
        if max_corr < threshold:
            break
        else:
            # drop feature which is highly correlated with others 
            if sum(df_corr[query_column]) <= sum(df_corr[target_column]):
                delete_column = target_column
                saved_column = query_column
            else:
                delete_column = query_column
                saved_column = target_column
            
            df_corr.drop([delete_column], axis=0, inplace=True)
            df_corr.drop([delete_column], axis=1, inplace=True)
            delete_columns.append(delete_column)
            
            if verbose:
                printl('{}: Drop: {} <- Query: {}, Corr: {:.5f}'.format(
                    len(delete_columns), delete_column, saved_column, max_corr
                ))

    return delete_columns