In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
import seaborn as sns
import datetime
import scipy.stats
import warnings
import gc
from colorama import Fore, Back, Style

from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibrationDisplay
from lightgbm import LGBMClassifier, log_evaluation
from catboost import CatBoostRegressor

import optuna
from optuna.samplers import TPESampler


# Master switch for the test-set path: when True the notebook also reads the
# test data, predicts it with every fold's model and writes the submission
# file; when False only the training data is loaded and cross-validated.
INFERENCE = True # set to False if you only want to cross-validate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
%%time
# Columns that are averaged over all statements of a customer; the loading
# loop below renames each resulting column to "<name>_avg".
features_avg = ['B_1', 'B_11', 'B_16', 'B_17', 'B_18', 'B_2', 'B_20',
                'B_28', 'B_3', 'B_4', 'B_5', 'B_7', 'B_9', 'D_112',
                'D_121', 'D_141', 'D_39', 'D_41', 'D_42', 'D_43',
                'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 
                'D_50', 'D_51', 'D_53', 'D_54', 'D_56', 'D_58', 
                'D_59', 'D_60', 'D_91', 'P_2', 'P_3', 'R_1', 'R_2', 
                'R_27', 'R_3', 'R_7', 'S_11', 'S_26', 'S_3', 'S_5']
# Columns whose value is taken from the last row of every customer; the
# loading loop below renames each of them to "<name>_last".
features_last = ['B_1', 'B_10', 'B_11', 'B_12', 'B_13', 'B_15', 'B_16',
                 'B_17', 'B_18', 'B_19', 'B_2', 'B_20', 'B_22', 'B_23',
                 'B_24', 'B_25', 'B_26', 'B_27', 'B_28', 'B_29', 'B_3',
                 'B_32', 'B_33', 'B_36', 'B_38', 'B_39', 'B_4', 'B_40',
                 'B_41', 'B_42', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9',
                 'D_102', 'D_103', 'D_105', 'D_106', 'D_107', 'D_109',
                 'D_112', 'D_115', 'D_117', 'D_118', 'D_119', 'D_120',
                 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127', 
                 'D_129', 'D_132', 'D_133', 'D_135', 'D_136', 'D_137', 
                 'D_140', 'D_141', 'D_143', 'D_145', 'D_39', 'D_41',
                 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48',
                 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54', 'D_55',
                 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_63',
                 'D_64', 'D_66', 'D_70', 'D_72', 'D_73', 'D_74', 'D_75',
                 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_82', 'D_83',
                 'D_84', 'D_86', 'D_91', 'D_92', 'D_93', 'D_94', 'D_96',
                 'P_2', 'P_3', 'R_1', 'R_10', 'R_11', 'R_12', 'R_13',
                 'R_14', 'R_15', 'R_17', 'R_18', 'R_19', 'R_2', 'R_20', 
                 'R_21', 'R_22', 'R_24', 'R_25', 'R_26', 'R_27', 'R_3',
                 'R_4', 'R_5', 'R_7', 'R_8', 'R_9', 'S_11', 'S_12',
                 'S_13', 'S_15', 'S_17', 'S_20', 'S_22', 'S_23', 
                 'S_24', 'S_25', 'S_26', 'S_27', 'S_3', 'S_5', 'S_6',
                 'S_7', 'S_8', 'S_9']
                 

# Build one row per customer for train (and, if INFERENCE, test): the "_last"
# columns come from the customer's final row, the "_avg" columns are the mean
# over all of the customer's rows. Statement order and the gc.collect()/del
# calls are deliberate: they keep peak memory down while the raw statement
# frames are alive.
train_test = [None, None] # first element is train, second element is test
# With INFERENCE the test file (index 1) is processed before the train file
# (index 0); without it only the train file is read and train_test[1] stays None.
for i in [1, 0] if INFERENCE else [0]:
    train_test[i] = pd.read_feather(['dataset/train_data.ftr',
                                     'dataset/test_data.ftr'][i])
    # pop() removes customer_ID from the frame, so it is never used as a feature.
    cid = pd.Categorical(train_test[i].pop('customer_ID'), ordered=True)
    # A row is flagged when the next row belongs to a different customer
    # (np.roll wraps, so the final row is compared with the first one).
    # NOTE(review): assumes each customer's rows are contiguous and that the
    # final one is the most recent statement - confirm against the data files.
    last = (cid != np.roll(cid, -1)) # mask for last statement of every customer
    if i == 0: # train
        # One label per customer, taken from the flagged row. It keeps the raw
        # row index of the file (not the customer ID).
        target = train_test[0].loc[last, 'target']
    gc.collect()
    print('Read', i)
    # Per-customer mean of the features_avg columns, indexed by customer ID.
    df_avg = (train_test[i][features_avg]
              .groupby(cid)
              .mean()
              .rename(columns={f: f"{f}_avg" for f in features_avg})
             )
    gc.collect()
    print('Computed avg', i)
    # Keep only the flagged rows and the features_last columns (this drops every
    # other column, including 'target'), then index the rows by customer ID so
    # they can be aligned with df_avg.
    train_test[i] = (train_test[i].loc[last, features_last]
                     .rename(columns={f: f"{f}_last" for f in features_last})
                     .set_index(np.asarray(cid[last]))
                    )
    gc.collect()
    print('Computed last', i)
    # Column-wise concat aligns the two frames on their customer-ID index.
    # NOTE(review): `target` is later matched to `train` by position only;
    # presumably the concat keeps the row order of the "_last" frame - verify.
    train_test[i] = pd.concat([train_test[i], df_avg], axis=1)
    del df_avg, cid, last

# `test` is None when INFERENCE is False, which is why the shape printout
# below is guarded by the same flag.
train, test = tuple(train_test)
del train_test
if INFERENCE: print('Shapes:', train.shape, target.shape, test.shape)

Read 1
Computed avg 1
Computed last 1
Read 0
Computed avg 0
Computed last 0
Shapes: (458913, 197) (458913,) (924621, 197)
CPU times: user 31.4 s, sys: 11.2 s, total: 42.6 s
Wall time: 21 s


In [5]:
def amex_metric(y_true, y_pred, return_components=False) -> float:
    """Amex competition metric for array-like inputs.

    Rows are ranked by descending prediction. Every row with ``target == 0``
    gets weight 20, every other row weight 1. The score is the mean of:

    * ``g`` - the weighted Gini of the predicted ranking divided by the
      weighted Gini of a perfect ranking (rows sorted by the true target);
    * ``d`` - the share of all positives found among the top-ranked rows
      whose cumulative weight stays within 4 % of the total weight.

    Parameters
    ----------
    y_true : array-like
        Binary labels. Anything ``np.asarray`` accepts (ndarray, list,
        pandas Series); it is flattened to 1-D and any pandas index is ignored.
    y_pred : array-like
        Scores of the same length as ``y_true``; higher means "more likely
        positive".
    return_components : bool, default False
        If True, return the tuple ``(g, d, 0.5 * (g + d))`` instead of only
        the final score.

    Returns
    -------
    float or tuple of float
        ``0.5 * (g + d)``, or ``(g, d, 0.5 * (g + d))`` when
        ``return_components`` is True.

    Raises
    ------
    ValueError
        Raised by the DataFrame constructor when the two inputs differ in
        length.
    """
    def row_weights(df):
        # Vectorized form of "20 if target == 0 else 1" for every row.
        return pd.Series(np.where(df['target'] == 0, 20, 1), index=df.index)

    def top_four_percent_captured(df) -> float:
        """Corresponds to the recall for a threshold of 4 %"""
        weight = row_weights(df)
        four_pct_cutoff = int(0.04 * weight.sum())
        within_cutoff = weight.cumsum() <= four_pct_cutoff
        is_positive = df['target'] == 1
        return (is_positive & within_cutoff).sum() / is_positive.sum()

    def weighted_gini(df) -> float:
        """Weighted area between the Lorentz curve of `df`'s row order and the diagonal."""
        weight = row_weights(df)
        random = (weight / weight.sum()).cumsum()
        weighted_pos = df['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    def normalized_weighted_gini(df) -> float:
        """Weighted Gini of `df` relative to that of a perfect ranking."""
        # Using the target itself as the "prediction" yields the best
        # achievable ordering, which serves as the normalizer.
        perfect = pd.DataFrame({'target': df['target'], 'prediction': df['target']})
        perfect = perfect.sort_values('prediction', ascending=False)
        return weighted_gini(df) / weighted_gini(perfect)

    # np.asarray makes lists and pandas Series work too; the original relied
    # on the inputs having a .ravel() method of their own.
    df = pd.DataFrame({'target': np.asarray(y_true).ravel(),
                       'prediction': np.asarray(y_pred).ravel()})
    df = df.sort_values('prediction', ascending=False)
    g = normalized_weighted_gini(df)
    d = top_four_percent_captured(df)
    score = 0.5 * (g + d)

    if return_components:
        return g, d, score
    return score

def lgb_amex_metric(y_true, y_pred):
    """Adapter that exposes amex_metric as a custom LightGBM eval metric.

    Returns a 3-tuple: the metric name 'amex', the metric value, and a flag
    set to True (the score is to be treated as higher-is-better).
    """
    metric_value = amex_metric(y_true, y_pred)
    higher_is_better = True
    return 'amex', metric_value, higher_is_better

In [6]:
%%time

# When True, the cross-validation loop stops after its first fold.
ONLY_FIRST_FOLD = False

# Model inputs: every column of `train` except the identifier and the label.
features = [f for f in train.columns if f not in ('customer_ID', 'target')]

def my_booster():
    """Return a new, unfitted LGBMClassifier with this notebook's fixed hyperparameters."""
    # NOTE(review): per the LightGBM parameter docs, bagging_fraction presumably
    # has no effect unless bagging_freq is set to a non-zero value - confirm
    # whether row subsampling was actually intended here.
    return LGBMClassifier(
        learning_rate=0.02410649590217162,
        n_estimators=1332,
        max_depth=16,
        min_child_samples=3551,
        max_bin=582,
        num_leaves=929,
        random_state=42,
        feature_fraction=0.3,
        bagging_fraction=0.3,
    )

print(f'{len(features)} features**')

# Per-fold validation scores and, when INFERENCE is on, per-fold test predictions.
score_list, y_pred_list = [], []

kf = StratifiedKFold(n_splits=5)

for fold, (idx_tr, idx_va) in enumerate(kf.split(train, target)):
    start_time = datetime.datetime.now()

    # Positional split of features and labels. These four names stay at module
    # scope on purpose: the Optuna objective further down reads them.
    X_tr, y_tr = train.iloc[idx_tr][features], target.iloc[idx_tr]
    X_va, y_va = train.iloc[idx_va][features], target.iloc[idx_va]

    model = my_booster()

    # UserWarnings raised while fitting are silenced; the Amex metric is logged
    # for both the training and the validation part every 100 iterations.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_tr, y_tr), (X_va, y_va)],
            eval_metric=[lgb_amex_metric],
            callbacks=[log_evaluation(100)],
        )

    # Score the held-out part on the positive-class probability.
    y_va_pred = model.predict_proba(X_va)[:, 1]
    score = amex_metric(y_va.values, y_va_pred)
    score_list.append(score)

    # best_iteration_ may be None; report the configured tree count then.
    n_trees = model.best_iteration_
    if n_trees is None:
        n_trees = model.n_estimators

    print(f"{Fore.GREEN}{Style.BRIGHT}Fold {fold} | {str(datetime.datetime.now() - start_time)[-12:-7]} |"
          f" {n_trees:5} trees |"
          f"                Score = {score:.5f}{Style.RESET_ALL}")

    if INFERENCE:
        # Each fold's model predicts the full test set; the folds are averaged later.
        y_pred_list.append(model.predict_proba(test[features])[:, 1])

    if ONLY_FIRST_FOLD:
        break  # we only want the first fold

print(f"{Fore.GREEN}{Style.BRIGHT}OOF Score:                       {np.mean(score_list):.5f}{Style.RESET_ALL}")

197 features**
[100]	training's binary_logloss: 0.252131	training's amex: 0.768481	valid_1's binary_logloss: 0.256963	valid_1's amex: 0.761043
[200]	training's binary_logloss: 0.223666	training's amex: 0.786201	valid_1's binary_logloss: 0.232191	valid_1's amex: 0.775342
[300]	training's binary_logloss: 0.2137	training's amex: 0.798824	valid_1's binary_logloss: 0.225941	valid_1's amex: 0.782492
[400]	training's binary_logloss: 0.207649	training's amex: 0.807022	valid_1's binary_logloss: 0.223329	valid_1's amex: 0.786702
[500]	training's binary_logloss: 0.202914	training's amex: 0.813967	valid_1's binary_logloss: 0.221952	valid_1's amex: 0.788029
[600]	training's binary_logloss: 0.198764	training's amex: 0.820531	valid_1's binary_logloss: 0.221152	valid_1's amex: 0.78903
[700]	training's binary_logloss: 0.195027	training's amex: 0.826572	valid_1's binary_logloss: 0.220631	valid_1's amex: 0.789325
[800]	training's binary_logloss: 0.191415	training's amex: 0.832154	valid_1's binary_logloss

In [23]:
def objective(trial):
    """Optuna objective: fit one LGBMClassifier and return its validation Amex score.

    The hyperparameters are sampled from `trial`. The data is NOT passed in:
    X_tr, y_tr, X_va and y_va are read from module scope, i.e. they are
    whatever the cross-validation loop left behind after its final iteration,
    so that cell must have been run first.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        amex_metric on (X_va, y_va); higher is better.
    """
    param = {
        "random_state": 42,
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 500, 2500),
        "max_depth": trial.suggest_int("max_depth", 6, 16),
        "min_child_samples": trial.suggest_int("min_child_samples", 2000, 4000),
        "max_bin": trial.suggest_int("max_bin", 300, 600),
        'num_leaves': trial.suggest_int("num_leaves", 127, 1023),
        'boosting_type': trial.suggest_categorical('boosting_type', ['dart', 'gbdt']),
    }

    lgbm = LGBMClassifier(**param)
    lgbm.fit(X_tr, y_tr,
             eval_set=[(X_tr, y_tr), (X_va, y_va)],
             eval_metric=[lgb_amex_metric],
             callbacks=[log_evaluation(100)])

    # The Amex metric ranks customers by score, so it needs the positive-class
    # probability. predict() would return hard 0/1 labels, which collapses the
    # ranking into two tied groups.
    preds = lgbm.predict_proba(X_va)[:, 1]
    # Hand the metric a plain ndarray, as the cross-validation loop does.
    score = amex_metric(np.asarray(y_va), preds)

    return score

In [None]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'lgbm_parameter_opt',
    # The objective returns the Amex score, which is higher-is-better
    # (lgb_amex_metric reports it to LightGBM with that flag set), so the study
    # has to maximize it. 'minimize' would select the worst trial.
    direction = 'maximize',
    sampler = sampler,
)

study.optimize(objective, n_trials=10)
print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

Inference

In [7]:
if INFERENCE:
    # Average the per-fold test predictions element-wise and write one row per
    # test customer (the index of `test` holds the customer IDs).
    blended_prediction = np.mean(y_pred_list, axis=0)
    sub = pd.DataFrame({'customer_ID': test.index,
                        'prediction': blended_prediction})
    sub.to_csv('results/submission.csv', index=False)