In [None]:
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from itertools import combinations

In [None]:
# ====================================================
# Amex metric
# ====================================================
def amex_metric(y_true, y_pred):
    """Return the Amex competition score for binary labels and predicted scores.

    The score is the mean of two parts, both computed with label-0 rows weighted 20
    and all other rows weighted 1:
      * the share of all positives found among the highest-scored rows whose
        cumulative weight stays within 4% of the total weight, and
      * the weighted Gini of the predictions divided by the weighted Gini obtained
        when rows are ordered by the true label.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : array-like of scores, same length as `y_true`.

    Returns
    -------
    float
    """
    # Column 0 holds the label, column 1 the prediction.
    pairs = np.array([y_true, y_pred]).T

    def _ordered_desc(column):
        # Rows sorted by the requested column, largest value first.
        return pairs[pairs[:, column].argsort()[::-1]]

    def _row_weights(rows):
        # Negatives count 20x.
        return np.where(rows[:, 0] == 0, 20, 1)

    def _weighted_gini(column):
        rows = _ordered_desc(column)
        w = _row_weights(rows)
        random_curve = np.cumsum(w / np.sum(w))
        weighted_pos = rows[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_curve) * w)

    # Capture rate inside the top 4% (by cumulative weight) of the ranking.
    ranked = _ordered_desc(1)
    ranked_w = _row_weights(ranked)
    budget = int(0.04 * np.sum(ranked_w))
    captured = ranked[np.cumsum(ranked_w) <= budget]
    top_four = np.sum(captured[:, 0]) / np.sum(ranked[:, 0])

    # Gini of the model ranking, normalised by the Gini of the label ranking.
    normalised_gini = _weighted_gini(1) / _weighted_gini(0)
    return 0.5 * (normalised_gini + top_four)

# ====================================================
# LGBM amex metric
# ====================================================
def lgb_amex_metric(y_pred, y_true):
    """Adapter that exposes `amex_metric` as a LightGBM custom evaluation function.

    `y_pred` holds the predicted scores; `y_true` is the evaluation dataset object,
    whose labels are read through `get_label()`.

    Returns the triple (metric name, metric value, higher-is-better flag).
    """
    labels = y_true.get_label()
    score = amex_metric(labels, y_pred)
    return 'amex_metric', score, True

# **Train LightGBM model**

In [None]:
# Load train dataset
# The data directory can be overridden with the AMEX_DATA_DIR environment variable so the
# notebook is not tied to one machine; when it is unset the original location is used.
data_dir = os.environ.get('AMEX_DATA_DIR', 'C:\\Users\\16122\\AMEX Kaggle Competition')
train = pd.read_parquet(os.path.join(data_dir, 'train_newnn_fe.parquet'))

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [None]:
# Every column except the customer identifier and the 'S_2' column.
cols = [name for name in train.columns if name not in ('customer_ID', 'S_2')]

In [None]:
# Shrink 64-bit integer columns to save memory.
# A blind astype('int8') can silently wrap any value outside [-128, 127]; pd.to_numeric
# with downcast='integer' instead picks the smallest signed integer type (int8 at best)
# that can actually hold the column's values.
for col in cols:
    if train[col].dtype == 'int64':
        train[col] = pd.to_numeric(train[col], downcast='integer')

In [None]:
# Label encode categorical features
# The categorical inputs are the "_last" variants of these base columns.
cat_features = [
    f"{base}_last"
    for base in ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
]
# Each column gets its own encoder, fitted on that column's values in `train`.
for cat_col in cat_features:
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(train[cat_col])
    train[cat_col] = encoded
del encoded

In [None]:
# Round numerical features to two decimal places
# Only float32 columns are touched; every other dtype is skipped.
for col in list(train.columns):
    if train[col].dtype != 'float32':
        continue
    train[col] = train[col].round(2)

In [None]:
# Preview the frame again after the downcast, label-encoding and rounding steps.
train.head()

In [None]:
# ====================================================
# Train & Evaluate
# ====================================================
def train_and_evaluate(train, features, cat_features, n_splits = 5, seed = 42):
    """Train one LightGBM (DART) model per stratified fold and report the Amex metric.

    For every fold the fitted model is dumped with joblib into the working directory,
    its validation predictions are stored as out-of-fold (OOF) predictions and the fold
    score is printed. After the last fold the overall OOF score is printed and the OOF
    predictions are written to a CSV file in the working directory.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns listed in `features` plus 'customer_ID' and 'target'.
    features : list of str
        Columns used as model inputs.
    cat_features : list of str
        Columns handed to LightGBM as categorical features.
    n_splits : int, default 5
        Number of stratified folds.
    seed : int, default 42
        Seed used for the fold split and for LightGBM; it is also part of the file names.

    Returns
    -------
    None
    """
    params = {
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting': 'dart',
        'seed': seed,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.3,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 5,
        'min_data_in_leaf': 40
        }
    target = train['target']
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train))
    # CV and start training
    kfold = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
    # The splitter only needs the number of rows and the labels, so a cheap placeholder
    # is passed as X instead of materialising a copy of the whole feature frame.
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(np.zeros(len(train)), target)):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = target.iloc[trn_ind], target.iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)
        # The `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train are
        # no longer accepted by recent LightGBM releases; the callbacks below request the
        # same behaviour (stop after 100 rounds without improvement, log every 500).
        # NOTE(review): LightGBM reportedly disables early stopping when boosting is
        # 'dart', in which case all 9500 rounds are trained - confirm.
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 9500,
            valid_sets = [lgb_train, lgb_valid],
            feval = lgb_amex_metric,
            callbacks = [lgb.early_stopping(100), lgb.log_evaluation(500)],
            )
        # Save model
        joblib.dump(model, f'lgbm_fold{fold}_seed{seed}_l2_5_fe{len(features)}_round2_9500_new.pkl')
        # Predict validation
        val_pred = model.predict(x_val)
        # Add to out of folds array
        oof_predictions[val_ind] = val_pred
        # Compute fold metric
        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')
        # Release the fold's data before the next fold allocates its own copies
        del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
        gc.collect()
    # Compute out of folds metric
    score = amex_metric(target, oof_predictions)
    print(f'Our out of folds CV score is {score}')
    # Create a dataframe to save predictions of each validation set
    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': target, 'prediction': oof_predictions})
    oof_df.to_csv(f'oof_lgbm_baseline_{n_splits}fold_seed{seed}_l2_5_fe{len(features)}_round2_9500_new.csv', index = False)

In [None]:
%%time
train_and_evaluate(train, features, cat)

In [None]:
# Drop the training frame and force a garbage-collection pass to release its memory.
# NOTE: `train` no longer exists after this cell, so nothing below may rely on it.
del train
gc.collect()

# **Feature selection**

I did not select features in my final submission. The main goal of computing feature importance is to see whether we can come up with new ideas for creating new features.

In [None]:
# Collect the per-fold feature importances of the saved models and their average.
fold = 5
imp_df = None
ave = 0
for i in range(fold):
    # NOTE: joblib.load unpickles the file - only load model files you produced yourself.
    model = joblib.load(f'lgbm_fold{i}_seed42_l2_5_fe1258_round2_9500_P2B4diff.pkl')
    importance = model.feature_importance()
    if imp_df is None:
        # `train` has already been deleted at this point, so the feature names are read
        # from the model itself; this also keeps names and importances in the same order.
        features = model.feature_name()
        imp_df = pd.DataFrame({'Feature': features})
    imp_df[f'fea_imp{i}'] = importance
    ave = ave + importance
# The average only makes sense once every fold has been added, so compute it once here.
imp_df['overall average'] = ave / fold

In [None]:
# Order the table from the most to the least important feature.
imp_df = imp_df.sort_values('overall average', ascending=False)

In [None]:
# Show the names of all features whose average importance is above zero.
imp_df.loc[imp_df['overall average'] > 0, 'Feature'].to_list()

In [None]:
# Keep only the features with a positive average importance as the model inputs.
features = imp_df.loc[imp_df['overall average'] > 0, 'Feature'].to_list()

# **Prediction and submission**

In [None]:
# Load test dataset
# The data directory can be overridden with the AMEX_DATA_DIR environment variable so the
# notebook is not tied to one machine; when it is unset the original location is used.
test_data_dir = os.environ.get('AMEX_DATA_DIR', "C:\\Users\\16122\\AMEX Kaggle Competition")
test = pd.read_parquet(os.path.join(test_data_dir, "test_newnn_fe.parquet"))

In [None]:
# Label encode categorical features
# The categorical inputs are the "_last" variants of these base columns.
cat_features = [
    f"{base}_last"
    for base in ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
]
# NOTE(review): a brand-new encoder is fitted on the test values here, so the integer
# codes match the ones used for training only if both frames contain the same set of
# values in each column - confirm.
for cat_col in cat_features:
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(test[cat_col])
    test[cat_col] = encoded
del encoded

In [None]:
# Round numerical features to two decimal places
# Only float32 columns are touched; every other dtype is skipped.
for col in list(test.columns):
    if test[col].dtype != 'float32':
        continue
    test[col] = test[col].round(2)

In [None]:
# ====================================================
# Predict & Test
# ====================================================
def predict_test(test, features, n_folds = 5, seed = 42, model_tag = 'round2_selected_10500'):
    """Average the predictions of the saved fold models and write them to a CSV file.

    One model file per fold is loaded from the working directory, each model predicts
    on `test[features]`, and the mean prediction is saved together with 'customer_ID'.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain the columns listed in `features` plus 'customer_ID'.
    features : list of str
        Model input columns; their count is also part of the file names.
    n_folds : int, default 5
        Number of fold models to load and average.
    seed : int, default 42
        Seed encoded in the model and output file names.
    model_tag : str, default 'round2_selected_10500'
        Suffix that identifies the training run in the model and output file names.

    Returns
    -------
    None
    """
    # Create a numpy array to store test predictions
    test_predictions = np.zeros(len(test))
    # The column selection does not change between folds, so do it once.
    x_test = test[features]
    for fold in range(n_folds):
        # Predict the test set
        print(f'=========== Fold {fold} is predicting ===========')
        # NOTE: joblib.load unpickles the file - only load model files you produced yourself.
        model = joblib.load(f'lgbm_fold{fold}_seed{seed}_l2_5_fe{len(features)}_{model_tag}.pkl')
        test_pred = model.predict(x_test)
        test_predictions += test_pred / n_folds
    # Create a dataframe to store test prediction
    test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
    test_df.to_csv(f'test_lgbm_{n_folds}folds_seed{seed}_l2_5_fe{len(features)}_{model_tag}.csv', index = False)

In [None]:
# `predict_test` requires the feature list as its second argument; calling it with only
# the frame raised a TypeError.
predict_test(test, features)

In [None]:
# Drop the test frame and force a garbage-collection pass to release its memory.
del test
gc.collect()