# exp41

CatBoost trained on the lag_diff features.


https://www.kaggle.com/code/ragnar123/amex-lgbm-dart-cv-0-7977

In [4]:
# ====================================================
# Library
# ====================================================
import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import lightgbm as lgb
import xgboost as xgb


In [5]:

# ====================================================
# Get the difference
# ====================================================
def get_difference(data, num_features):
    """Return each customer's last first-order difference of the numeric features.

    For every ``customer_ID`` group in ``data`` (rows kept in the order they
    already have) the difference between the group's final row and the row
    before it is taken for each column in ``num_features``.  A customer with
    a single row gets NaN values, because ``diff(1)`` has nothing to subtract.

    Args:
        data: DataFrame holding a ``customer_ID`` column plus ``num_features``.
        num_features: names of the numeric columns to difference.

    Returns:
        DataFrame with one float32 ``<feature>_diff1`` column per feature and
        a ``customer_ID`` column; one row per customer, in the key order
        produced by ``groupby`` (sorted by default).
    """
    num_features = list(num_features)
    # Derive the output names from the argument rather than from the loop
    # variable, so the function also works when `data` has no rows.
    diff_columns = [col + '_diff1' for col in num_features]
    last_diffs = []
    customer_ids = []
    # Group by the bare column name: grouping by the list ['customer_ID']
    # makes pandas >= 2.0 yield 1-tuples as keys, and a column of tuples
    # would silently fail to match in the later merges on customer_ID.
    for customer_id, group in tqdm(data.groupby('customer_ID')):
        # Keep only the final difference of the group (shape: 1 x n_features)
        last_diffs.append(group[num_features].diff(1).iloc[[-1]].values.astype(np.float32))
        customer_ids.append(customer_id)
    if last_diffs:
        values = np.concatenate(last_diffs, axis = 0)
    else:
        # np.concatenate rejects an empty list; return a well-formed empty frame instead
        values = np.empty((0, len(num_features)), dtype = np.float32)
    result = pd.DataFrame(values, columns = diff_columns)
    result['customer_ID'] = customer_ids
    return result

# ====================================================
# Read & preprocess data and save it to disk
# ====================================================
def _downcast_columns(frame, from_dtype, to_dtype):
    """Cast, in place, every column of ``frame`` whose dtype is ``from_dtype`` to ``to_dtype``."""
    cols = list(frame.dtypes[frame.dtypes == from_dtype].index)
    for col in tqdm(cols):
        frame[col] = frame[col].astype(to_dtype)


def _engineer_features(raw, num_features, cat_features):
    """Aggregate ``raw`` rows into one feature row per ``customer_ID``.

    Combines numeric aggregates, categorical aggregates and the last
    first-order difference of the numeric columns via inner merges.
    """
    num_agg = raw.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    # Flatten the (feature, statistic) MultiIndex into "<feature>_<statistic>"
    num_agg.columns = ['_'.join(x) for x in num_agg.columns]
    num_agg.reset_index(inplace = True)
    cat_agg = raw.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
    cat_agg.columns = ['_'.join(x) for x in cat_agg.columns]
    cat_agg.reset_index(inplace = True)
    # Halve the memory footprint of the aggregate tables
    _downcast_columns(num_agg, 'float64', np.float32)
    _downcast_columns(cat_agg, 'int64', np.int32)
    diff = get_difference(raw, num_features)
    return num_agg.merge(cat_agg, how = 'inner', on = 'customer_ID').merge(diff, how = 'inner', on = 'customer_ID')


def read_preprocess_data(
    train_path = '/content/data/train.parquet',
    labels_path = '../input/amex-default-prediction/train_labels.csv',
    test_path = '../input/amex-fe/test_fe.parquet',
    train_out_path = '../input/amex-fe/train_fe.parquet',
    test_out_path = '../input/amex-fe/test_fe.parquet',
):
    """Build the train/test feature tables and write them to parquet.

    Every column of the train file except ``customer_ID`` and ``S_2`` is
    treated as a feature; the eleven columns listed below are categorical and
    the rest numeric.  Train and test go through the same aggregation; the
    train table is additionally inner-merged with the labels file.

    Args:
        train_path: parquet file with the raw training rows.
        labels_path: CSV with the training labels, keyed by ``customer_ID``.
        test_path: parquet file with the raw test rows.
        train_out_path: destination parquet for the engineered train table.
        test_out_path: destination parquet for the engineered test table.

    Returns:
        None.  The results are only written to disk.

    The defaults reproduce the previously hard-coded locations, so calling
    the function without arguments behaves as before.
    NOTE(review): the default ``test_path`` is the same file as the default
    ``test_out_path``, i.e. the engineered output is read as raw input and
    then overwritten -- this looks unintended; confirm the raw test location.
    """
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    raw_train = pd.read_parquet(train_path)
    features = raw_train.drop(['customer_ID', 'S_2'], axis = 1).columns.to_list()
    num_features = [col for col in features if col not in cat_features]

    print('Starting training feature engineer...')
    train = _engineer_features(raw_train, num_features, cat_features)
    train_labels = pd.read_csv(labels_path)
    train = train.merge(train_labels, how = 'inner', on = 'customer_ID')
    # Release the raw rows before the test set is loaded
    del raw_train, train_labels
    gc.collect()

    raw_test = pd.read_parquet(test_path)
    print('Starting test feature engineer...')
    test = _engineer_features(raw_test, num_features, cat_features)
    del raw_test
    gc.collect()

    # Save files to disk
    train.to_parquet(train_out_path)
    test.to_parquet(test_out_path)

# Read & Preprocess Data
# Runs the whole feature-engineering pass; the function returns nothing and
# its only outputs are the parquet files it writes.
read_preprocess_data()

FileNotFoundError: [Errno 2] No such file or directory: '/content/data/train.parquet'

# Training & Inference

In [6]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from itertools import combinations

import pickle

# ====================================================
# Configurations
# ====================================================
class CFG:
    """Static settings shared by the training / inference cells."""

    # --- experiment identity (used in output file names) ---
    ver = "exp47"
    model = "cat"
    seed = 42

    # --- data locations ---
    # alternative feature directory tried earlier: '../feature/exp35_lagdiff/'
    input_dir = '../feature/exp03_amex-fe/'
    output_dir = '../output/exp47_optuna/'

    # --- cross-validation setup ---
    n_folds = 5
    target = 'target'

    # --- LightGBM-style settings ---
    # `boosting_type` still appears in the pickle file name written by the
    # training loop; `metric` is only referenced from commented-out code.
    boosting_type = 'dart'
    metric = 'binary_logloss'

# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    Args:
        seed: integer seed applied to ``random``, ``numpy.random`` and, as a
            string, to the ``PYTHONHASHSEED`` environment variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

# ====================================================
# Read data
# ====================================================
# def read_data():
#     train = pd.read_parquet(CFG.input_dir + 'train_diff.parquet')
#     test = pd.read_parquet(CFG.input_dir + 'test_diff.parquet')
#     return train, test

def read_data():
    """Load the train and test feature tables from ``CFG.input_dir``.

    Returns:
        Tuple ``(train, test)`` of DataFrames read from the two
        ``*_fe_plus_plus.parquet`` files.
    """
    file_names = ('train_fe_plus_plus.parquet', 'test_fe_plus_plus.parquet')
    train, test = (pd.read_parquet(CFG.input_dir + name) for name in file_names)
    return train, test

# ====================================================
# Amex metric
# ====================================================
def amex_metric(y_true, y_pred):
    """Compute the competition metric: mean of normalized Gini and top-4% capture.

    Rows with label 0 carry weight 20, all other rows weight 1.

    Args:
        y_true: array-like of labels.
        y_pred: array-like of scores, same length as ``y_true``.

    Returns:
        ``0.5 * (gini_by_prediction / gini_by_label + top_four_capture)``.
    """
    def _ranked(sort_col):
        # Two-column array [label, score], rows ordered descending by `sort_col`
        pairs = np.array([y_true, y_pred]).T
        return pairs[pairs[:, sort_col].argsort()[::-1]]

    def _row_weights(ranked):
        return np.where(ranked[:, 0] == 0, 20, 1)

    def _weighted_gini(sort_col):
        ranked = _ranked(sort_col)
        w = _row_weights(ranked)
        random_curve = np.cumsum(w / np.sum(w))
        weighted_pos = ranked[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_curve) * w)

    # Share of all positives found within the top 4% of total weight
    by_pred = _ranked(1)
    w_pred = _row_weights(by_pred)
    within_cut = np.cumsum(w_pred) <= int(0.04 * np.sum(w_pred))
    top_four = np.sum(by_pred[within_cut][:, 0]) / np.sum(by_pred[:, 0])

    # Gini of the model ranking, normalized by the Gini of a perfect ranking
    gini_model = _weighted_gini(1)
    gini_perfect = _weighted_gini(0)
    return 0.5 * (gini_model / gini_perfect + top_four)

# ====================================================
# LGBM amex metric
# ====================================================
def lgb_amex_metric(y_pred, y_true):
    """Adapter exposing ``amex_metric`` in LightGBM's custom-eval format.

    Args:
        y_pred: predicted scores.
        y_true: dataset object exposing ``get_label()``.

    Returns:
        Tuple ``('amex_metric', score, True)``; the trailing ``True`` is the
        higher-is-better flag of LightGBM's ``feval`` protocol.
    """
    labels = y_true.get_label()
    score = amex_metric(labels, y_pred)
    return 'amex_metric', score, True


In [7]:
# Fix the RNG state before anything else in this run
seed_everything(CFG.seed)

# Feature tables used for this experiment (exp38 lag-diff + c3 set)
train, test = (
    pd.read_parquet('../feature/exp38_lagdiff_c3/train_lagdiff_c3.parquet'),
    pd.read_parquet('../feature/exp38_lagdiff_c3/test_lagdiff_c3.parquet'),
)

# train = pd.read_parquet('../feature/exp35_lagdiff/train_lagdiff.parquet')
# test = pd.read_parquet('../feature/exp35_lagdiff/test_lagdiff.parquet')

# # train, test = read_data()

# train_c3 = pd.read_pickle('../feature/exp18_4_tsfresh/train_c3.pkl')
# test_c3 = pd.read_pickle('../feature/exp18_4_tsfresh/test_c3.pkl')

# train = train.merge(train_c3,on = "customer_ID",how = "left")
# test = test.merge(test_c3,on = "customer_ID",how = "left")

# del train_c3,test_c3
# gc.collect

# print(train.shape)
# print(test.shape)

In [8]:

def xgb_amex(y_pred, y_true):
    """Return ``('amex', score)`` for predictions against a labelled dataset object.

    ``y_true`` must expose ``get_label()``; the score comes from
    ``amex_metric_np``.
    """
    labels = y_true.get_label()
    return 'amex', amex_metric_np(y_pred, labels)


def cat_amex(y_pred, y_true):
    """Return ``('amex', score)`` for predictions against a labelled dataset object.

    Same contract as ``xgb_amex``.
    NOTE(review): nothing visible in this notebook calls this function (the
    ``custom_metric`` entry in the CatBoost params is commented out).
    """
    labels = y_true.get_label()
    return 'amex', amex_metric_np(y_pred, labels)


def amex_metric_np(preds: np.ndarray, target: np.ndarray) -> float:
    """Vectorized competition metric: mean of normalized Gini and top-4% capture.

    Args:
        preds: 1-D array of predicted scores.
        target: 1-D array of labels aligned with ``preds``.

    Returns:
        ``0.5 * (g + d)`` where ``g`` is the weighted Gini divided by its
        closed-form maximum and ``d`` is the share of positives inside the
        top 4% of cumulative weight.
    """
    # Rank everything by descending score
    order = np.argsort(preds)[::-1]
    preds, target = preds[order], target[order]

    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos

    # Label 0 -> weight 20, label 1 -> weight 1
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()

    # d: positives captured in the first 4% of total weight
    d = np.sum(target[cum_norm_weight <= 0.04]) / n_pos

    # g: weighted Gini relative to its closed-form maximum
    weighted_target = target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)
    g = gini / gini_max

    return 0.5 * (g + d)

In [None]:

from catboost import CatBoostClassifier

# Categorical inputs: the "_last" variant of each raw categorical column
# (presumably the per-customer last-value aggregate -- verify against the
# feature files being loaded).
cat_features = [
    f"{name}_last"
    for name in (
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    )
]
# (an earlier variant also appended "kmeans pred 2/3/4" columns here)

# Integer-encode each categorical column; the encoder is fitted on train
# only and then applied to test.
for cat_col in cat_features:
    encoder = LabelEncoder()
    train[cat_col] = encoder.fit_transform(train[cat_col])
    test[cat_col] = encoder.transform(test[cat_col])

# Model inputs: every train column except the ID and the target
features = [col for col in train.columns if col not in ('customer_ID', CFG.target)]

# Keyword arguments forwarded to CatBoostClassifier in the training loop
prams = dict(
    depth=10,  # 8 is noted as an alternative
    iterations=9999,
    learning_rate=0.02,
    random_state=CFG.seed,
    task_type="CPU",
    early_stopping_rounds=300,
    # disabled options: min_child_samples=13, custom_metric='cat_amex'
)

# Create the output directory up front so a missing folder cannot make the
# first save fail after a whole fold has been trained.
os.makedirs(CFG.output_dir, exist_ok = True)

# Fold-averaged test predictions
test_predictions = np.zeros(len(test))
# Out-of-fold predictions, collected in fold order (NOT original row order)
oof_predictions = []

# Customer IDs and targets in the same fold order as `oof_predictions`
cids = []
tr_target = []

kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {len(features)} features...')
    # `trn_ind` / `val_ind` are positional, hence .iloc everywhere below
    x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
    y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]

    # NOTE(review): `cat_features` is not passed to CatBoost, so the
    # label-encoded columns are treated as plain numeric inputs -- confirm
    # this is intended.
    model = CatBoostClassifier(**prams)

    model.fit(x_train, y_train,
                  eval_set = [(x_val, y_val)],
                  metric_period=100
                 )

    # Save the fitted model
    model.save_model(f"{CFG.output_dir}{CFG.model}_fold{fold}_seed{CFG.seed}.cbm")
    # NOTE(review): the pickle keeps its historical "lgbm_<boosting_type>"
    # name although it holds a CatBoost model; left unchanged so existing
    # consumers of the file still find it.
    joblib.dump(model, f'{CFG.output_dir}lgbm_{CFG.boosting_type}_fold{fold}_seed{CFG.seed}.pkl')

    # Predict validation
    val_pred = model.predict_proba(x_val)[:,1]
    oof_predictions.extend(val_pred)

    # Record IDs/targets positionally (was `.loc`, which is label-based and
    # only coincides with positions on a default RangeIndex)
    cids.extend(train["customer_ID"].iloc[val_ind])
    tr_target.extend(y_val)

    # Predict the test set and accumulate the fold average
    test_pred = model.predict_proba(test[features])[:,1]
    test_predictions += test_pred / CFG.n_folds

    # Compute fold metric
    score = amex_metric(y_val, val_pred)
    print(f'Our fold {fold} CV score is {score}')
    del x_train, x_val, y_train, y_val
    gc.collect()
    
# Score the pooled out-of-fold predictions
score = amex_metric(tr_target, oof_predictions)
print(f'Our out of folds CV score is {score}')

# Test predictions: one row per test customer; the score is part of the file name
test_df = test[['customer_ID']].assign(prediction = test_predictions)
test_df.to_csv(f'{CFG.output_dir}test_{CFG.ver}_{CFG.model}_{score}_{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)

# Out-of-fold table; rows are in fold order, which is why the IDs and
# targets gathered during training are used instead of `train` itself
dic_oof = dict(zip(
    ("customer_ID", "target", f"{CFG.ver}_{CFG.model}_oof"),
    (cids, tr_target, oof_predictions),
))
oof_df = pd.DataFrame(dic_oof)
oof_df.to_csv(f'{CFG.output_dir}oof_{CFG.ver}_{CFG.model}_{score}_{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)


# # Create a dataframe to store out of folds predictions
# oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train[CFG.target], 'prediction': oof_predictions})
# oof_df.to_csv(f'../output/Amex LGBM Dart CV 0.7977/oof_lgbm_{CFG.boosting_type}_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)


# # Create a dataframe to store test prediction
# test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
# test_df.to_csv(f'../output/Amex LGBM Dart CV 0.7977/test_lgbm_{CFG.boosting_type}_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)


 
--------------------------------------------------
Training fold 0 with 2011 features...




0:	learn: 0.6664775	test: 0.6663839	best: 0.6663839 (0)	total: 1.17s	remaining: 3h 15m 37s
100:	learn: 0.2408595	test: 0.2414693	best: 0.2414693 (100)	total: 1m 43s	remaining: 2h 49m 27s
200:	learn: 0.2259198	test: 0.2283421	best: 0.2283421 (200)	total: 3m 24s	remaining: 2h 45m 53s
300:	learn: 0.2197401	test: 0.2241464	best: 0.2241464 (300)	total: 5m 4s	remaining: 2h 43m 26s
400:	learn: 0.2153980	test: 0.2218409	best: 0.2218409 (400)	total: 6m 44s	remaining: 2h 41m 15s
500:	learn: 0.2118675	test: 0.2204066	best: 0.2204066 (500)	total: 8m 24s	remaining: 2h 39m 23s
600:	learn: 0.2087189	test: 0.2193640	best: 0.2193640 (600)	total: 10m 4s	remaining: 2h 37m 28s
700:	learn: 0.2055474	test: 0.2185133	best: 0.2185133 (700)	total: 11m 43s	remaining: 2h 35m 36s
800:	learn: 0.2025189	test: 0.2179143	best: 0.2179143 (800)	total: 13m 21s	remaining: 2h 33m 25s
900:	learn: 0.1995967	test: 0.2174438	best: 0.2174432 (899)	total: 14m 58s	remaining: 2h 31m 13s
1000:	learn: 0.1968630	test: 0.2171285	best



0:	learn: 0.6669185	test: 0.6668947	best: 0.6668947 (0)	total: 1.05s	remaining: 2h 55m 57s
100:	learn: 0.2401012	test: 0.2427124	best: 0.2427124 (100)	total: 1m 43s	remaining: 2h 49m 44s
200:	learn: 0.2251243	test: 0.2304355	best: 0.2304355 (200)	total: 3m 23s	remaining: 2h 45m 29s
300:	learn: 0.2189112	test: 0.2264949	best: 0.2264949 (300)	total: 5m 4s	remaining: 2h 43m 23s
400:	learn: 0.2145766	test: 0.2244442	best: 0.2244442 (400)	total: 6m 44s	remaining: 2h 41m 18s
500:	learn: 0.2110451	test: 0.2230531	best: 0.2230531 (500)	total: 8m 25s	remaining: 2h 39m 37s
600:	learn: 0.2079252	test: 0.2220687	best: 0.2220687 (600)	total: 10m 3s	remaining: 2h 37m 21s


In [None]:
# Histogram of the out-of-fold predicted probabilities
oof_df[f"{CFG.ver}_{CFG.model}_oof"].hist()

In [19]:
# Row count of the out-of-fold table (one row per validated training customer)
len(oof_df)

458913