In [1]:
import os
import torch

class Config:
    """Static experiment settings: identifiers, CV setup and booster hyper-parameters."""
    AUTHOR = "colum2131"

    # Experiment name; setup() uses it as the root of the output folders.
    NAME = "NFLC-" + "lgbm+xgb"

    # Competition slug; setup() uses it to build the input directory path.
    COMPETITION = "nfl-player-contact-detection"

    seed = 42
    # Number of GroupKFold splits.
    num_fold = 5
    
    # Parameters passed to lightgbm.train() by fit_lgbm().
    lgbm_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate':0.03,
        'lambda_l1': 2.757654517864576e-06, 
        'lambda_l2': 0.018135558360332416, 
        'num_leaves': 254, 
        'feature_fraction': 0.9083150639158681, 
        'bagging_fraction': 0.7563425196831307, 
        'bagging_freq': 5, 
        'min_child_samples': 33
    }
    # Parameters passed to xgboost.train() by fit_xgboost().
    # NOTE(review): 'learning_rate' (0.03) and 'eta' (0.31...) are documented
    # aliases of the same XGBoost parameter but carry different values here;
    # confirm which one is intended and drop the other.
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate':0.03,
        'verbosity': 2,
        'booster': 'gbtree', 
        'lambda': 2.2456853514324213e-05, 
        'alpha': 0.010167492143658599, 
        'max_depth': 10, 
        'eta': 0.3119573893909086, 
        'gamma': 3.858135609025019e-05, 
        'colsample_bytree': 0.4158915520852815, 
        'colsample_bylevel': 0.605320595063157, 
        'subsample': 0.9302938053468902, 
        'min_child_weight': 0.1204339344286855,
        # Histogram tree builder: GPU variant when CUDA is available, CPU otherwise.
        'tree_method':'hist' if not torch.cuda.is_available() else 'gpu_hist'
    }

In [2]:
import os
import gc
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from IPython.display import Video, display

from scipy.optimize import minimize
import cv2
from glob import glob
from tqdm import tqdm

from sklearn.model_selection import GroupKFold
from sklearn.metrics import (
    roc_auc_score,
    matthews_corrcoef,
)

import xgboost as xgb
import lightgbm as lgbm

import torch

if torch.cuda.is_available():
    import cupy 
    import cudf
    from cuml import ForestInference

In [3]:
def setup(cfg):
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # set dirs
    cfg.INPUT = f'../input/{cfg.COMPETITION}'
    cfg.EXP = cfg.NAME
    cfg.OUTPUT_EXP = cfg.NAME
    cfg.SUBMISSION = './'
    cfg.DATASET = '../input/'

    cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

    # make dirs
    for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
        
    return cfg

In [4]:
# ==============================
# function
# ==============================
# ref: https://www.kaggle.com/code/robikscube/nfl-player-contact-detection-getting-started
def add_contact_id(df):
    """Add a ``contact_id`` column of the form ``<game_play>_<step>_<id1>_<id2>``.

    The frame is modified in place and also returned.
    """
    pieces = [
        df["step"].astype("str"),
        df["nfl_player_id_1"].astype("str"),
        df["nfl_player_id_2"].astype("str"),
    ]
    contact_id = df["game_play"]
    for piece in pieces:
        contact_id = contact_id + "_" + piece
    df["contact_id"] = contact_id
    return df

def expand_contact_id(df):
    """Split ``contact_id`` into ``game_play``, ``step`` and the two player ids.

    ``game_play`` is the first 12 characters of the id; the remaining fields
    are the last three underscore-separated tokens. ``step`` is cast to int,
    the player ids stay strings. The frame is modified in place and returned.
    """
    contact_id = df["contact_id"]
    tokens = contact_id.str.split("_")

    df["game_play"] = contact_id.str.slice(stop=12)
    df["step"] = tokens.str[-3].astype("int")
    df["nfl_player_id_1"] = tokens.str[-2]
    df["nfl_player_id_2"] = tokens.str[-1]
    return df

# cross validation
def get_groupkfold(train, target_col, group_col, n_splits):
    """Assign every row of ``train`` to a GroupKFold validation fold.

    Args:
        train: Frame holding the target and group columns.
        target_col: Name of the target column (passed to the splitter as ``y``).
        group_col: Name of the column whose values must not be split across folds.
        n_splits: Number of folds.

    Returns:
        A Series indexed by row position whose value is the fold in which
        that row is used for validation.
    """
    splitter = GroupKFold(n_splits=n_splits)
    splits = splitter.split(train, train[target_col], train[group_col])
    per_fold = [
        pd.Series(fold_id, index=valid_positions)
        for fold_id, (_, valid_positions) in enumerate(splits)
    ]
    return pd.concat(per_fold).sort_index()

# lgbm code
def fit_lgbm(cfg, X, y, params, add_suffix=''):
    """Train one LightGBM booster per fold and return out-of-fold probabilities.

    Args:
        cfg: Config object providing ``folds`` (fold id per row, ``-1`` means
            "skip"), ``EXP_MODEL`` (model output directory) and ``EXP_PREDS``
            (prediction output directory).
        X: Feature frame, row-aligned with ``cfg.folds``.
        y: Binary target Series, row-aligned with ``X``.
        params: Parameter dict forwarded to ``lightgbm.train``.
        add_suffix: Suffix appended to the saved model / prediction file names.

    Returns:
        float32 array of length ``len(y)`` holding the positive-class
        probability predicted for each row by the model that did not see it.
    """
    oof_pred = np.zeros(len(y), dtype=np.float32)
    for fold in sorted(cfg.folds.unique()):
        if fold == -1:
            continue
        idx_train = (cfg.folds != fold)
        idx_valid = (cfg.folds == fold)
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]
        display(pd.Series(y_valid).value_counts())

        lgbm_train = lgbm.Dataset(x_train, label=y_train)
        lgbm_valid = lgbm.Dataset(x_valid, label=y_valid)
        evals = [lgbm_train, lgbm_valid]

        model = lgbm.train(
            params,
            lgbm_train,
            num_boost_round=10_000,
            early_stopping_rounds=100,
            valid_sets=evals,
            verbose_eval=100,
        )

        model_path = os.path.join(cfg.EXP_MODEL, f'lgbm_fold{fold}{add_suffix}.model')
        model.save_model(model_path)

        # Reload from disk so validation scores the exact artefact that
        # pred_lgbm() will later use for inference.
        if not torch.cuda.is_available():
            model = lgbm.Booster(model_file=model_path)
            pred_i = model.predict(x_valid)
        else:
            model = ForestInference.load(model_path, output_class=True, model_type='lightgbm')
            # With output_class=True, predict() returns thresholded 0/1 labels.
            # AUC and the downstream threshold search need probabilities, so
            # take the positive-class column of predict_proba() instead
            # (same call fit_xgboost() uses on the GPU path).
            pred_i = model.predict_proba(x_valid)[:, 1]

        # Positional boolean mask: does not depend on X carrying a RangeIndex.
        oof_pred[np.asarray(idx_valid)] = pred_i
        score = round(roc_auc_score(y_valid, pred_i), 5)
        print(f'Performance of the prediction: {score}\n')
        del model
        gc.collect()

    np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred{add_suffix}'), oof_pred)
    score = round(roc_auc_score(y, oof_pred), 5)
    print(f'All Performance of the prediction: {score}')
    return oof_pred

def pred_lgbm(X, data_dir, add_suffix=''):
    """Average the predictions of every saved LightGBM fold model.

    Args:
        X: Feature frame to score.
        data_dir: Directory containing ``lgbm_fold*<add_suffix>.model`` files.
        add_suffix: Suffix used when the models were saved.

    Returns:
        Element-wise mean of the per-model predictions.
    """
    pattern = os.path.join(data_dir, f'lgbm_fold*{add_suffix}.model')
    per_model = []
    for model_path in glob(pattern):
        booster = lgbm.Booster(model_file=model_path)
        per_model.append(booster.predict(X))
    return np.mean(np.array(per_model), axis=0)

# xgboost code
def fit_xgboost(cfg, X, y, params, add_suffix=''):
    """Train one XGBoost booster per fold and return out-of-fold probabilities.

    Args:
        cfg: Config object providing ``folds`` (fold id per row, ``-1`` means
            "skip"), ``EXP_MODEL`` (model output directory) and ``EXP_PREDS``
            (prediction output directory).
        X: Feature frame, row-aligned with ``cfg.folds``.
        y: Binary target Series, row-aligned with ``X``.
        params: Parameter dict forwarded to ``xgboost.train``, e.g.::

            {
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'learning_rate': 0.01,
                'tree_method': 'gpu_hist',
            }
        add_suffix: Suffix appended to the saved model / prediction file names.

    Returns:
        float32 array of length ``len(y)`` holding the positive-class
        probability predicted for each row by the model that did not see it.
    """
    oof_pred = np.zeros(len(y), dtype=np.float32)
    for fold in sorted(cfg.folds.unique()):
        if fold == -1:
            continue
        idx_train = (cfg.folds != fold)
        idx_valid = (cfg.folds == fold)
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]
        display(pd.Series(y_valid).value_counts())

        xgb_train = xgb.DMatrix(x_train, label=y_train)
        xgb_valid = xgb.DMatrix(x_valid, label=y_valid)
        evals = [(xgb_train, 'train'), (xgb_valid, 'eval')]

        model = xgb.train(
            params,
            xgb_train,
            num_boost_round=10_000,
            early_stopping_rounds=100,
            evals=evals,
            verbose_eval=100,
        )

        model_path = os.path.join(cfg.EXP_MODEL, f'xgb_fold{fold}{add_suffix}.model')
        model.save_model(model_path)

        # Reload from disk so validation scores the exact artefact that
        # pred_xgboost() will later use for inference.
        if not torch.cuda.is_available():
            # Booster.load_model() loads in place and returns None, so the
            # Booster must be created first and kept. A native Booster has no
            # predict_proba(); predict() on a DMatrix yields positive-class
            # probabilities for the binary:logistic objective.
            model = xgb.Booster()
            model.load_model(model_path)
            pred_i = model.predict(xgb_valid)
        else:
            model = ForestInference.load(model_path, output_class=True, model_type='xgboost')
            pred_i = model.predict_proba(x_valid)[:, 1]

        # Positional boolean mask: does not depend on X carrying a RangeIndex.
        oof_pred[np.asarray(idx_valid)] = pred_i
        score = round(roc_auc_score(y_valid, pred_i), 5)
        print(f'Performance of the prediction: {score}\n')
        del model
        gc.collect()

    np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred{add_suffix}'), oof_pred)
    score = round(roc_auc_score(y, oof_pred), 5)
    print(f'All Performance of the prediction: {score}')
    return oof_pred

def pred_xgboost(X, data_dir, add_suffix=''):
    """Average the predictions of every saved XGBoost fold model.

    Args:
        X: Feature frame to score.
        data_dir: Directory containing ``xgb_fold*<add_suffix>.model`` files.
        add_suffix: Suffix used when the models were saved.

    Returns:
        Element-wise mean of the per-model positive-class probabilities.
    """
    model_paths = glob(os.path.join(data_dir, f'xgb_fold*{add_suffix}.model'))

    if not torch.cuda.is_available():
        # Native boosters score a DMatrix; build it once for all folds.
        dmatrix = xgb.DMatrix(X)
        per_model = []
        for model_path in model_paths:
            # load_model() loads in place and returns None, so keep the
            # Booster instance rather than the call's return value.
            booster = xgb.Booster()
            booster.load_model(model_path)
            # binary:logistic boosters output positive-class probabilities.
            per_model.append(booster.predict(dmatrix))
    else:
        per_model = [
            ForestInference.load(model_path, output_class=True, model_type='xgboost')
            .predict_proba(X)[:, 1]
            for model_path in model_paths
        ]

    return np.mean(np.array(per_model), axis=0)

In [5]:
# ==============================
# read data
# ==============================
cfg = setup(Config)

# The competition tables are loaded with cudf when a GPU is present and with
# pandas otherwise; the two libraries are called with the same arguments here.
use_gpu = torch.cuda.is_available()
frame_lib = cudf if use_gpu else pd


def _input_path(file_name):
    """Return the path of ``file_name`` inside the competition input folder."""
    return os.path.join(cfg.INPUT, file_name)


tr_tracking = frame_lib.read_csv(_input_path('train_player_tracking.csv'), parse_dates=["datetime"])
te_tracking = frame_lib.read_csv(_input_path('test_player_tracking.csv'), parse_dates=["datetime"])

# The sample submission is always parsed with pandas because
# expand_contact_id() is applied to it before any GPU conversion.
sub = pd.read_csv(_input_path('sample_submission.csv'))

train = frame_lib.read_csv(_input_path('train_labels.csv'), parse_dates=["datetime"])
test = expand_contact_id(sub)
if use_gpu:
    test = cudf.DataFrame(test)

The following code is used to create the features.  
The numerical features in player_tracking.csv are merged onto every player pair twice: once for `nfl_player_id_1` (columns suffixed `_1`) and once for `nfl_player_id_2` (columns suffixed `_2`).

In [6]:
# ==============================
# feature engineering
# ==============================
def create_features(df, tr_tracking, merge_col="step", use_cols=["x_position", "y_position"]):
    """Attach tracking features for both players of every pair.

    Args:
        df: Pair-level frame with ``game_play``, ``merge_col``,
            ``nfl_player_id_1`` and ``nfl_player_id_2``.
        tr_tracking: Tracking frame with ``game_play``, ``merge_col``,
            ``nfl_player_id`` and every column in ``use_cols``.
        merge_col: Time key shared by both frames.
        use_cols: Tracking columns to copy; each appears twice in the result,
            suffixed ``_1`` and ``_2``.

    Returns:
        ``(df_combo, output_cols)``: the merged frame, sorted by game_play,
        ``merge_col`` and both player ids, and the list of generated feature
        names. When both position columns are requested a Euclidean
        ``distance`` column is added; ``G_flug`` marks rows whose second
        "player" is ``"G"``.
    """
    tracking_keys = ["game_play", merge_col, "nfl_player_id"]
    tracking = tr_tracking.astype({"nfl_player_id": "str"})[tracking_keys + use_cols]

    def _attach(frame, suffix):
        # Left-join the tracking rows of player `suffix` and tag the new columns.
        return (
            frame.merge(
                tracking,
                left_on=["game_play", merge_col, "nfl_player_id" + suffix],
                right_on=tracking_keys,
                how="left",
            )
            .drop("nfl_player_id", axis=1)
            .rename(columns={c: c + suffix for c in use_cols})
        )

    with_first = _attach(df.astype({"nfl_player_id_1": "str"}), "_1")
    df_combo = (
        _attach(with_first, "_2")
        .sort_values(["game_play", merge_col, "nfl_player_id_1", "nfl_player_id_2"])
        .reset_index(drop=True)
    )
    output_cols = [c + suffix for suffix in ("_1", "_2") for c in use_cols]

    if "x_position" in use_cols and "y_position" in use_cols:
        # Only rows whose second player has tracking data get a distance;
        # every other row stays NaN.
        has_second = df_combo['x_position_2'].notnull()
        if torch.cuda.is_available():
            has_second = has_second.to_array()

        delta_x = df_combo.loc[has_second, "x_position_1"] - df_combo.loc[has_second, "x_position_2"]
        delta_y = df_combo.loc[has_second, "y_position_1"] - df_combo.loc[has_second, "y_position_2"]
        known_distance = np.sqrt(np.square(delta_x) + np.square(delta_y))
        if torch.cuda.is_available():
            known_distance = known_distance.to_array()

        distance = np.full(len(has_second), np.nan)
        distance[has_second] = known_distance
        df_combo['distance'] = distance
        output_cols.append("distance")

    df_combo['G_flug'] = (df_combo['nfl_player_id_2'] == "G")
    output_cols.append("G_flug")
    return df_combo, output_cols


# Tracking columns copied onto every pair, once per player (suffix _1 / _2).
use_cols = [
    'x_position', 'y_position', 'speed', 'distance',
    'direction', 'orientation', 'acceleration', 'sa'
]
# The returned feature list depends only on use_cols, so both calls yield the
# same feature_cols; the second assignment just rebinds it.
train, feature_cols = create_features(train, tr_tracking, use_cols=use_cols)
test, feature_cols = create_features(test, te_tracking, use_cols=use_cols)
# On GPU the frames are cudf objects; convert to pandas for the remaining cells.
if torch.cuda.is_available():
    train = train.to_pandas()
    test = test.to_pandas()

display(train)

Unnamed: 0,contact_id,game_play,datetime,step,nfl_player_id_1,nfl_player_id_2,contact,x_position_1,y_position_1,speed_1,...,x_position_2,y_position_2,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,distance,G_flug
0,58168_003392_0_37084_37211,58168_003392,2020-09-11 03:01:48.100,0,37084,37211,0,41.90,20.08,0.54,...,39.59,17.07,0.53,0.05,134.84,84.73,1.43,1.42,3.794232,False
1,58168_003392_0_37084_38556,58168_003392,2020-09-11 03:01:48.100,0,37084,38556,0,41.90,20.08,0.54,...,41.93,30.61,0.67,0.05,232.50,227.00,1.82,1.61,10.530043,False
2,58168_003392_0_37084_38567,58168_003392,2020-09-11 03:01:48.100,0,37084,38567,0,41.90,20.08,0.54,...,40.37,19.88,0.66,0.07,136.70,88.92,0.90,0.89,1.543017,False
3,58168_003392_0_37084_38590,58168_003392,2020-09-11 03:01:48.100,0,37084,38590,0,41.90,20.08,0.54,...,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58,5.431841,False
4,58168_003392_0_37084_39947,58168_003392,2020-09-11 03:01:48.100,0,37084,39947,0,41.90,20.08,0.54,...,40.11,26.73,0.99,0.09,163.38,90.69,1.68,1.64,6.886697,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4721613,58582_003121_91_52500_52619,58582_003121,2021-10-12 02:42:29.100,91,52500,52619,0,58.74,40.11,1.34,...,58.90,22.09,1.32,0.14,74.21,47.63,0.96,-0.44,18.020710,False
4721614,58582_003121_91_52500_G,58582_003121,2021-10-12 02:42:29.100,91,52500,G,0,58.74,40.11,1.34,...,,,,,,,,,,True
4721615,58582_003121_91_52609_52619,58582_003121,2021-10-12 02:42:29.100,91,52609,52619,0,60.32,25.93,1.38,...,58.90,22.09,1.32,0.14,74.21,47.63,0.96,-0.44,4.094142,False
4721616,58582_003121_91_52609_G,58582_003121,2021-10-12 02:42:29.100,91,52609,G,0,60.32,25.93,1.38,...,,,,,,,,,,True


# Exclude distance > 2
If the distance between two players is greater than 2, the probability of contact is so low that we treat it as 0 and do not train on those rows. This reduces the training data from 4.7 M rows to 660 K. Rows with a missing distance (player–ground pairs) are kept.

In [7]:
DISTANCE_THRESH = 2

# Captured BEFORE filtering: both arrays span the full, unfiltered training
# set so that excluded rows keep an out-of-fold prediction of 0.
train_y = train['contact'].values
oof_pred = np.zeros(len(train))

# A pair is kept when the players are close enough or when no distance exists
# (the second "player" has no tracking row, e.g. the ground).
cond_dis_train = train['distance'].le(DISTANCE_THRESH) | train['distance'].isna()
cond_dis_test = test['distance'].le(DISTANCE_THRESH) | test['distance'].isna()

train = train[cond_dis_train].reset_index(drop=True)

print('number of train data : ',len(train))

_ = gc.collect()

number of train data :  660560


# Helmet track Features

In [8]:
# Resolutions (number of time buckets per play) at which helmet boxes are aggregated.
CLUSTERS = [10, 50, 100, 500]

def add_step_pct(df, cluster):
    """Add ``step_pct``: each row's step rescaled to an integer bucket in ``0..cluster``.

    The bucket is ``ceil(cluster * (step - min_step) / (max_step - min_step))``
    computed over the rows of ``df`` (callers pass one ``game_play`` at a time).

    Args:
        df: Frame with a numeric ``step`` column.
        cluster: Number of buckets the step range is mapped onto.

    Returns:
        ``df`` itself, with an int32 ``step_pct`` column added or overwritten.
    """
    first_step = df['step'].min()
    step_span = df['step'].max() - first_step

    if step_span == 0:
        # Every row shares one step: the ratio would be 0/0 = NaN, which cannot
        # be cast to int. Put the whole group into bucket 0 instead.
        df['step_pct'] = np.int32(0)
        return df

    scaled = cluster * (df['step'] - first_step) / step_span
    df['step_pct'] = np.ceil(scaled).astype(np.int32)
    return df

HELMET_BOX_COLS = ['left', 'width', 'top', 'height']


def _step_pct_by_play(df, cluster):
    """Return each row's step bucket (0..cluster, int32) within its game_play.

    Vectorised form of ``ceil(cluster * (step - min) / (max - min))`` with
    min/max taken per ``game_play``. The result is aligned with ``df``'s index.
    """
    steps = df.groupby('game_play')['step']
    first_step = steps.transform('min')
    step_span = steps.transform('max') - first_step
    scaled = cluster * (df['step'] - first_step) / step_span
    # A play with a single distinct step has span 0 (0/0 = NaN); use bucket 0.
    return np.ceil(scaled.where(step_span != 0, 0)).astype(np.int32)


def _load_helmets(file_name):
    """Read a baseline-helmets CSV, fold 'Endzone2' into 'Endzone', rename frame -> step."""
    helmets = pd.read_csv(os.path.join(cfg.INPUT, file_name))
    helmets.loc[helmets['view'] == 'Endzone2', 'view'] = 'Endzone'
    return helmets.rename(columns={'frame': 'step'})


def _mean_boxes(helmets, view):
    """Mean helmet box per (game_play, player, step bucket) for one camera view."""
    in_view = helmets[helmets['view'] == view]
    in_view = in_view.assign(
        helmet_id=in_view['game_play'] + '_' + in_view['nfl_player_id'].astype(str)
        + '_' + in_view['step_pct'].astype(str)
    )
    return in_view[['helmet_id'] + HELMET_BOX_COLS].groupby('helmet_id').mean().reset_index()


def _merge_boxes(pairs, boxes, player_ind, new_names):
    """Left-join ``boxes`` onto ``pairs`` for player ``player_ind`` and rename the box columns."""
    helmet_id = (
        pairs['game_play'] + '_' + pairs['nfl_player_id_' + str(player_ind)].astype(str)
        + '_' + pairs['step_pct'].astype(str)
    )
    return (
        pairs.assign(helmet_id=helmet_id)
        .merge(boxes, how='left', on='helmet_id')
        .rename(columns=new_names)
        .drop(columns='helmet_id')
    )


# The helmet tables do not depend on cluster or view, so read them once
# instead of once per (cluster, view) combination.
helmets_train_raw = _load_helmets('train_baseline_helmets.csv')
helmets_test_raw = _load_helmets('test_baseline_helmets.csv')

for cluster in CLUSTERS:
    train = train.assign(step_pct=_step_pct_by_play(train, cluster))
    test = test.assign(step_pct=_step_pct_by_play(test, cluster))

    # Buckets are computed over all views of a play, before filtering by view.
    helmets_train = helmets_train_raw.assign(step_pct=_step_pct_by_play(helmets_train_raw, cluster))
    helmets_test = helmets_test_raw.assign(step_pct=_step_pct_by_play(helmets_test_raw, cluster))

    for helmet_view in ['Sideline', 'Endzone']:
        helmet_train = _mean_boxes(helmets_train, helmet_view)
        helmet_test = _mean_boxes(helmets_test, helmet_view)

        for player_ind in [1, 2]:
            new_names = {
                c: c + '_' + helmet_view + '_' + str(cluster) + '_' + str(player_ind)
                for c in HELMET_BOX_COLS
            }
            train = _merge_boxes(train, helmet_train, player_ind, new_names)
            test = _merge_boxes(test, helmet_test, player_ind, new_names)
            feature_cols += list(new_names.values())

        del helmet_train, helmet_test

    del helmets_train, helmets_test
    gc.collect()

del helmets_train_raw, helmets_test_raw
gc.collect()

# Fill missing values for the ground

In [9]:
# For player-ground pairs there is no second helmet. Give the "ground" the
# first player's box position with zero width and height, so both sides of
# the pair carry comparable position information.
for frame in (train, test):
    ground_rows = frame['G_flug'] == True
    for cluster in CLUSTERS:
        for helmet_view in ['Sideline', 'Endzone']:
            tag = helmet_view + '_' + str(cluster)
            # Position: copy from player 1.
            for coord in ('left', 'top'):
                frame.loc[ground_rows, coord + '_' + tag + '_2'] = frame.loc[ground_rows, coord + '_' + tag + '_1']
            # Extent: the ground has no box size.
            for extent in ('width', 'height'):
                frame.loc[ground_rows, extent + '_' + tag + '_2'] = 0

# Difference & Product features

In [10]:
# Absolute difference for every feature that exists for both players
# (all "*_1" columns except the player id itself).
cols = [name[:-2] for name in train.columns if name.endswith('_1') and name != 'nfl_player_id_1']
diff_names = [base + '_diff' for base in cols]
first_names = [base + '_1' for base in cols]
second_names = [base + '_2' for base in cols]
for frame in (train, test):
    frame[diff_names] = np.abs(frame[first_names].values - frame[second_names].values)
feature_cols += diff_names

# Element-wise product, restricted to the raw tracking features.
cols = ['x_position', 'y_position', 'speed', 'distance', 'direction', 'orientation', 'acceleration', 'sa']
prod_names = [base + '_prod' for base in cols]
first_names = [base + '_1' for base in cols]
second_names = [base + '_2' for base in cols]
for frame in (train, test):
    frame[prod_names] = frame[first_names].values * frame[second_names].values
feature_cols += prod_names

print('number of features : ',len(feature_cols))
print('number of train data : ',len(train))

number of features :  130
number of train data :  660560


# Train & Infer LightGBM and XGBoost models

In [11]:
# ==============================
# training & inference
# ==============================

# Group folds by play so that one play never appears in both train and validation.
cfg.folds = get_groupkfold(train, 'contact', 'game_play', cfg.num_fold)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'), index=False)

# OOF predictions exist only for rows that passed the distance filter;
# scatter them back into the full-length buffer (other rows stay 0).
kept_rows = np.where(cond_dis_train)
oof_pred[kept_rows] = fit_lgbm(
    cfg,
    train[feature_cols],
    train['contact'],
    cfg.lgbm_params,
    add_suffix="_lgbm_1st",
)
np.save('oof_pred_1.npy',oof_pred)

test_features = test.loc[cond_dis_test, feature_cols]
sub_pred_1 = pred_lgbm(test_features, cfg.EXP_MODEL, add_suffix="_lgbm_1st")

0    119036
1     13030
Name: contact, dtype: int64



[LightGBM] [Info] Number of positive: 51282, number of negative: 477212
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30655
[LightGBM] [Info] Number of data points in the train set: 528494, number of used features: 130
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097034 -> initscore=-2.230621
[LightGBM] [Info] Start training from score -2.230621
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.981544	valid_1's auc: 0.950197
[200]	training's auc: 0.993653	valid_1's auc: 0.953455
[300]	training's auc: 0.997476	valid_1's auc: 0.954101
[400]	training's auc: 0.998981	valid_1's auc: 0.954428
[500]	training's auc: 0.999586	valid_1's auc: 0.954599
[600]	training's auc: 0.999838	valid_1's auc: 0.954467
Early stopping, best iteration is:
[533]	training's auc: 0.999705	valid_1's auc: 0.954646
[W] [04:25:20.494230] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative t

0    119555
1     12629
Name: contact, dtype: int64



[LightGBM] [Info] Number of positive: 51683, number of negative: 476693
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30764
[LightGBM] [Info] Number of data points in the train set: 528376, number of used features: 130
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097815 -> initscore=-2.221744
[LightGBM] [Info] Start training from score -2.221744
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.982432	valid_1's auc: 0.943806
[200]	training's auc: 0.993948	valid_1's auc: 0.945622
[300]	training's auc: 0.997717	valid_1's auc: 0.946061
[400]	training's auc: 0.999059	valid_1's auc: 0.946186
Early stopping, best iteration is:
[390]	training's auc: 0.998956	valid_1's auc: 0.946226
[W] [04:33:46.787271] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.
[W] [04:33:46.913346] Casting all thresholds and leaf values to float32, as FIL 

0    118545
1     13620
Name: contact, dtype: int64



[LightGBM] [Info] Number of positive: 50692, number of negative: 477703
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30792
[LightGBM] [Info] Number of data points in the train set: 528395, number of used features: 130
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.095936 -> initscore=-2.243221
[LightGBM] [Info] Start training from score -2.243221
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.983315	valid_1's auc: 0.940882
[200]	training's auc: 0.994312	valid_1's auc: 0.944081
[300]	training's auc: 0.997859	valid_1's auc: 0.944812
[400]	training's auc: 0.999176	valid_1's auc: 0.944941
Early stopping, best iteration is:
[330]	training's auc: 0.998395	valid_1's auc: 0.945012
[W] [04:41:20.898839] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.
[W] [04:41:21.004896] Casting all thresholds and leaf values to float32, as FIL 

0    119615
1     12566
Name: contact, dtype: int64



[LightGBM] [Info] Number of positive: 51746, number of negative: 476633
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30766
[LightGBM] [Info] Number of data points in the train set: 528379, number of used features: 130
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097933 -> initscore=-2.220400
[LightGBM] [Info] Start training from score -2.220400
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.982568	valid_1's auc: 0.939131
[200]	training's auc: 0.99393	valid_1's auc: 0.941419
[300]	training's auc: 0.997727	valid_1's auc: 0.942166
[400]	training's auc: 0.999109	valid_1's auc: 0.942388
Early stopping, best iteration is:
[381]	training's auc: 0.998925	valid_1's auc: 0.942673
[W] [04:49:41.432430] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.
[W] [04:49:41.556245] Casting all thresholds and leaf values to float32, as FIL c

0    119497
1     12467
Name: contact, dtype: int64



[LightGBM] [Info] Number of positive: 51845, number of negative: 476751
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30793
[LightGBM] [Info] Number of data points in the train set: 528596, number of used features: 130
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.098081 -> initscore=-2.218736
[LightGBM] [Info] Start training from score -2.218736
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.98137	valid_1's auc: 0.943994
[200]	training's auc: 0.993658	valid_1's auc: 0.946329
[300]	training's auc: 0.997527	valid_1's auc: 0.946945
[400]	training's auc: 0.998946	valid_1's auc: 0.946849
Early stopping, best iteration is:
[335]	training's auc: 0.998175	valid_1's auc: 0.947024
[W] [04:57:14.215978] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.
[W] [04:57:14.331593] Casting all thresholds and leaf values to float32, as FIL c

In [None]:
def func(x_list):
    """Objective for the threshold search: negative MCC of the OOF predictions.

    ``x_list[0]`` is the candidate threshold; ``train_y`` and ``oof_pred``
    are read from the notebook's global scope.
    """
    candidate = x_list[0]
    return -matthews_corrcoef(train_y, oof_pred > candidate)

x0 = [0.5]
result = minimize(func, x0,  method="nelder-mead")
cfg.threshold = result.x[0]

best_mcc = matthews_corrcoef(train_y, oof_pred > cfg.threshold)
print("score:", round(best_mcc, 5))
print("threshold", round(cfg.threshold, 5))

# Binarise the LightGBM test probabilities with the tuned threshold.
sub_pred_1 = (sub_pred_1 > cfg.threshold).astype(int)
# Check that sub_pred_1 has become 6631 values of 0/1.
sub_pred_1

In [12]:
# Same play-grouped folds as for LightGBM (recomputed and re-saved).
cfg.folds = get_groupkfold(train, 'contact', 'game_play', cfg.num_fold)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'), index=False)

# Overwrite the full-length OOF buffer with the XGBoost predictions for the
# rows that passed the distance filter.
kept_rows = np.where(cond_dis_train)
oof_pred[kept_rows] = fit_xgboost(
    cfg,
    train[feature_cols],
    train['contact'],
    cfg.xgb_params,
    add_suffix="_xgb_1st",
)
np.save('oof_pred_2.npy',oof_pred)

test_features = test.loc[cond_dis_test, feature_cols]
sub_pred_2 = pred_xgboost(test_features, cfg.EXP_MODEL, add_suffix="_xgb_1st")

0    119036
1     13030
Name: contact, dtype: int64

[0]	train-auc:0.91499	eval-auc:0.89229
[100]	train-auc:0.98444	eval-auc:0.95081
[200]	train-auc:0.99375	eval-auc:0.95395
[300]	train-auc:0.99633	eval-auc:0.95436
[400]	train-auc:0.99798	eval-auc:0.95460
[500]	train-auc:0.99896	eval-auc:0.95459
[583]	train-auc:0.99944	eval-auc:0.95435
Performance of the prediction: 0.95435



0    119555
1     12629
Name: contact, dtype: int64

[0]	train-auc:0.92022	eval-auc:0.88591
[100]	train-auc:0.98505	eval-auc:0.94480
[200]	train-auc:0.99358	eval-auc:0.94739
[300]	train-auc:0.99638	eval-auc:0.94676
[303]	train-auc:0.99643	eval-auc:0.94675
Performance of the prediction: 0.94675



0    118545
1     13620
Name: contact, dtype: int64

[0]	train-auc:0.91852	eval-auc:0.86941
[100]	train-auc:0.98545	eval-auc:0.94066
[200]	train-auc:0.99434	eval-auc:0.94386
[300]	train-auc:0.99673	eval-auc:0.94368
[327]	train-auc:0.99719	eval-auc:0.94361
Performance of the prediction: 0.94359



0    119615
1     12566
Name: contact, dtype: int64

[0]	train-auc:0.92153	eval-auc:0.86664
[100]	train-auc:0.98582	eval-auc:0.93505
[200]	train-auc:0.99430	eval-auc:0.93970
[300]	train-auc:0.99692	eval-auc:0.94094
[400]	train-auc:0.99847	eval-auc:0.94131
[500]	train-auc:0.99921	eval-auc:0.94123
[527]	train-auc:0.99935	eval-auc:0.94119
Performance of the prediction: 0.94118



0    119497
1     12467
Name: contact, dtype: int64

[0]	train-auc:0.92063	eval-auc:0.87953
[100]	train-auc:0.98496	eval-auc:0.94008
[200]	train-auc:0.99361	eval-auc:0.94378
[300]	train-auc:0.99647	eval-auc:0.94438
[370]	train-auc:0.99769	eval-auc:0.94387
Performance of the prediction: 0.94387

All Performance of the prediction: 0.94582


In [None]:
# Re-run the MCC threshold search; oof_pred now holds the XGBoost predictions.
x0 = [0.5]
result = minimize(func, x0,  method="nelder-mead")
cfg.threshold = result.x[0]

best_mcc = matthews_corrcoef(train_y, oof_pred > cfg.threshold)
print("score:", round(best_mcc, 5))
print("threshold", round(cfg.threshold, 5))

# Binarise the XGBoost test probabilities with the tuned threshold.
sub_pred_2 = (sub_pred_2 > cfg.threshold).astype(int)
# Check that sub_pred_2 has become 6631 values of 0/1.
sub_pred_2

In [None]:
from sklearn.ensemble import  StackingClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb  
import xgboost as xgb
import catboost as cb
from catboost import CatBoostClassifier
# from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Load data. The stack is fitted on a NaN-free copy of the features; `train`
# itself is left untouched so earlier cells stay re-runnable.
data, target = train[feature_cols].fillna(0), train['contact']

# Hold-out split for a quick sanity score. The split variables get their own
# names so that the global `train_y` read by func() (threshold search) is not
# overwritten, and the split is seeded so that the score is reproducible.
stack_train_x, stack_valid_x, stack_train_y, stack_valid_y = train_test_split(
    data, target, test_size=0.25, random_state=cfg.seed
)

score = 0
models = []


base_models = [('rf_1', lgb.LGBMClassifier(
                            objective = 'binary',
                            learning_rate = 0.03, 
                            lambda_l1 = 2.757654517864576e-06,
#                             lambda_l2 = 0.018135558360332416,
                            num_leaves = 254,
                            feature_fraction = 0.9083150639158681,
                            bagging_fraction = 0.7563425196831307,
                            bagging_freq = 5,
                            min_child_samples = 33
                     )),
               
               ('rf_2', cb.CatBoostClassifier(
                            loss_function = 'Logloss',
                            eval_metric = 'Accuracy',
                            verbose = False,
                            depth = 9,
                            learning_rate = 0.09349732050796461,
                            l2_leaf_reg = 0.0026112334190675508, 
                            bagging_temperature = 0.004346654343182356,
                            random_strength = 0.0002090724949890479, 
                            border_count = 99
                     ))]

# Stacking setup: LightGBM + CatBoost base learners, shallow XGBoost meta-learner.
model = StackingClassifier(estimators=base_models, final_estimator=xgb.XGBClassifier(max_depth=3,
                                                                                     objective = 'binary:logistic',
                                                                                     learning_rate= 0.03,
                                                                                     n_estimators= 50,
                                                                                     reg_alpha = 0.5,
#                                        nthread = -1,
                                                                                     min_child_weight=3,
                                                                                     gamma=0.5,
                                                                                     subsample=0.5
#                                                                                      colsample_bytree=0.5
                                                                                    ))

model.fit(stack_train_x, stack_train_y)
models.append(model)

pred = model.predict(stack_valid_x)
pred_i = model.predict_proba(stack_valid_x)[:, 1]

# AUC is a ranking metric: score the positive-class probabilities, not
# thresholded class labels.
score = roc_auc_score(stack_valid_y, pred_i)

In [None]:
# Drop the training frame before test-time inference and force a collection pass.
del train
gc.collect()

In [None]:
# The stacking model was fitted on features whose NaNs were replaced by 0,
# so apply the same fill to the test features before predicting.
stack_test_x = test.loc[cond_dis_test, feature_cols].fillna(0)

preds = np.array([model.predict_proba(stack_test_x)[:, 1] for model in models])
preds = np.mean(preds, axis=0)
# NOTE(review): cfg.threshold is whatever the most recent threshold search
# left behind (the XGBoost one when cells run top to bottom); it was not tuned
# for the stacking model - confirm this is intended.
preds = (preds > cfg.threshold).astype(int)
preds

In [13]:
# 예측값 배열 생성
predictions = np.array([sub_pred_1, sub_pred_2, preds])

# 하드보팅 적용하여 예측
hard_vote = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=0, arr=predictions)



array([0.02025622, 0.03848793, 0.01408726, ..., 0.01222922, 0.03796682,
       0.09217735])

# Submission

In [14]:
# Rebuild contact_id from the sorted test frame, default every pair to
# "no contact", then fill in the ensemble vote for pairs that passed the
# distance filter.
test = add_contact_id(test)
test['contact'] = 0
test.loc[cond_dis_test, 'contact'] = hard_vote

submission = test[['contact_id', 'contact']]
submission.to_csv('submission.csv', index=False)
display(submission.head())

Unnamed: 0,contact_id,contact
0,58168_003392_0_37084_37211,0
1,58168_003392_0_37084_38556,0
2,58168_003392_0_37084_38567,0
3,58168_003392_0_37084_38590,0
4,58168_003392_0_37084_39947,0
