In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import catboost

import gc
from tqdm import tqdm_notebook
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split, GroupKFold
tqdm.pandas()

In [2]:
import random
import os 
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

In [3]:
root_path = "/data/project/recsys-challenge-2023/sharechat_recsys2023_data/"
train_file = os.path.join(root_path, "train/train.parquet")
test_file = os.path.join(root_path, "test/test.parquet")

train_df = pd.read_parquet(train_file)
test_df = pd.read_parquet(test_file)

In [5]:
trn_idx = train_df[train_df["f_1"] != 66].index
val_idx = train_df[train_df["f_1"] == 66].index

valid_df = train_df.iloc[val_idx]
train_df = train_df.iloc[trn_idx]

In [6]:
print(train_df.shape, valid_df.shape)

(3387880, 83) (97972, 83)


In [7]:
# f_7 제외해봐야함 
features = [
    c for c in train_df.columns if c not in [
        'is_clicked', 'is_installed', 'file_name', 'f_0', 'target'
    ]
]
print(features)

['f_1', 'f_2', 'f_3', 'f_4', 'f_5', 'f_6', 'f_7', 'f_8', 'f_9', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_27', 'f_28', 'f_29', 'f_30', 'f_31', 'f_32', 'f_33', 'f_34', 'f_35', 'f_36', 'f_37', 'f_38', 'f_39', 'f_40', 'f_41', 'f_42', 'f_43', 'f_44', 'f_45', 'f_46', 'f_47', 'f_48', 'f_49', 'f_50', 'f_51', 'f_52', 'f_53', 'f_54', 'f_55', 'f_56', 'f_57', 'f_58', 'f_59', 'f_60', 'f_61', 'f_62', 'f_63', 'f_64', 'f_65', 'f_66', 'f_67', 'f_68', 'f_69', 'f_70', 'f_71', 'f_72', 'f_73', 'f_74', 'f_75', 'f_76', 'f_77', 'f_78', 'f_79']


## Adversarial Validation 
- Train <-> Validation 간의 Gap을 최대화 시키는 변수 확인 

In [8]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
lgb_params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc', # log_loss
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47
         }

In [13]:
_valid_df = valid_df.copy()
_train_df = train_df.copy()

_train_df["target"] = 0
_valid_df["target"] = 1

total_df = pd.concat([_valid_df, _train_df], axis=0)

In [15]:
adversarial_scores = []
for col in tqdm(features):
    print(f"col : {col}")
    X_train, X_valid, y_train, y_valid = train_test_split(total_df[[col]], total_df["target"], test_size=0.5, random_state=42, shuffle=True)
    trn_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(lgb_params, trn_data, 50, valid_sets = [trn_data, val_data], verbose_eval=-1, early_stopping_rounds=100)
    adversarial_scores += [clf.best_score["valid_1"]["auc"]]
    print("=="*50)

  0%|          | 0/79 [00:00<?, ?it/s]

col : f_1
Training until validation scores don't improve for 100 rounds


  1%|▏         | 1/79 [00:05<07:38,  5.88s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 1	valid_1's auc: 1
col : f_2
Training until validation scores don't improve for 100 rounds


  3%|▎         | 2/79 [00:15<10:05,  7.86s/it]

Did not meet early stopping. Best iteration is:
[34]	training's auc: 0.696495	valid_1's auc: 0.696297
col : f_3
Training until validation scores don't improve for 100 rounds


  4%|▍         | 3/79 [00:22<09:25,  7.44s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.554632	valid_1's auc: 0.553555
col : f_4
Training until validation scores don't improve for 100 rounds


  5%|▌         | 4/79 [00:32<10:46,  8.61s/it]

Did not meet early stopping. Best iteration is:
[49]	training's auc: 0.728869	valid_1's auc: 0.728949
col : f_5
Training until validation scores don't improve for 100 rounds


  6%|▋         | 5/79 [00:39<09:56,  8.07s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.516743	valid_1's auc: 0.517278
col : f_6
Training until validation scores don't improve for 100 rounds


  8%|▊         | 6/79 [00:51<11:31,  9.48s/it]

Did not meet early stopping. Best iteration is:
[50]	training's auc: 0.74342	valid_1's auc: 0.74437
col : f_7
Training until validation scores don't improve for 100 rounds


  9%|▉         | 7/79 [00:55<09:18,  7.75s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
col : f_8
Training until validation scores don't improve for 100 rounds


 10%|█         | 8/79 [01:03<08:54,  7.52s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.505119	valid_1's auc: 0.505474
col : f_9
Training until validation scores don't improve for 100 rounds


 11%|█▏        | 9/79 [01:10<08:51,  7.59s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.928483	valid_1's auc: 0.928695
col : f_10
Training until validation scores don't improve for 100 rounds


 13%|█▎        | 10/79 [01:16<08:12,  7.13s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.519629	valid_1's auc: 0.521813
col : f_11
Training until validation scores don't improve for 100 rounds


 14%|█▍        | 11/79 [01:24<08:23,  7.41s/it]

Did not meet early stopping. Best iteration is:
[24]	training's auc: 0.643703	valid_1's auc: 0.642431
col : f_12
Training until validation scores don't improve for 100 rounds


 15%|█▌        | 12/79 [01:31<08:09,  7.31s/it]

Did not meet early stopping. Best iteration is:
[34]	training's auc: 0.532316	valid_1's auc: 0.534467
col : f_13
Training until validation scores don't improve for 100 rounds


 16%|█▋        | 13/79 [01:40<08:24,  7.64s/it]

Did not meet early stopping. Best iteration is:
[46]	training's auc: 0.510581	valid_1's auc: 0.508496
col : f_14
Training until validation scores don't improve for 100 rounds


 18%|█▊        | 14/79 [01:47<08:03,  7.44s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.5143	valid_1's auc: 0.513804
col : f_15
Training until validation scores don't improve for 100 rounds


 19%|█▉        | 15/79 [02:00<09:43,  9.11s/it]

Did not meet early stopping. Best iteration is:
[49]	training's auc: 0.521333	valid_1's auc: 0.506083
col : f_16
Training until validation scores don't improve for 100 rounds


 20%|██        | 16/79 [02:07<08:54,  8.48s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.608544	valid_1's auc: 0.606923
col : f_17
Training until validation scores don't improve for 100 rounds


 22%|██▏       | 17/79 [02:14<08:27,  8.19s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.531011	valid_1's auc: 0.533755
col : f_18
Training until validation scores don't improve for 100 rounds


 23%|██▎       | 18/79 [02:22<08:15,  8.13s/it]

Did not meet early stopping. Best iteration is:
[48]	training's auc: 0.509576	valid_1's auc: 0.510766
col : f_19
Training until validation scores don't improve for 100 rounds


 24%|██▍       | 19/79 [02:30<07:49,  7.83s/it]

Did not meet early stopping. Best iteration is:
[43]	training's auc: 0.596228	valid_1's auc: 0.596964
col : f_20
Training until validation scores don't improve for 100 rounds


 25%|██▌       | 20/79 [02:37<07:42,  7.84s/it]

Did not meet early stopping. Best iteration is:
[14]	training's auc: 0.659682	valid_1's auc: 0.658893
col : f_21
Training until validation scores don't improve for 100 rounds


 27%|██▋       | 21/79 [02:45<07:25,  7.68s/it]

Did not meet early stopping. Best iteration is:
[7]	training's auc: 0.592865	valid_1's auc: 0.592156
col : f_22
Training until validation scores don't improve for 100 rounds


 28%|██▊       | 22/79 [02:53<07:22,  7.75s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.562349	valid_1's auc: 0.561006
col : f_23
Training until validation scores don't improve for 100 rounds


 29%|██▉       | 23/79 [02:59<06:53,  7.38s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.533242	valid_1's auc: 0.533119
col : f_24
Training until validation scores don't improve for 100 rounds


 30%|███       | 24/79 [03:05<06:18,  6.89s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.533182	valid_1's auc: 0.533041
col : f_25
Training until validation scores don't improve for 100 rounds


 32%|███▏      | 25/79 [03:12<06:12,  6.90s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.533242	valid_1's auc: 0.533119
col : f_26
Training until validation scores don't improve for 100 rounds


 33%|███▎      | 26/79 [03:16<05:17,  5.99s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
col : f_27
Training until validation scores don't improve for 100 rounds


 34%|███▍      | 27/79 [03:20<04:51,  5.60s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
col : f_28
Training until validation scores don't improve for 100 rounds


 35%|███▌      | 28/79 [03:24<04:21,  5.12s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
col : f_29
Training until validation scores don't improve for 100 rounds


 37%|███▋      | 29/79 [03:29<04:02,  4.86s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
col : f_30
Training until validation scores don't improve for 100 rounds


 38%|███▊      | 30/79 [03:35<04:16,  5.23s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.596158	valid_1's auc: 0.599106
col : f_31
Training until validation scores don't improve for 100 rounds


 39%|███▉      | 31/79 [03:41<04:32,  5.67s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.596191	valid_1's auc: 0.599112
col : f_32
Training until validation scores don't improve for 100 rounds


 41%|████      | 32/79 [03:48<04:33,  5.82s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.504501	valid_1's auc: 0.501458
col : f_33
Training until validation scores don't improve for 100 rounds


 42%|████▏     | 33/79 [03:53<04:22,  5.70s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.508478	valid_1's auc: 0.508598
col : f_34
Training until validation scores don't improve for 100 rounds


 43%|████▎     | 34/79 [03:59<04:26,  5.93s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.503925	valid_1's auc: 0.503126
col : f_35
Training until validation scores don't improve for 100 rounds


 44%|████▍     | 35/79 [04:05<04:15,  5.80s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.51794	valid_1's auc: 0.51581
col : f_36
Training until validation scores don't improve for 100 rounds


 46%|████▌     | 36/79 [04:11<04:17,  5.98s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.503644	valid_1's auc: 0.502573
col : f_37
Training until validation scores don't improve for 100 rounds


 47%|████▋     | 37/79 [04:17<04:03,  5.79s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.513275	valid_1's auc: 0.513511
col : f_38
Training until validation scores don't improve for 100 rounds


 48%|████▊     | 38/79 [04:23<04:08,  6.05s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.519591	valid_1's auc: 0.519029
col : f_39
Training until validation scores don't improve for 100 rounds


 49%|████▉     | 39/79 [04:29<03:54,  5.86s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.500369	valid_1's auc: 0.5003
col : f_40
Training until validation scores don't improve for 100 rounds


 51%|█████     | 40/79 [04:34<03:37,  5.58s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.503119	valid_1's auc: 0.502573
col : f_41
Training until validation scores don't improve for 100 rounds


 52%|█████▏    | 41/79 [04:39<03:27,  5.46s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.501694	valid_1's auc: 0.501455
col : f_42
Training until validation scores don't improve for 100 rounds


 53%|█████▎    | 42/79 [04:51<04:31,  7.35s/it]

Did not meet early stopping. Best iteration is:
[50]	training's auc: 0.548541	valid_1's auc: 0.536535
col : f_43
Training until validation scores don't improve for 100 rounds


 54%|█████▍    | 43/79 [05:00<04:43,  7.87s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.990878	valid_1's auc: 0.990743
col : f_44
Training until validation scores don't improve for 100 rounds


 56%|█████▌    | 44/79 [05:06<04:15,  7.30s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.500445	valid_1's auc: 0.500203
col : f_45
Training until validation scores don't improve for 100 rounds


 57%|█████▋    | 45/79 [05:12<04:01,  7.09s/it]

Did not meet early stopping. Best iteration is:
[45]	training's auc: 0.500348	valid_1's auc: 0.500129
col : f_46
Training until validation scores don't improve for 100 rounds


 58%|█████▊    | 46/79 [05:19<03:45,  6.84s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.500145	valid_1's auc: 0.500244
col : f_47
Training until validation scores don't improve for 100 rounds


 59%|█████▉    | 47/79 [05:25<03:31,  6.59s/it]

Did not meet early stopping. Best iteration is:
[14]	training's auc: 0.500754	valid_1's auc: 0.500809
col : f_48
Training until validation scores don't improve for 100 rounds


 61%|██████    | 48/79 [05:30<03:16,  6.33s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.504463	valid_1's auc: 0.50413
col : f_49
Training until validation scores don't improve for 100 rounds


 62%|██████▏   | 49/79 [05:35<02:59,  5.98s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.503328	valid_1's auc: 0.502972
col : f_50
Training until validation scores don't improve for 100 rounds


 63%|██████▎   | 50/79 [05:41<02:51,  5.91s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.504971	valid_1's auc: 0.50455
col : f_51
Training until validation scores don't improve for 100 rounds


 65%|██████▍   | 51/79 [05:51<03:17,  7.07s/it]

Did not meet early stopping. Best iteration is:
[48]	training's auc: 0.992025	valid_1's auc: 0.991851
col : f_52
Training until validation scores don't improve for 100 rounds


 66%|██████▌   | 52/79 [05:59<03:17,  7.32s/it]

Did not meet early stopping. Best iteration is:
[50]	training's auc: 0.512025	valid_1's auc: 0.509082
col : f_53
Training until validation scores don't improve for 100 rounds


 67%|██████▋   | 53/79 [06:06<03:08,  7.25s/it]

Did not meet early stopping. Best iteration is:
[40]	training's auc: 0.509323	valid_1's auc: 0.507557
col : f_54
Training until validation scores don't improve for 100 rounds


 68%|██████▊   | 54/79 [06:14<03:04,  7.39s/it]

Did not meet early stopping. Best iteration is:
[47]	training's auc: 0.515103	valid_1's auc: 0.515472
col : f_55
Training until validation scores don't improve for 100 rounds


 70%|██████▉   | 55/79 [06:23<03:12,  8.02s/it]

Did not meet early stopping. Best iteration is:
[2]	training's auc: 0.531552	valid_1's auc: 0.52801
col : f_56
Training until validation scores don't improve for 100 rounds


 71%|███████   | 56/79 [06:31<03:03,  7.99s/it]

Did not meet early stopping. Best iteration is:
[49]	training's auc: 0.530516	valid_1's auc: 0.526581
col : f_57
Training until validation scores don't improve for 100 rounds


 72%|███████▏  | 57/79 [06:40<03:02,  8.30s/it]

Did not meet early stopping. Best iteration is:
[38]	training's auc: 0.527478	valid_1's auc: 0.519434
col : f_58
Training until validation scores don't improve for 100 rounds


 73%|███████▎  | 58/79 [06:49<02:59,  8.57s/it]

Did not meet early stopping. Best iteration is:
[42]	training's auc: 0.989374	valid_1's auc: 0.989259
col : f_59
Training until validation scores don't improve for 100 rounds


 75%|███████▍  | 59/79 [06:59<02:55,  8.76s/it]

Did not meet early stopping. Best iteration is:
[48]	training's auc: 0.988362	valid_1's auc: 0.988198
col : f_60
Training until validation scores don't improve for 100 rounds


 76%|███████▌  | 60/79 [07:05<02:32,  8.05s/it]

Did not meet early stopping. Best iteration is:
[50]	training's auc: 0.501089	valid_1's auc: 0.500546
col : f_61
Training until validation scores don't improve for 100 rounds


 77%|███████▋  | 61/79 [07:14<02:28,  8.27s/it]

Did not meet early stopping. Best iteration is:
[47]	training's auc: 0.526153	valid_1's auc: 0.520001
col : f_62
Training until validation scores don't improve for 100 rounds


 78%|███████▊  | 62/79 [07:23<02:28,  8.73s/it]

Did not meet early stopping. Best iteration is:
[44]	training's auc: 0.526848	valid_1's auc: 0.519981
col : f_63
Training until validation scores don't improve for 100 rounds


 80%|███████▉  | 63/79 [07:31<02:15,  8.45s/it]

Did not meet early stopping. Best iteration is:
[49]	training's auc: 0.527852	valid_1's auc: 0.525179
col : f_64
Training until validation scores don't improve for 100 rounds


 81%|████████  | 64/79 [07:41<02:10,  8.69s/it]

Did not meet early stopping. Best iteration is:
[49]	training's auc: 0.99261	valid_1's auc: 0.992389
col : f_65
Training until validation scores don't improve for 100 rounds


 82%|████████▏ | 65/79 [07:50<02:04,  8.92s/it]

Did not meet early stopping. Best iteration is:
[47]	training's auc: 0.988567	valid_1's auc: 0.988305
col : f_66
Training until validation scores don't improve for 100 rounds


 84%|████████▎ | 66/79 [08:00<01:58,  9.13s/it]

Did not meet early stopping. Best iteration is:
[5]	training's auc: 0.98957	valid_1's auc: 0.98937
col : f_67
Training until validation scores don't improve for 100 rounds


 85%|████████▍ | 67/79 [08:09<01:49,  9.14s/it]

Did not meet early stopping. Best iteration is:
[46]	training's auc: 0.988378	valid_1's auc: 0.988213
col : f_68
Training until validation scores don't improve for 100 rounds


 86%|████████▌ | 68/79 [08:17<01:36,  8.75s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.910576	valid_1's auc: 0.910801
col : f_69
Training until validation scores don't improve for 100 rounds


 87%|████████▋ | 69/79 [08:25<01:24,  8.49s/it]

Did not meet early stopping. Best iteration is:
[49]	training's auc: 0.911471	valid_1's auc: 0.912112
col : f_70
Training until validation scores don't improve for 100 rounds


 89%|████████▊ | 70/79 [08:34<01:18,  8.68s/it]

Did not meet early stopping. Best iteration is:
[50]	training's auc: 0.99072	valid_1's auc: 0.990632
col : f_71
Training until validation scores don't improve for 100 rounds


 90%|████████▉ | 71/79 [08:39<01:01,  7.75s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.508516	valid_1's auc: 0.508648
col : f_72
Training until validation scores don't improve for 100 rounds


 91%|█████████ | 72/79 [08:46<00:52,  7.53s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.507971	valid_1's auc: 0.504507
col : f_73
Training until validation scores don't improve for 100 rounds


 92%|█████████▏| 73/79 [08:54<00:45,  7.58s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.519444	valid_1's auc: 0.517416
col : f_74
Training until validation scores don't improve for 100 rounds


 94%|█████████▎| 74/79 [09:02<00:38,  7.60s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.506568	valid_1's auc: 0.505178
col : f_75
Training until validation scores don't improve for 100 rounds


 95%|█████████▍| 75/79 [09:12<00:33,  8.32s/it]

Did not meet early stopping. Best iteration is:
[49]	training's auc: 0.522377	valid_1's auc: 0.522194
col : f_76
Training until validation scores don't improve for 100 rounds


 96%|█████████▌| 76/79 [09:19<00:24,  8.11s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.520889	valid_1's auc: 0.520078
col : f_77
Training until validation scores don't improve for 100 rounds


 97%|█████████▋| 77/79 [09:24<00:14,  7.18s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.500369	valid_1's auc: 0.5003
col : f_78
Training until validation scores don't improve for 100 rounds


 99%|█████████▊| 78/79 [09:30<00:06,  6.73s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.503122	valid_1's auc: 0.502571
col : f_79
Training until validation scores don't improve for 100 rounds


100%|██████████| 79/79 [09:35<00:00,  7.28s/it]

Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.501694	valid_1's auc: 0.501456





In [19]:
# DataFrame 생성 및 점수 확인 
df = pd.DataFrame(adversarial_scores, columns=["adv_score"])
df["feature"] = features

In [21]:
df = df.sort_values(by="adv_score", ascending=False).reset_index(drop=True)
df

Unnamed: 0,adv_score,feature
0,1.000000,f_1
1,0.992389,f_64
2,0.991851,f_51
3,0.990743,f_43
4,0.990632,f_70
...,...,...
74,0.500000,f_29
75,0.500000,f_28
76,0.500000,f_27
77,0.500000,f_26
