In [None]:
# Colab-only setup. Uncomment the two installs below on a fresh runtime that
# does not have these packages yet.
#!pip install pytorch-tabnet==3.1.1
#!pip install optuna

import os
from google.colab import drive
# Mount Google Drive; `data_path` (defined in a later cell) points below this mount point.
drive.mount("/content/drive")

In [None]:
import json
import random
from datetime import datetime
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
# from statsmodels.graphics.mosaicplot import mosaic
from matplotlib import pyplot as plt

import torch
from torch import nn
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric

from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [None]:
data_path = "/content/drive/MyDrive/dacon/data"


def _read_csv(file_name):
    """Load one CSV file stored directly under `data_path`."""
    return pd.read_csv(os.path.join(data_path, file_name))


# Train / test tables.
train_df = _read_csv('train.csv')
test_df = _read_csv('test.csv')

# Code lookup tables for attributes D, H and L.
code_d = _read_csv('속성_D_코드.csv')
code_h = _read_csv('속성_H_코드.csv')
code_l = _read_csv('속성_L_코드.csv')

# Submission template; per-fold prediction columns are added to it later.
submission_df = _read_csv('sample_submission.csv')

In [None]:
# Give the code lookup tables ASCII column names. The first column of each table is
# the code itself (the merge key used by `merge_codes`); the remaining columns are
# its coarser groupings.
# NOTE(review): the _d/_s/_m/_l suffixes presumably stand for detail/small/medium/large
# category levels - confirm against the competition's data description.
code_d.columns= ["attribute_d","attribute_d_d","attribute_d_s","attribute_d_m","attribute_d_l"]
code_h.columns= ["attribute_h","attribute_h_m","attribute_h_l"]
code_l.columns= ["attribute_l","attribute_l_d","attribute_l_s","attribute_l_m","attribute_l_l"]

# Quick sanity check of the loaded table sizes.
print(train_df.shape, test_df.shape, code_d.shape, code_h.shape, code_l.shape)
# train_df.head()

In [None]:
# Single seed shared by the RNGs, the KFold splitters, TabNet and the Optuna sampler.
seed_no = 1000


def seed_everything(seed):
    """Fix the seed of every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global RNG and PyTorch (CPU and CUDA),
    and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python / NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (CPU, current CUDA device, all CUDA devices)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: deterministic algorithms only, no auto-tuner
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def merge_codes(df:pd.DataFrame, df_code:pd.DataFrame, col:str) -> pd.DataFrame:
    """Left-join a code lookup table onto `df` using column `col` as the key.

    The first column of `df_code` is treated as the code (join key) and is renamed
    to `col`; every other column is renamed to ``f"{col}_{original_name}"`` so that
    the same lookup table can be merged several times under different keys.

    Args:
        df: Frame containing the column `col`.
        df_code: Lookup table whose first column holds the codes found in `df[col]`.
        col: Name of the key column in `df`.

    Returns:
        A new frame with the prefixed lookup columns appended. Neither input is
        modified. Rows whose code is absent from `df_code` get NaN in the new columns
        (left join).
    """
    lookup = df_code.copy()
    # Assign a fresh list of names instead of writing into `columns.values`:
    # an Index is meant to be immutable and in-place edits of its backing array
    # rely on an implementation detail.
    lookup.columns = [col] + [f"{col}_{name}" for name in df_code.columns[1:]]
    return pd.merge(df, lookup, how="left", on=col)


def preprocess_data(
    df:pd.DataFrame, is_train:bool=True,
    cols_merge:Optional[List[Tuple[str,pd.DataFrame]]] = None,
    cols_equi:Optional[List[Tuple[str,str]]] = None,
    cols_drop:Optional[List[str]] = None,
    )->Tuple[pd.DataFrame,Optional[np.ndarray]]:
    """Turn a raw train/test frame into a model-ready feature frame.

    Steps, in order: split off the target (train only), merge the code lookup
    tables, cast boolean columns to int, add equality-flag columns, drop unused
    columns.

    Args:
        df: Raw input frame; it is copied, never modified.
        is_train: When True the "target" column is extracted and removed.
        cols_merge: (key column, lookup table) pairs passed to `merge_codes`.
            Defaults to no merges.
        cols_equi: (col1, col2) pairs; for each pair a 0/1 column named
            ``f"{col1}_{col2}"`` is added that flags rows where both are equal.
            Defaults to no pairs.
        cols_drop: Columns removed at the end. Defaults to
            ["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"].

    Returns:
        (features, target) where target is a NumPy array for training data and
        None when `is_train` is False.

    Raises:
        KeyError: If "target" (train only), a compared column or a column listed
            in `cols_drop` is missing.
    """
    # None sentinels instead of list defaults: a default list would be shared
    # between all calls of the function.
    if cols_drop is None:
        cols_drop = ["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]

    df = df.copy()

    y_data = None
    if is_train:
        y_data = df["target"].to_numpy()
        df = df.drop(columns="target")

    for col, df_code in cols_merge or []:
        df = merge_codes(df, df_code, col)

    bool_cols = df.select_dtypes(bool).columns.tolist()
    if bool_cols:  # nothing to cast when the frame has no boolean columns
        df[bool_cols] = df[bool_cols].astype(int)

    for col1, col2 in cols_equi or []:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2]).astype(int)

    df = df.drop(columns=cols_drop)
    return (df, y_data)

# (column name, code table) pairs: each listed column is joined with its code table
# so that the small / medium / large category codes become extra feature columns.
cols_merge = [
    *[(f"person_prefer_d_{rank}", code_d) for rank in (1, 2, 3)],
    ("contents_attribute_d", code_d),
    *[(f"person_prefer_h_{rank}", code_h) for rank in (1, 2, 3)],
    ("contents_attribute_h", code_h),
    ("contents_attribute_l", code_l),
]

# Column-name pairs compared for equality: each pair yields a 0/1 feature telling
# whether the member attribute and the content attribute carry the same code.
# NOTE(review): the attribute-D pairs cover person_prefer_d_2 and _3 only (no _1),
# while attribute H covers 1-3 - confirm that this omission is intentional.
cols_equi = [
    ("contents_attribute_c", "person_prefer_c"),
    ("contents_attribute_e", "person_prefer_e"),

    *[
        (f"person_prefer_d_{rank}_attribute_d_{level}", f"contents_attribute_d_attribute_d_{level}")
        for rank in (2, 3)
        for level in ("s", "m", "l")
    ],

    *[
        (f"person_prefer_h_{rank}_attribute_h_{level}", f"contents_attribute_h_attribute_h_{level}")
        for rank in (1, 2, 3)
        for level in ("m", "l")
    ],

    # ("person_prefer_h_1_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    # ("person_prefer_h_2_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    # ("person_prefer_h_3_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
]

def split_data(x_train, y_train, idx):
    """Split features and labels into a training part and a validation part.

    Args:
        x_train: Feature DataFrame.
        y_train: NumPy array of labels with one entry per row of `x_train`.
        idx: Positional (0-based) row numbers that form the validation set.

    Returns:
        (X_train, Y_train, X_val, Y_val) as NumPy arrays. The shapes are printed
        as a side effect.
    """
    # One positional mask drives both the feature and the label split. Excluding
    # rows by index *label* instead would select different rows than the labels
    # whenever the frame does not have a default 0..n-1 index.
    keep = np.ones(y_train.size, dtype=bool)
    keep[idx] = False

    X_val = x_train.iloc[idx].values
    Y_val = y_train[idx]

    X_train = x_train.iloc[keep].values
    Y_train = y_train[keep]

    print(X_train.shape, X_val.shape)
    print(Y_train.shape, Y_val.shape)
    return X_train, Y_train, X_val, Y_val

def get_thr(score, gt, n_iter=10000):
    """Grid-search the probability cutoff that maximises the binary F1 score.

    The candidate cutoffs are ``i / n_iter`` for ``i = 1 .. n_iter - 1``; a sample
    is predicted positive when ``score[:, 1] >= cutoff``.

    Args:
        score: Array of shape (n_samples, 2); column 1 is the positive-class score.
        gt: Ground-truth labels; entries equal to 1 are the positive class.
        n_iter: Grid resolution (number of steps between 0 and 1).

    Returns:
        (best_cutoff, best_f1). On ties the lowest cutoff wins.

    Raises:
        ValueError: If `n_iter` is smaller than 2 (no candidate cutoffs).
    """
    if n_iter < 2:
        raise ValueError("n_iter must be at least 2 to have a cutoff to evaluate")

    pos_score = np.asarray(score)[:, 1]
    is_pos = np.asarray(gt) == 1
    cutoffs = np.arange(1, n_iter) / n_iter

    # Sort each class once; the number of samples at or above a cutoff is then a
    # binary search, so the whole grid is evaluated in a single vectorised pass.
    pos_sorted = np.sort(pos_score[is_pos])
    neg_sorted = np.sort(pos_score[~is_pos])
    tp = pos_sorted.size - np.searchsorted(pos_sorted, cutoffs, side="left")
    fp = neg_sorted.size - np.searchsorted(neg_sorted, cutoffs, side="left")
    fn = pos_sorted.size - tp

    # F1 = 2TP / (2TP + FP + FN); it is defined as 0 when the denominator is 0.
    denom = 2 * tp + fp + fn
    f1s = np.divide(2.0 * tp, denom, out=np.zeros(cutoffs.size), where=denom > 0)

    # Index k belongs to cutoff (k + 1) / n_iter, so read the cutoff from the grid
    # itself rather than computing k / n_iter.
    best = int(np.argmax(f1s))
    return float(cutoffs[best]), float(f1s[best])

class F1_Score(Metric):
    """F1 metric for TabNet evaluation, registered under the name "f1".

    Higher is better (`_maximize` is True). Scores are binarised with a fixed
    0.5 cutoff on the second score column before the F1 score is computed.
    """

    def __init__(self):
        self._maximize = True
        self._name = "f1"

    def __call__(self, y_true, y_score):
        """Return the F1 score of `y_score[:, 1] > 0.5` against `y_true`."""
        hard_labels = (y_score[:, 1]>0.5)*1
        return f1_score(y_true, hard_labels)


# Seed all generators once, before any data processing or model code runs.
seed_everything(seed_no)

In [None]:
# Opening month of each content item, derived from `contents_open_dt`. It only feeds
# the (commented-out) mosaic plot below; the preprocessing cell drops the column again.
train_df["contents_open_mm"] = train_df["contents_open_dt"].map(
    lambda opened_at: pd.Timestamp(opened_at).month
)
# mosaic(train_df.sort_values('contents_open_mm'), ['contents_open_mm', 'target'], 
#       title='Mosaic Chart')
# plt.show()

In [None]:
# Columns that are not used for training
cols_drop = [
    "id",
    "person_prefer_f", #only one value
    "person_prefer_g", #only one value
    "contents_open_dt",
    "person_rn",
    "contents_rn"]

# `contents_open_mm` is an exploration-only helper column (mosaic plot). Check for it
# explicitly instead of wrapping the drop in a bare `except`, which would also hide
# unrelated errors (e.g. `train_df` not being defined).
if "contents_open_mm" in train_df.columns:
    train_df = train_df.drop(columns=['contents_open_mm']) #mosaic plot
else:
    print("already removed col: contents_open_mm")

# Identical preprocessing for train and test; only train carries a target.
x_train, y_train = preprocess_data(train_df, cols_merge = cols_merge , cols_equi= cols_equi , cols_drop = cols_drop)
x_test, _ = preprocess_data(test_df,is_train = False, cols_merge = cols_merge , cols_equi= cols_equi  , cols_drop = cols_drop)
x_train.shape , y_train.shape , x_test.shape

In [None]:
# Encode the feature columns for TabNet:
#   * ordinal columns stay numeric and are scaled by the *training* maximum;
#   * every other column is label-encoded and registered as categorical
#     (`cat_idxs` = column positions, `cat_dims` = number of categories).
# The previous condition `'match' not in col or col not in ordinal_col` was true for
# every column, so the ordinal columns were label-encoded and registered as
# categorical and then scaled into fractional values.
cat_idxs = []
cat_dims = []
ordinal_col = ['person_attribute_a_1', 'person_attribute_b', 'person_prefer_e', 'contents_attribute_e']
for idx, col in enumerate(x_train.columns):
    if col in ordinal_col:
        # Use one scale (fitted on train) for both frames so that equal raw values
        # map to equal features; skip the division when the maximum is 0.
        col_max = x_train[col].max()
        if col_max != 0:
            x_train[col] = x_train[col] / col_max
            x_test[col] = x_test[col] / col_max
        continue

    if 'match' in col:
        # Columns whose name contains 'match' are left untouched, as before.
        continue

    le = LabelEncoder()
    le.fit(x_train[col].values)
    le_dict = dict(zip(le.classes_, le.transform(le.classes_)))

    # Values that do not occur in the training data share one extra "unknown" index.
    unknown_idx = len(le_dict)
    x_train[col] = x_train[col].apply(lambda x: le_dict.get(x, unknown_idx))
    x_test[col] = x_test[col].apply(lambda x: le_dict.get(x, unknown_idx))

    cat_idxs.append(idx)
    cat_dims.append(len(le_dict)+1)  # +1 for the "unknown" index

In [None]:
# Summary statistics of the ordinal columns after the scaling in the previous cell.
x_train[ordinal_col].describe()

# TabNet hyperparameter tuning

In [None]:
# --- Cross-validation / search budget ---
n_fold_opt = 5  # folds evaluated inside every Optuna trial (see `Objective`)
n_fold = 10  # folds of the final CV ensemble
n_trials = 50  # number of Optuna trials

# --- Training settings shared by tuning and final training ---
n_workers = 2  # forwarded to `fit(num_workers=...)`
patience_cv = 20  # early-stopping patience during tuning
patience = 50  # early-stopping patience for the final models
max_epoch = 500
lr = 2e-2  # learning rate handed to the optimizer
batch_size = 1024  # not read by the visible cells: the batch size is tuned by Optuna instead
virtual_batch_size = 256  # also the lower bound and step of the tuned batch size
time_out_hour = 12  # only referenced by a commented-out `study.optimize` call
# NOTE(review): 'f1' presumably resolves to the custom `F1_Score` metric through its
# `_name` - confirm against the pytorch-tabnet metric lookup.
eval_metric = ['f1']

def Objective(trial):
    """Optuna objective: mean cross-validated score of one TabNet configuration.

    Samples a set of hyperparameters from `trial`, trains one TabNet classifier per
    fold of an `n_fold_opt`-fold split of `x_train` / `y_train`, and averages each
    model's `best_cost`.

    Args:
        trial: The Optuna trial that supplies the hyperparameter suggestions.

    Returns:
        Mean of the per-fold `best_cost` values (presumably the best validation
        value of the 'f1' metric - confirm against pytorch-tabnet).
    """
    # Search space. The suggestions are requested in a fixed order.
    sampled = {
        "mask_type": trial.suggest_categorical("mask_type", ["entmax", "sparsemax"]),
        "n_da": trial.suggest_int("n_da", 32, 64, step=8),
        "n_steps": trial.suggest_int("n_steps", 1, 5, step=1),
        "gamma": trial.suggest_float("gamma", 1.1, 1.4, step=0.1),
        "n_shared": trial.suggest_int("n_shared", 1, 3),
        "lambda_sparse": trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True),
        "cat_emb_dim": trial.suggest_int("cat_emb_dim", 1, 5, step=2),
        "batch_size": trial.suggest_int("batch_size", virtual_batch_size, 1024, step=virtual_batch_size),
    }

    # Constructor arguments; `n_d` and `n_a` share one sampled width.
    model_kwargs = {
        "cat_idxs": cat_idxs,
        "cat_dims": cat_dims,
        "cat_emb_dim": sampled["cat_emb_dim"],
        "n_d": sampled["n_da"],
        "n_a": sampled["n_da"],
        "n_steps": sampled["n_steps"],
        "gamma": sampled["gamma"],
        "lambda_sparse": sampled["lambda_sparse"],
        "mask_type": sampled["mask_type"],
        "n_shared": sampled["n_shared"],
        "optimizer_fn": torch.optim.AdamW,
        "optimizer_params": dict(lr=lr),
        "scheduler_fn": torch.optim.lr_scheduler.StepLR,
        "scheduler_params": {"gamma": 0.95, "step_size": 10},
        "seed": seed_no,
        "verbose": 0,
    }

    features = x_train.values
    splitter = KFold(n_splits=n_fold_opt, random_state=seed_no, shuffle=True)

    fold_scores = []
    for fit_rows, held_out_rows in splitter.split(features):
        model = TabNetClassifier(**model_kwargs)
        # Early stopping: training ends after `patience_cv` epochs without improvement.
        model.fit(
            X_train=features[fit_rows],
            y_train=y_train[fit_rows],
            eval_set=[(features[held_out_rows], y_train[held_out_rows])],
            patience=patience_cv,
            max_epochs=max_epoch,
            eval_metric=eval_metric,
            batch_size=sampled["batch_size"],
            virtual_batch_size=virtual_batch_size,
            num_workers=n_workers,
            drop_last=False,
        )
        fold_scores.append(model.best_cost)

    return np.mean(fold_scores)

In [None]:
# Seeded TPE sampler so that the sequence of suggested hyperparameters is repeatable.
tpe_sampler = TPESampler(seed=seed_no)
# "maximize" because `Objective` returns a score where higher is better.
study = optuna.create_study(direction="maximize", study_name='TabNet optimization', sampler=tpe_sampler)
# Variant with a wall-clock limit (kept for reference):
# study.optimize(Objective, n_trials=n_trials, timeout=time_out_hour*3600)
study.optimize(Objective, n_trials=n_trials)

In [None]:
# Relative importance of each tuned hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# Objective value per trial over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Parallel-coordinate view of the sampled hyperparameter combinations.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Pairwise contour plots for the listed hyperparameters; the tuned "batch_size" is
# not part of this list.
optuna.visualization.plot_contour(
    study,
    params=[
        "n_da", "cat_emb_dim", "n_steps", "gamma", "lambda_sparse", "mask_type", "n_shared"
    ],
)

# Training with CV (Ensemble)

In [None]:
# Assemble the configuration of the final models from the best Optuna trial.
TabNet_params = study.best_params
final_params = dict(
    # Same categorical setup as during tuning; without it `cat_emb_dim` would have
    # no columns to apply to.
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    n_d=TabNet_params['n_da'],
    n_a=TabNet_params['n_da'],
    cat_emb_dim=TabNet_params['cat_emb_dim'],
    n_steps=TabNet_params['n_steps'],
    gamma=TabNet_params['gamma'],
    lambda_sparse=TabNet_params['lambda_sparse'],
    optimizer_fn=torch.optim.AdamW,
    mask_type=TabNet_params['mask_type'],
    n_shared=TabNet_params['n_shared'],
    optimizer_params=dict(lr=lr),
    # NOTE: `batch_size` is an argument of `fit()`, not of the `TabNetClassifier`
    # constructor. It is kept here so it is stored with the other tuned values and
    # must be removed before this dict is unpacked into the constructor.
    batch_size=TabNet_params["batch_size"],
    scheduler_params = {"gamma": 0.95, "step_size": 10},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    seed=seed_no,
    verbose=0,
)
print(final_params)

In [None]:
# One checkpoint directory per run, named after the start time (minute resolution).
strtime = datetime.now().strftime("%Y%m%d%H%M")
save_dir = f"{data_path}/ckpt/{strtime}"
os.makedirs(save_dir, exist_ok=True)


def _json_default(obj):
    """JSON fallback for values `json` cannot encode natively.

    NumPy scalars become plain Python numbers; classes and functions (such as the
    optimizer and scheduler factories) are stored by their dotted name. Whoever
    reloads the file has to map those names back to the real callables.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if callable(obj):
        module = getattr(obj, "__module__", None)
        name = getattr(obj, "__qualname__", repr(obj))
        return f"{module}.{name}" if module else name
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


parameter_path = f'{save_dir}/tuned_params.json'
with open(parameter_path, "w") as f:
    # Without `default`, json.dump raises TypeError on the optimizer / scheduler classes.
    json.dump(final_params, f, default=_json_default)

In [None]:
# Reload the tuned parameters from disk so this cell trains exactly what was saved.
with open(parameter_path, "r") as json_data:
    final_params = json.load(json_data)

# Build the constructor arguments from the reloaded values:
#   * `batch_size` belongs to `fit()`, the constructor does not accept it;
#   * classes cannot be stored in JSON, so the optimizer / scheduler used for tuning
#     (AdamW + StepLR) are set again here;
#   * the categorical column setup is taken from the notebook state if the file
#     does not contain it.
model_params = {name: value for name, value in final_params.items() if name != "batch_size"}
model_params["optimizer_fn"] = torch.optim.AdamW
model_params["scheduler_fn"] = torch.optim.lr_scheduler.StepLR
model_params.setdefault("cat_idxs", cat_idxs)
model_params.setdefault("cat_dims", cat_dims)

kf = KFold(n_splits=n_fold, random_state=seed_no, shuffle=True)

CV_f1, CV_info = [], []
fold_no = 1
for train_index, test_index in kf.split(x_train.values):
    train_X, val_X = x_train.values[train_index], x_train.values[test_index]
    train_Y, val_Y = y_train[train_index], y_train[test_index]

    final_Tabnet = TabNetClassifier(**model_params)
    final_Tabnet.fit(
        X_train=train_X,
        y_train=train_Y,
        eval_set=[(train_X, train_Y), (val_X, val_Y)],
        eval_name = ['train', 'val'],
        patience=patience,
        max_epochs=max_epoch,
        eval_metric=eval_metric,
        batch_size=final_params["batch_size"], #batch_size,
        virtual_batch_size=virtual_batch_size,
        num_workers=n_workers,
        drop_last=False,
    )

    CV_f1.append(final_Tabnet.best_cost)

    # Tune the decision cutoff on this fold's validation predictions;
    # `best_thr` is the tuple (cutoff, F1 at that cutoff).
    val_preds = final_Tabnet.predict_proba(val_X)
    best_thr = get_thr(val_preds, val_Y)
    print(f'best threshold and f1 score from fold {fold_no}-th validation set : {best_thr}')

    # Binarise the test predictions with the fold's cutoff and keep them as one
    # column per fold in the submission frame.
    test_preds = final_Tabnet.predict_proba(x_test.values)
    test_preds = (test_preds[:,1] >= best_thr[0]) * 1
    submission_df[f'target_{fold_no}'] = test_preds

    CV_info.append({'fold':fold_no, 'val_threshold':best_thr[0], 'val_f1':best_thr[1]})
    ckpt_path = os.path.join(save_dir, f"fold_{fold_no}")
    final_Tabnet.save_model(ckpt_path)
    fold_no +=1

# Mean of the per-fold `best_cost` values.
avg = np.mean(CV_f1)

In [None]:
# Preview the submission frame, which now holds one `target_<fold>` column per fold.
submission_df.head()