In [None]:
import os, json
import random 
from datetime import datetime
from typing import List ,Dict, Tuple

import numpy as np
import pandas as pd
# from statsmodels.graphics.mosaicplot import mosaic
from matplotlib import pyplot as plt

import torch
from catboost import Pool,CatBoostClassifier

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

# print(f"- os: {platform.platform()}")
# print(f"- python: {sys.version}")
# print(f"- pandas: {pd.__version__}")
# print(f"- numpy: {np.__version__}")
# print(f"- sklearn: {sklearn.__version__}")

In [None]:
# Directory holding the competition files read below.
data_path = "/usr/src/coco/dacon/data/job_new"

# Train/test tables plus the sample submission frame (later filled with predictions).
train_df, test_df, submission_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Code tables for attributes D, H and L (merged onto the data during preprocessing).
code_d, code_h, code_l = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('속성_D_코드.csv', '속성_H_코드.csv', '속성_L_코드.csv')
)

In [None]:
# Give the three code tables English column names. The first column of each table
# is the code itself (it is used as the join key when the tables are merged).
# NOTE(review): the _s/_m/_l suffixes presumably stand for small/medium/large
# category levels and _d for a finer "detail" level — confirm against the data
# description.
code_d.columns= ["attribute_d","attribute_d_d","attribute_d_s","attribute_d_m","attribute_d_l"]
code_h.columns= ["attribute_h","attribute_h_m","attribute_h_l"]
code_l.columns= ["attribute_l","attribute_l_d","attribute_l_s","attribute_l_m","attribute_l_l"]

# Quick shape check of everything that was loaded.
print(train_df.shape, test_df.shape, code_d.shape, code_h.shape, code_l.shape)
# train_df.head()

In [None]:
# Single seed reused for the global RNGs, the decision tree, KFold and CatBoost.
seed_no = 1000

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Pure-Python and NumPy global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: default, current CUDA device, all CUDA devices.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # Disable cuDNN autotuning and request deterministic algorithms.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

def merge_codes(df:pd.DataFrame, df_code:pd.DataFrame, col:str) -> pd.DataFrame:
    """Left-join a code lookup table onto ``df`` using ``col`` as the key.

    The first column of ``df_code`` is treated as the join key and is renamed
    to ``col``; every other column is renamed to ``f"{col}_{original_name}"``
    so that the same lookup table can be merged several times without name
    clashes.

    Args:
        df: Frame containing the key column ``col``.
        df_code: Lookup table whose first column holds the codes.
        col: Name of the key column in ``df``.

    Returns:
        A new frame with the prefixed lookup columns appended. Rows whose code
        is missing from ``df_code`` get NaN in those columns. Neither input is
        modified.
    """
    source_names = list(df_code.columns)
    renamed = [col] + [f"{col}_{name}" for name in source_names[1:]]

    # Assign a complete new label list instead of writing into
    # ``columns.values`` in place: an Index is meant to be immutable, and
    # mutating its backing array can desynchronise cached lookups or fail
    # outright when that array is read-only.
    df_code = df_code.copy()
    df_code.columns = renamed

    # pd.merge returns a new frame, so ``df`` itself needs no defensive copy.
    return pd.merge(df, df_code, how="left", on=col)


def preprocess_data(
    df:pd.DataFrame, is_train:bool=True,
    cols_merge:List[Tuple[str,pd.DataFrame]] = [],
    cols_equi:List[Tuple[str,str]]= [] ,
    cols_drop:List[str] = ["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]
    )->Tuple[pd.DataFrame,np.ndarray]:
    """Build the feature frame (and label array) from a raw data frame.

    Steps, in order: split off ``target`` (train only), merge the code tables,
    cast boolean columns to int, add one 0/1 equality column per pair in
    ``cols_equi`` and finally drop ``cols_drop``.

    Args:
        df: Raw frame; it is copied, not modified.
        is_train: When True, ``df`` must contain a ``target`` column, which is
            returned separately and removed from the features.
        cols_merge: ``(key_column, code_table)`` pairs passed to ``merge_codes``.
        cols_equi: ``(col1, col2)`` pairs; each adds a column named
            ``f"{col1}_{col2}"`` that is 1 where the two columns are equal.
        cols_drop: Columns removed at the very end.

    Returns:
        ``(features, labels)`` where ``labels`` is a NumPy array for training
        data and ``None`` when ``is_train`` is False.
    """
    features = df.copy()

    # Separate the label first so it never takes part in feature building.
    if is_train:
        labels = features["target"].to_numpy()
        features = features.drop(columns="target")
    else:
        labels = None

    # Attach the code-table columns for every requested key column.
    for key_col, code_table in cols_merge:
        features = merge_codes(features, code_table, key_col)

    # Boolean columns become 0/1 integers.
    bool_cols = features.select_dtypes(bool).columns.tolist()
    features[bool_cols] = features[bool_cols].astype(int)

    # One "same value?" indicator per column pair.
    for left, right in cols_equi:
        features[f"{left}_{right}"] = (features[left] == features[right]).astype(int)

    return (features.drop(columns=cols_drop), labels)

# (key column, code table) pairs used to merge the category-level attribute codes:
# each code table is joined once for every column that holds one of its codes.
cols_merge = [
    *[(f"person_prefer_d_{rank}", code_d) for rank in (1, 2, 3)],
    ("contents_attribute_d", code_d),
    *[(f"person_prefer_h_{rank}", code_h) for rank in (1, 2, 3)],
    ("contents_attribute_h", code_h),
    ("contents_attribute_l", code_l),
]

# Column pairs checked for "same code" between member (person) attributes and
# content attributes; each pair yields one 0/1 indicator column in preprocessing.
# NOTE(review): person_prefer_d_1 has no pair against the contents_attribute_d
# levels although person_prefer_h_1 does — confirm the omission is intentional.
cols_equi = [
    ("contents_attribute_c", "person_prefer_c"),
    ("contents_attribute_e", "person_prefer_e"),

    # D codes: 2nd and 3rd preference vs. the content's code at the s/m/l levels.
    *[
        (f"person_prefer_d_{rank}_attribute_d_{level}", f"contents_attribute_d_attribute_d_{level}")
        for rank in (2, 3)
        for level in ("s", "m", "l")
    ],

    # H codes: all three preferences vs. the content's code at the m/l levels.
    *[
        (f"person_prefer_h_{rank}_attribute_h_{level}", f"contents_attribute_h_attribute_h_{level}")
        for rank in (1, 2, 3)
        for level in ("m", "l")
    ],

    # ("person_prefer_h_1_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    # ("person_prefer_h_2_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    # ("person_prefer_h_3_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
]

def split_data(x_train, y_train, idx):
    """Split features and labels into training and validation parts.

    ``idx`` is interpreted as row *positions* for both the feature frame and
    the label array, so the two stay aligned whatever index ``x_train``
    carries.

    Args:
        x_train: Feature DataFrame.
        y_train: NumPy label array with one entry per row of ``x_train``.
        idx: Integer positions of the validation rows.

    Returns:
        ``(X_train, Y_train, X_val, Y_val)`` as NumPy arrays. Validation rows
        follow the order of ``idx``; training rows keep their original order.
    """
    # One positional mask drives every selection. The previous implementation
    # picked X_train by index *label* (``index.isin(idx)``) while everything
    # else was positional, which desynchronised X_train and Y_train on any
    # non-default index.
    is_val = np.zeros(len(x_train), dtype=bool)
    is_val[idx] = True

    X_val = x_train.iloc[idx].values
    Y_val = y_train[idx]

    X_train = x_train.iloc[~is_val].values
    Y_train = y_train[~is_val]

    print(X_train.shape, X_val.shape)
    print(Y_train.shape, Y_val.shape)
    return X_train, Y_train, X_val, Y_val

def get_thr(score, gt, n_iter=10000):
    """Grid-search the score cutoff that maximises F1.

    The cutoffs ``i / n_iter`` for ``i = 1 .. n_iter - 1`` are evaluated; a
    sample is predicted positive when its score is ``>=`` the cutoff.

    Args:
        score: 2-D array of per-class scores; column 1 is used as the
            positive-class score.
        gt: Ground-truth labels, passed to ``f1_score`` unchanged.
        n_iter: Grid resolution (number of steps between 0 and 1).

    Returns:
        ``(best_cutoff, best_f1)``. On ties the lowest cutoff wins.

    Raises:
        ValueError: If ``n_iter`` is below 2, i.e. the grid is empty.
    """
    if n_iter < 2:
        raise ValueError("n_iter must be at least 2 to evaluate any cutoff")

    pos_score = score[:, 1]  # hoisted: the slice does not depend on the cutoff

    best_cutoff, best_f1 = None, None
    for i in range(1, n_iter):
        cutoff = i / n_iter
        f1 = f1_score(gt, (pos_score >= cutoff) * 1)
        # Track the cutoff together with its score. The previous version
        # returned ``f1s.index(max) / n_iter``, but list position 0 belongs to
        # i == 1, so the reported cutoff was one grid step too low.
        if best_f1 is None or f1 > best_f1:
            best_cutoff, best_f1 = cutoff, f1
    return best_cutoff, best_f1

# class F1_Score(Metric):
#     def __init__(self):
#         self._name = "f1"
#         self._maximize = True

#     def __call__(self, y_true, y_score):
#         score = f1_score(y_true, (y_score[:, 1]>0.5)*1)
#         return score


def get_new_category(X, col, nonzero_importance):
    """Recode ``X[col]`` through a lookup of selected categories.

    Args:
        X: DataFrame containing column ``col``.
        col: Name of the column to recode.
        nonzero_importance: Lookup from category value to a 0-based id.

    Returns:
        Array shaped like ``X[col]``: values present in the lookup become
        ``id + 1``; every other value stays 0 (the shared "rest" bucket).
    """
    new_cat = np.zeros_like(X[col])
    for i, k in enumerate(X[col]):
        try:
            new_cat[i] = nonzero_importance[k] + 1
        except LookupError:
            # Only a missing key means "not selected". The former bare
            # ``except`` also hid real errors and KeyboardInterrupt.
            continue
    return new_cat
            
            
def merge_categories(x_train, x_test, y_train, col, max_depth=10):
    """Collapse a categorical column to the categories a decision tree uses.

    ``col`` is one-hot encoded on the training data and a depth-limited
    decision tree is fitted on it. Categories whose one-hot column receives a
    non-zero feature importance keep their own id (1, 2, ...); every other
    category, including values only seen in ``x_test``, is mapped to 0.

    Args:
        x_train: Training features.
        x_test: Test features, recoded with the mapping learned on train.
        y_train: Training labels used to fit the tree.
        col: Column to collapse.
        max_depth: Maximum depth of the decision tree.

    Returns:
        ``(x_train_copy, x_test_copy)`` with ``col`` recoded; the inputs are
        left untouched.
    """
    _x_train, _x_test = x_train.copy(), x_test.copy()

    enc = OneHotEncoder()
    onehot = enc.fit_transform(_x_train[col].values.reshape(-1, 1)).toarray()

    dt = DecisionTreeClassifier(random_state=seed_no, max_depth=max_depth)
    dt = dt.fit(onehot, y_train)

    # Positions of the one-hot columns the tree assigned importance to.
    selected_pos = np.where(dt.feature_importances_ != 0)[0]
    print(selected_pos)

    # Translate positions back to raw category values. The lookup is later
    # queried with the raw values of ``col``; keying it by one-hot column
    # position (as before) only worked if the categories happened to be
    # exactly 0..n-1.
    selected_values = enc.categories_[0][selected_pos].tolist()
    nonzero_importance = dict(zip(selected_values, range(len(selected_values))))

    _x_train[col] = get_new_category(_x_train, col, nonzero_importance)
    _x_test[col] = get_new_category(_x_test, col, nonzero_importance)

    return _x_train, _x_test

# Fix all global RNG seeds before any data processing or training runs.
seed_everything(seed_no)

In [None]:
# Columns that are not needed for training.
cols_drop = [
    "id",
    "person_prefer_f", #only one value
    "person_prefer_g", #only one value
    "contents_open_dt",
    "person_rn",
    "contents_rn"]

# Drop `contents_open_mm` if it is present. Checking for the column keeps the
# cell re-runnable without a bare `except`, which would also hide unrelated
# errors such as a missing `train_df`.
if "contents_open_mm" in train_df.columns:
    train_df = train_df.drop(columns=['contents_open_mm']) #mosaic plot
else:
    print("already removed col: contents_open_mm")

# Build the model inputs: same merges, equality flags and drops for train and test.
x_train, y_train = preprocess_data(train_df, cols_merge = cols_merge , cols_equi= cols_equi , cols_drop = cols_drop)
x_test, _ = preprocess_data(test_df,is_train = False, cols_merge = cols_merge , cols_equi= cols_equi  , cols_drop = cols_drop)
x_train.shape , y_train.shape , x_test.shape

In [None]:
# Whether to collapse high-cardinality columns with merge_categories, and the
# cardinality above which that happens (also reused as the tree depth).
merge_cat = False
cutoff = 10

cat_idxs = []   # positions of the categorical columns in x_train
cat_dims = []   # number of codes per categorical column (+1 for unseen values)
ordinal_col = ['person_attribute_a_1', 'person_attribute_b', 'person_prefer_e', 'contents_attribute_e']
for idx, col in enumerate(x_train.columns):
    if col in ordinal_col:
        # Scale ordinal columns by the TRAIN maximum for both splits. Dividing
        # the test column by its own maximum (as before) maps the same raw
        # value to different scaled values whenever the two maxima differ.
        col_max = x_train[col].max()
        x_train[col] = x_train[col] / col_max
        x_test[col] = x_test[col] / col_max
        continue

    # Every other column is treated as categorical.
    n_cat = len(x_train[col].unique())
    if n_cat > cutoff and merge_cat:
        x_train, x_test = merge_categories(x_train, x_test, y_train, col, max_depth=cutoff)

    # Label-encode with classes learned on train only.
    le = LabelEncoder()
    le.fit(x_train[col].values)
    le_dict = dict(zip(le.classes_, le.transform(le.classes_)))

    # Values never seen in train all share one extra code.
    unseen_code = len(le_dict)
    x_train[col] = x_train[col].apply(lambda x: le_dict.get(x, unseen_code))
    x_test[col] = x_test[col].apply(lambda x: le_dict.get(x, unseen_code))

    cat_idxs.append(idx)
    cat_dims.append(len(le_dict)+1)

In [None]:
# Shapes after encoding, then summary statistics of the scaled ordinal columns.
print(x_train.shape , y_train.shape , x_test.shape)
x_train[ordinal_col].describe()

# CatBoost

In [None]:
# Names of the label-encoded columns, passed to CatBoost as categorical features.
cat_features = x_train.columns[cat_idxs].tolist()

In [None]:
# Number of cross-validation folds.
n_fold=5
# Maximum number of boosting iterations per fold.
n_round = 2000
# Passed to CatBoost as `one_hot_max_size`.
one_hot_max_size = 5
# Tree depth.
depth = 5
# Early-stopping patience, in rounds.
patience = 300

In [None]:
# K-fold cross-validation: train one CatBoost model per fold, tune the decision
# threshold on that fold's validation part and store the fold's test predictions
# as extra columns of submission_df.
kf = KFold(n_splits=n_fold, random_state=seed_no, shuffle=True)

# CV_f1 collects each model's best-score dict; CV_info the tuned threshold and F1.
CV_f1, CV_info = [], []
fold_no = 1
for train_index, test_index in kf.split(x_train.values):
    # Positional split of features and labels for this fold.
    train_X, val_X = x_train.iloc[train_index], x_train.iloc[test_index]
    train_Y, val_Y = y_train[train_index], y_train[test_index]
    
    # NOTE(review): task_type="GPU" is hard-coded — presumably this needs a
    # GPU-enabled CatBoost installation; confirm before running elsewhere.
    model = CatBoostClassifier(
        iterations=n_round, 
        random_state=seed_no, 
        task_type="GPU",
        eval_metric="F1", 
        cat_features=cat_features, 
        depth=depth,
        one_hot_max_size=one_hot_max_size)
    
    # Train with early stopping monitored on the validation part.
    model.fit(train_X, train_Y, 
            eval_set=[(val_X, val_Y)], 
            early_stopping_rounds=patience ,
            verbose = 100
        )
    
    CV_f1.append(model.get_best_score())
    
    # Tune the cutoff on the validation predictions; best_thr is (threshold, f1).
    val_preds = model.predict_proba(val_X)
    best_thr = get_thr(val_preds, val_Y)
    print(f'best threshold and f1 score from fold {fold_no}-th validation set : {best_thr}')
    
    # Store this fold's test probabilities (column 1) and the thresholded 0/1 vote.
    test_preds = model.predict_proba(x_test)
    submission_df[f'target_prob_cb_{fold_no}'] = test_preds[:,1]
    test_preds = (test_preds[:,1] >= best_thr[0]) * 1
    submission_df[f'target_cb_{fold_no}'] = test_preds
    
    CV_info.append({'fold':fold_no, 'val_threshold':best_thr[0], 'val_f1':best_thr[1]})
    # ckpt_path = os.path.join(save_dir, f"fold_{fold_no}")
    # final_Tabnet.save_model(ckpt_path)
    fold_no +=1
    


In [None]:
# Mean of the per-fold best validation F1 values recorded during training.
avg = np.mean([c["validation"]["F1"] for c in CV_f1])
print(CV_f1)
print(avg)

In [None]:
# Preview the submission frame, now carrying the per-fold prediction columns.
submission_df.head()

In [None]:
# Number of folds that voted 1 for each test row.
rowsum = submission_df[[f"target_cb_{i}" for i in range(1, n_fold+1)]].sum(axis=1)
# Count rows with an exact tie (only possible when n_fold is even).
sum(rowsum==n_fold/2)

In [None]:
# Majority vote across folds: a row is labelled 1 only when strictly more than
# half of the fold models voted 1.
# Note: this rebinds `cutoff`, which the category-encoding cell also sets (to 10).
cutoff = n_fold/2
# cutoff = 3

binary_result = [1 if votes > cutoff else 0 for votes in rowsum]

In [None]:
# Peek at the first ten ensembled labels.
print(binary_result[:10])

In [None]:
# Attach the ensembled labels and keep only the two columns that are exported.
submission_df["target"] = binary_result
res_df = submission_df[["id", "target"]]
res_df.describe()

In [None]:
# Write the final submission. The results directory is created first so that a
# missing folder cannot fail the export after training has already finished.
os.makedirs(os.path.join(data_path, "results"), exist_ok=True)
res_df.to_csv(f'{data_path}/results/cb8.csv', index=False)