# 구글 드라이브

In [1]:
# Mount Google Drive at /content/drive; the data and submission paths used by
# this notebook live under that mount point.
from google import colab
colab.drive.mount("/content/drive")

Mounted at /content/drive


# 전역변수

In [2]:
# Notebook-wide configuration.
SEED = 42  # shared random seed (CV splitter and final model)
DATA_PATH = "/content/drive/MyDrive/recommend/job_care/"  # input CSVs are read from here
SUBMIT_PATH = "/content/drive/MyDrive/recommend/job_care/"  # submission CSV is written here

# catboost install

In [3]:
!pip install catboost
!pip install optuna

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 1.1 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3
Collecting optuna
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 14.1 MB/s 
Collecting cliff
  Downloading cliff-3.10.0-py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 10.4 MB/s 
[?25hCollecting alembic
  Downloading alembic-1.7.5-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 85.3 MB/s 
[?25hCollecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting Mako
  Downloading Mako-1.1.6-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 5.1 MB/s 
[?25hCollecting autopage>=0.4.0
  Downloading autopage-0.4.0-py3-none-any.whl (

# 라이브러리

In [4]:
import math
import os
import platform
import random
import sys
from typing import Dict, List, Optional, Tuple

import numpy as np
import optuna
import pandas as pd
import sklearn
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

# Report the runtime environment so results can be tied to exact versions.
environment = (
    ("os", platform.platform()),
    ("python", sys.version),
    ("pandas", pd.__version__),
    ("numpy", np.__version__),
    ("sklearn", sklearn.__version__),
)
for label, version in environment:
    print(f"- {label}: {version}")

- os: Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic
- python: 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
- pandas: 1.1.5
- numpy: 1.19.5
- sklearn: 1.0.1


# 데이터 불러오기

In [5]:
# Training rows and test rows.
train_data = pd.read_csv(f'{DATA_PATH}train.csv')
test_data = pd.read_csv(f'{DATA_PATH}test.csv')

# Attribute code lookup tables (D, H, L).
code_d = pd.read_csv(f'{DATA_PATH}속성_D_코드_new.csv')
code_d = code_d.iloc[:, :-1]  # the last column of the D table is discarded
code_h = pd.read_csv(f'{DATA_PATH}속성_H_코드_new.csv')
code_l = pd.read_csv(f'{DATA_PATH}속성_L_코드_new.csv')

(train_data.shape, test_data.shape)

((501951, 35), (46404, 34))

# 속성 코드 데이터 컬럼명 변경


In [6]:
# Give each code table a uniform set of column names; the first column of each
# table is the one merge_codes later uses as the join key.
for code_table, column_names in (
    (code_d, ["attribute_d", "attribute_d_d", "attribute_d_s", "attribute_d_m"]),
    (code_h, ["attribute_h", "attribute_h_m", "attribute_h_p"]),
    (code_l, ["attribute_l", "attribute_l_d", "attribute_l_s", "attribute_l_m", "attribute_l_l"]),
):
    code_table.columns = column_names

# 속성코드 데이터 merge 함수

In [7]:
def merge_codes(df:pd.DataFrame,df_code:pd.DataFrame,col:str)->pd.DataFrame:
    """Left-join a code lookup table onto ``df`` using ``col`` as the key.

    The first column of ``df_code`` is treated as the join key and is renamed
    to ``col``; every other column is renamed to ``f"{col}_{original_name}"``
    so that the same lookup table can be merged several times without name
    clashes.

    Args:
        df: frame that contains the key column ``col``.
        df_code: lookup table whose first column holds the key values.
        col: name of the key column in ``df``.

    Returns:
        A new frame with the renamed lookup columns appended. Neither input
        is modified.
    """
    original_names = list(df_code.columns)
    # Assign a complete new label list instead of writing into
    # ``columns.values`` in place: mutating the Index's underlying array
    # depends on pandas cache internals and fails where it is read-only.
    df_code = df_code.copy()
    df_code.columns = [col] + [f"{col}_{name}" for name in original_names[1:]]
    # pd.merge returns a new frame, so ``df`` does not need a defensive copy.
    return pd.merge(df,df_code,how="left",on=col)

# 데이터 전처리 함수

In [8]:
def preprocess_data(
                    df:pd.DataFrame,is_train:bool = True, cols_merge:Optional[List[Tuple[str,pd.DataFrame]]] = None  , cols_equi:Optional[List[Tuple[str,str]]]= None ,
                    cols_drop:Optional[List[str]] = None
                    )->Tuple[pd.DataFrame,Optional[np.ndarray]]:
    """Build the model input frame (and the target array when training).

    Steps, in order: split off ``target`` (training only), merge the code
    lookup tables, cast boolean columns to int, add equality-flag columns,
    and finally drop the unwanted columns.

    Args:
        df: raw input frame; it is copied, never modified.
        is_train: when True, ``df`` must contain a ``target`` column, which is
            returned separately and removed from the features.
        cols_merge: ``(key column, lookup table)`` pairs passed to
            ``merge_codes``. Defaults to no merges.
        cols_equi: ``(col1, col2)`` pairs; for each pair a column named
            ``f"{col1}_{col2}"`` is added holding 1 where the two columns are
            equal and 0 otherwise. Defaults to no pairs.
        cols_drop: columns removed at the very end. Defaults to
            ``["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]``.

    Returns:
        ``(features, target)`` where ``target`` is a NumPy array when
        ``is_train`` is True and ``None`` otherwise.

    Raises:
        KeyError: if ``target`` (training), a merge/equality column, or a
            column listed in ``cols_drop`` is missing.
    """
    # None sentinels replace the former mutable list defaults, which were
    # shared between calls.
    if cols_merge is None:
        cols_merge = []
    if cols_equi is None:
        cols_equi = []
    if cols_drop is None:
        cols_drop = ["id","person_prefer_f","person_prefer_g" ,"contents_open_dt"]

    df = df.copy()

    y_data = None
    if is_train:
        y_data = df["target"].to_numpy()
        df = df.drop(columns="target")

    for col, df_code in cols_merge:
        df = merge_codes(df,df_code,col)

    bool_cols = df.select_dtypes(bool).columns.tolist()
    if bool_cols:
        df[bool_cols] = df[bool_cols].astype(int)

    # Equality flags are built before the drop so that they may reference
    # columns that are about to be removed.
    for col1, col2 in cols_equi:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2] ).astype(int)

    df = df.drop(columns=cols_drop)
    return (df , y_data)

# 전처리 컬럼명 정의

In [9]:
# (column name, code DataFrame) pairs: each listed column is merged with the
# attribute-code table that describes it (small / medium / large category codes).
cols_merge = [
              ("person_prefer_d_1" , code_d),
              ("person_prefer_d_2" , code_d),
              ("person_prefer_d_3" , code_d),
              ("contents_attribute_d" , code_d),
              ("person_prefer_h_1" , code_h),
              ("person_prefer_h_2" , code_h),
              ("person_prefer_h_3" , code_h),
              ("contents_attribute_h" , code_h),
              ("contents_attribute_l" , code_l),
]

# Column-name pairs for which a flag column is created, marking whether the
# person-side value and the contents-side value hold the same code.
cols_equi = [

    ("contents_attribute_c","person_prefer_c"),
    ("contents_attribute_e","person_prefer_e"),

    ("person_prefer_d_2_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_2_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
    # ("person_prefer_d_2_attribute_d_l" , "contents_attribute_d_attribute_d_l"),
    ("person_prefer_d_3_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_3_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
    # ("person_prefer_d_3_attribute_d_l" , "contents_attribute_d_attribute_d_l"),

    ("person_prefer_h_1_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_2_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_3_attribute_h_p" , "contents_attribute_h_attribute_h_p"),

]

# Columns that are not used for training.
# Previous, shorter drop list:
# cols_drop = ["id","person_prefer_f","person_prefer_g" ,"contents_open_dt", "contents_rn"]
cols_drop = ['id', 'contents_open_dt', 'person_prefer_f', 'person_prefer_g', 'contents_rn', 
           'd_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn','h_m_match_yn', 
           'h_s_match_yn', 'person_attribute_a', 'person_attribute_a_1', 'contents_attribute_i',
            'contents_attribute_a', 'contents_attribute_k']

# 학습 및 추론셋 전처리 !!

In [10]:
# Apply the same preprocessing settings to the training and the inference set.
preprocess_options = dict(cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)
X, y = preprocess_data(train_data, **preprocess_options)
X_test, _ = preprocess_data(test_data, is_train=False, **preprocess_options)
(X.shape, y.shape, X_test.shape)

KeyError: ignored

# 범주형 컬럼 리스트(catboost 파라미터에 넣을 용도)

In [None]:
# Every column with more than two distinct values is treated as categorical.
distinct_counts = X.nunique()
cat_features = distinct_counts[distinct_counts > 2].index.tolist()

In [None]:
# Display the categorical columns actually derived from X. The hard-coded list
# that used to be here was a pasted snapshot that still named columns now
# listed in cols_drop.
cat_features

# 학습 파라미터

In [None]:
# Training-loop settings.
is_holdout = False  # True -> the fold loop stops after the first fold
n_splits, iterations, patience = 5, 3000, 50  # patience = early-stopping rounds

# Shuffled K-fold splitter, seeded so the folds are the same on every call.
cv = KFold(n_splits, shuffle=True, random_state=SEED)

# 학습 시작!!

bootstrap_type 후보: ['Bayesian', 'Bernoulli', 'MVS']  
objective 후보: ['Logloss', 'CrossEntropy']

In [None]:
# OPTUNA_OPTIMIZATION = True

# def objective(trial):
#     cat_features = X.columns[X.nunique() > 2].tolist()
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)
    
#     params = {
#         'iterations':trial.suggest_int("iterations", 1000, 20000),
#         'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
#         'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
#         'od_wait':trial.suggest_int('od_wait', 500, 2000),
#         'learning_rate' : trial.suggest_uniform('learning_rate',0.02,1),
#         'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
#         'random_strength': trial.suggest_uniform('random_strength',10,50),
#         'depth': trial.suggest_int('depth',1,15),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
#         'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
#         'verbose': False,
#         'task_type' : 'GPU',
#         'devices' : '0',
#         'cat_features':cat_features,
#         "one_hot_max_size":trial.suggest_int('one_hot_max_size',1,15),
#         "eval_metric":"F1",
#     }
    
#     if params['bootstrap_type'] == 'Bayesian':
#         params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
#     elif params['bootstrap_type'] == 'Bernoulli':
#         params['subsample'] = trial.suggest_float('subsample', 0.1, 1)
    
#     model = CatBoostClassifier(**params)
#     model.fit(
#         X_train, y_train,
#         eval_set=[(X_test,y_test)],
#         early_stopping_rounds=100,
#         use_best_model=True
#     )
    
#     # validation prediction
#     pred = model.predict(X_test)
#     score = f1_score(y_test, pred)
    
#     return score

In [None]:
OPTUNA_OPTIMIZATION = True

def objective(trial):
    """Optuna objective: fit one CatBoost model and return its validation F1.

    Reads the module-level ``X`` and ``y``. The data is split 80/20 with a
    fixed ``random_state`` so that every trial is scored on the same rows.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        ``f1_score`` of the fitted model's predictions on the validation split.
    """
    # Named *_valid so the module-level X_test (the inference set) is not shadowed.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)

    # Columns handed to CatBoost as categorical. 'person_attribute_a_1',
    # 'contents_attribute_i' and 'contents_attribute_a' are deliberately absent.
    # (The per-trial ``X.nunique()`` scan was removed: its result was never used.)
    categorical_columns = [
        'person_prefer_c',
        'person_prefer_d_1',
        'person_prefer_d_2',
        'person_prefer_d_3',
        'person_prefer_h_1',
        'person_prefer_h_2',
        'person_prefer_h_3',
        'contents_attribute_j_1',
        'contents_attribute_c',
        'contents_attribute_l',
        'contents_attribute_d',
        'contents_attribute_m',
        'contents_attribute_h',
        'person_rn',
        'person_prefer_d_1_attribute_d_d',
        'person_prefer_d_1_attribute_d_s',
        'person_prefer_d_1_attribute_d_m',
        'person_prefer_d_2_attribute_d_d',
        'person_prefer_d_2_attribute_d_s',
        'person_prefer_d_2_attribute_d_m',
        'person_prefer_d_3_attribute_d_d',
        'person_prefer_d_3_attribute_d_s',
        'person_prefer_d_3_attribute_d_m',
        'contents_attribute_d_attribute_d_d',
        'contents_attribute_d_attribute_d_s',
        'contents_attribute_d_attribute_d_m',
        'person_prefer_h_1_attribute_h_m',
        'person_prefer_h_1_attribute_h_p',
        'person_prefer_h_2_attribute_h_m',
        'person_prefer_h_2_attribute_h_p',
        'person_prefer_h_3_attribute_h_m',
        'person_prefer_h_3_attribute_h_p',
        'contents_attribute_h_attribute_h_m',
        'contents_attribute_h_attribute_h_p',
        'contents_attribute_l_attribute_l_d',
        'contents_attribute_l_attribute_l_s',
        'contents_attribute_l_attribute_l_m',
        'contents_attribute_l_attribute_l_l',
    ]

    params = {
        'iterations': trial.suggest_int("iterations", 1000, 20000),
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        # NOTE(review): fit() below also passes early_stopping_rounds, which
        # presumably takes precedence over od_wait - confirm against the
        # CatBoost docs before keeping this search dimension.
        'od_wait': trial.suggest_int('od_wait', 500, 2000),
        # suggest_float replaces the deprecated suggest_uniform (same sampling).
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        'verbose': False,
        'task_type': 'GPU',
        'devices': '0',
        'cat_features': categorical_columns,
        "one_hot_max_size": trial.suggest_int('one_hot_max_size', 1, 15),
        "eval_metric": "F1",
    }

    # These two parameters are only sampled for the bootstrap type they are paired with.
    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif params['bootstrap_type'] == 'Bernoulli':
        params['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    model = CatBoostClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100,
        use_best_model=True
    )

    # validation prediction
    pred = model.predict(X_valid)
    score = f1_score(y_valid, pred)

    return score

In [None]:
# Maximise the validation F1 returned by `objective`.
# TPESampler is Optuna's default sampler; seeding it makes the sequence of
# suggested hyper-parameters reproducible across runs.
study = optuna.create_study(
    direction='maximize',
    study_name='CatbClf',
    sampler=optuna.samplers.TPESampler(seed=SEED)
)

study.optimize(
    objective, 
    n_trials=100
)

In [None]:
# Start from the best trial's sampled values and add the fixed settings.
params = dict(study.best_params)
params.update(
    random_state=SEED,
    task_type='GPU',
    eval_metric='F1',
    cat_features=cat_features,
)

'iterations': 16028, 'objective': 'Logloss', 'bootstrap_type': 'Bernoulli', 'od_wait': 1458, 'learning_rate': 0.09515437950898221, 'reg_lambda': 77.72457345981381, 'random_strength': 46.25827529989489, 'depth': 8, 'min_data_in_leaf': 25, 'leaf_estimation_iterations': 5, 'one_hot_max_size': 5, 'subsample': 0.35912751913579133

In [None]:
# Train one CatBoost model per CV fold (only the first fold when is_holdout is True).
scores = []
models = []

for tri, vai in cv.split(X):
    print("="*50)

    model = CatBoostClassifier(**params)
    model.fit(X.iloc[tri], y[tri], 
            eval_set=[(X.iloc[vai], y[vai])], 
            early_stopping_rounds=patience ,
            verbose = 100
        )
    
    models.append(model)
    # Best F1 reached on this fold's eval_set during training.
    scores.append(model.get_best_score()["validation"]["F1"])
    if is_holdout:
        break    

0:	learn: 0.5322536	test: 0.5342194	best: 0.5342194 (0)	total: 19.3ms	remaining: 4m 26s
100:	learn: 0.6501845	test: 0.6618369	best: 0.6618369 (100)	total: 7.84s	remaining: 17m 43s
200:	learn: 0.6598628	test: 0.6730590	best: 0.6736492 (197)	total: 15.8s	remaining: 17m 47s
300:	learn: 0.6701502	test: 0.6836340	best: 0.6841236 (287)	total: 23.1s	remaining: 17m 13s
bestTest = 0.6851184297
bestIteration = 341
Shrink model to first 342 iterations.
0:	learn: 0.5325589	test: 0.5330005	best: 0.5330005 (0)	total: 21.1ms	remaining: 4m 51s
100:	learn: 0.6495489	test: 0.6623033	best: 0.6623346 (98)	total: 7.85s	remaining: 17m 45s
200:	learn: 0.6590462	test: 0.6725291	best: 0.6729162 (197)	total: 15.8s	remaining: 17m 49s
300:	learn: 0.6704597	test: 0.6852758	best: 0.6855055 (299)	total: 23.2s	remaining: 17m 19s
400:	learn: 0.6761508	test: 0.6856061	best: 0.6863823 (368)	total: 30.2s	remaining: 16m 49s
bestTest = 0.6863822767
bestIteration = 368
Shrink model to first 369 iterations.
0:	learn: 0.53264

0.6794092425  
0.6779555547  
0.6831937276  
0.6848302889  
0.6874167808

# CV 결과 확인

In [None]:
# Per-fold scores first, then their mean on the next line.
print(scores, np.mean(scores), sep="\n")

[0.6851184297442673, 0.6863822766910205, 0.6827243153275837, 0.6787387711443257, 0.6778252947784795]
0.6821578175371353


# threshold 정의

In [None]:
# Probability cut-off: positive-class probabilities >= this value are labelled 1.
threshold = 0.38

# threshold값 변경에 따른 검증점수 확인 및 추론

In [None]:
# Re-score each fold at the custom threshold and collect test-set probabilities.
pred_list = []
scores = []
# Pair every trained model with its own validation fold. zip() stops at the
# shorter sequence, so a hold-out run (a single trained model) no longer
# indexes past the end of `models`. cv is seeded, so split() yields the same
# folds that were used for training.
for model, (tri, vai) in zip(models, cv.split(X)):
    valid_proba = model.predict_proba(X.iloc[vai])[:, 1]
    valid_pred = np.where(valid_proba >= threshold , 1, 0)
    scores.append(f1_score(y[vai], valid_pred))
    # Keep the raw test probabilities; they are thresholded after averaging.
    pred_list.append(model.predict_proba(X_test)[:, 1])
print(scores)
print(np.mean(scores))

[0.7124865917126854, 0.7133160263891204, 0.7113950672359952, 0.7093115482233502, 0.7070014799104141]
0.7107021426943131


[0.7125135983345463, 0.7121587700031377, 0.7122172087977738, 0.7114860031287586, 0.7109675069096323]
0.7118686174347697

0.712636134378356

# 산술평균 앙상블!!

In [None]:
# Average the per-model probabilities, then binarise with the threshold.
mean_proba = np.mean(pred_list, axis=0)
pred = (mean_proba >= threshold).astype(int)

# 제출파일!!

In [None]:
# Fill the provided submission template with the ensembled labels.
sample_submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv').assign(target=pred)
sample_submission

# 저장

In [None]:
# Write the submission file without the index column.
sample_submission.to_csv(f"{SUBMIT_PATH}catboost_optuna.csv", index=False)