In [1]:
from typing import List, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()


def preprocess_data(
    df: pd.DataFrame,
    cols_merge: List[Tuple[str, pd.DataFrame]],
    cols_equi: List[Tuple[str, str]],
    cols_drop: List[str],
    is_train: bool = True,
) -> Tuple[pd.DataFrame, np.ndarray]:
    df = df.copy()

    y_data = None
    if is_train:
        y_data = df["target"]
        df = df.drop(columns="target")

    for col, df_code in cols_merge:
        df = merge_codes(df, df_code, col)

    cols = df.select_dtypes(bool).columns.tolist()
    df[cols] = df[cols].astype(int)

    for col1, col2 in cols_equi:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2]).astype(np.int8)

    df = df.drop(columns=cols_drop)
    return df, y_data


def merge_codes(df: pd.DataFrame, df_code: pd.DataFrame, col: str) -> pd.DataFrame:
    df = df.copy()
    df_code = df_code.copy()
    df_code = df_code.add_prefix(f"{col}_")
    df_code.columns.values[0] = col
    return pd.merge(df, df_code, how="left", on=col)


def load_dataset(path: str) -> Tuple[pd.DataFrame]:
    train = pd.read_csv(path + "train.csv")
    test = pd.read_csv(path + "test.csv")
    code_d = pd.read_csv(path + "속성_D_코드.csv")
    code_h = pd.read_csv(path + "속성_H_코드.csv")
    code_l = pd.read_csv(path + "속성_L_코드.csv")

    code_d.columns = [
        "attribute_d",
        "attribute_d_d",
        "attribute_d_s",
        "attribute_d_m",
        "attribute_d_l",
    ]
    code_h.columns = ["attribute_h", "attribute_h_l", "attribute_h_m"]
    code_l.columns = [
        "attribute_l",
        "attribute_l_d",
        "attribute_l_s",
        "attribute_l_m",
        "attribute_l_l",
    ]
    # 소분류 중분류 대분류 속성코드 merge 컬럼명 및 데이터 프레임 리스트
    cols_merge = [
        ("person_prefer_d_1", code_d),
        ("person_prefer_d_2", code_d),
        ("person_prefer_d_3", code_d),
        ("contents_attribute_d", code_d),
        ("person_prefer_h_1", code_h),
        ("person_prefer_h_2", code_h),
        ("person_prefer_h_3", code_h),
        ("contents_attribute_h", code_h),
        ("contents_attribute_l", code_l),
    ]

    # 회원 속성과 콘텐츠 속성의 동일한 코드 여부에 대한 컬럼명 리스트
    cols_equi = [
        ("contents_attribute_c", "person_prefer_c"),
        ("contents_attribute_e", "person_prefer_e"),
        ("person_prefer_d_2_attribute_d_s", "contents_attribute_d_attribute_d_s"),
        ("person_prefer_d_2_attribute_d_m", "contents_attribute_d_attribute_d_m"),
        ("person_prefer_d_2_attribute_d_l", "contents_attribute_d_attribute_d_l"),
        ("person_prefer_d_3_attribute_d_s", "contents_attribute_d_attribute_d_s"),
        ("person_prefer_d_3_attribute_d_m", "contents_attribute_d_attribute_d_m"),
        ("person_prefer_d_3_attribute_d_l", "contents_attribute_d_attribute_d_l"),
        ("person_prefer_h_1_attribute_h_m", "contents_attribute_h_attribute_h_m"),
        ("person_prefer_h_2_attribute_h_m", "contents_attribute_h_attribute_h_m"),
        ("person_prefer_h_3_attribute_h_m", "contents_attribute_h_attribute_h_m"),
        ("person_prefer_h_1_attribute_h_l", "contents_attribute_h_attribute_h_l"),
        ("person_prefer_h_2_attribute_h_l", "contents_attribute_h_attribute_h_l"),
        ("person_prefer_h_3_attribute_h_l", "contents_attribute_h_attribute_h_l"),
    ]

    # 학습에 필요없는 컬럼 리스트
    cols_drop = [
        "id",
        "person_prefer_f",
        "person_prefer_g",
        "contents_open_dt",
        "contents_rn",
    ]

    train, target = preprocess_data(
        train, cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop
    )
    test, _ = preprocess_data(
        test,
        cols_merge=cols_merge,
        cols_equi=cols_equi,
        cols_drop=cols_drop,
        is_train=False,
    )

    return train, test, target


In [2]:
train, test, target = load_dataset("../input/jobcare-recommendation/")
train.head()

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,person_prefer_d_2_attribute_d_l_contents_attribute_d_attribute_d_l,person_prefer_d_3_attribute_d_s_contents_attribute_d_attribute_d_s,person_prefer_d_3_attribute_d_m_contents_attribute_d_attribute_d_m,person_prefer_d_3_attribute_d_l_contents_attribute_d_attribute_d_l,person_prefer_h_1_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_2_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_3_attribute_h_m_contents_attribute_h_attribute_h_m,person_prefer_h_1_attribute_h_l_contents_attribute_h_attribute_h_l,person_prefer_h_2_attribute_h_l_contents_attribute_h_attribute_h_l,person_prefer_h_3_attribute_h_l_contents_attribute_h_attribute_h_l
0,1,1,1,0,0,0,1,4,3,5,...,1,0,0,1,0,1,0,0,0,0
1,0,0,0,1,1,0,1,3,4,1,...,0,0,0,0,1,1,1,1,0,0
2,0,0,0,1,0,0,2,0,3,5,...,1,0,0,0,1,0,1,0,0,0
3,0,0,0,1,0,0,2,0,2,5,...,0,0,0,0,1,0,0,0,0,0
4,1,1,1,0,0,0,1,3,4,5,...,1,0,0,1,0,0,0,0,0,0


In [8]:
from typing import Dict, List, Tuple, Union

from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from lightgbm import early_stopping, log_evaluation


def f1_eval(
    y_true: np.ndarray, y_pred: np.ndarray, thershold: float = 0.4
) -> Tuple[Union[str, float, bool]]:
    y_labels = (y_pred > thershold).astype(np.int8)
    return "f1", f1_score(y_labels, y_true), True


def stratified_kfold_lgbm(
    params: Dict[str, Union[int, float, str]],
    fold: int,
    X: pd.DataFrame,
    y: pd.DataFrame,
    X_test: pd.DataFrame,
    threshold: float = 0.38,
    verbose: Union[int, bool] = False,
) -> np.ndarray:
    folds = StratifiedKFold(n_splits=fold, shuffle=True, random_state=42)
    splits = folds.split(X, y)
    lgb_oof = np.zeros(X.shape[0])
    lgb_preds = np.zeros(X_test.shape[0])

    for fold, (train_idx, valid_idx) in enumerate(splits, 1):
        if verbose:
            print(f"\tFold {fold}\n")
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = LGBMClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            callbacks=[early_stopping(100), log_evaluation(verbose)],
            eval_metric=lambda y_true, y_pred: f1_eval(y_true, y_pred, threshold),
        )
        lgb_oof[valid_idx] = model.predict_proba(X_valid)[:, 1]
        lgb_preds += model.predict_proba(X_test)[:, 1] / fold

    f1 = f1_score(y, lgb_oof > threshold)
    print(f"F1 Score: {f1_score:.5f}")

    return lgb_preds

In [9]:
params = {
    "n_estimators": 10000,
    "boosting_type": "gbdt",
    "objective": "binary",
    "random_state": 42,
    "learning_rate": 0.05,
    "num_leaves": 5,
    "max_bin": 55,
    "subsample": 0.8,
    "min_child_weight": 11,
}

lgbm_preds = stratified_kfold_lgbm(params, 5, train, target, test, verbose=100)

	Fold 1

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.663506	training's f1: 0.679336	valid_1's binary_logloss: 0.663225	valid_1's f1: 0.679677
[200]	training's binary_logloss: 0.658522	training's f1: 0.681762	valid_1's binary_logloss: 0.658084	valid_1's f1: 0.682096
[300]	training's binary_logloss: 0.655844	training's f1: 0.682751	valid_1's binary_logloss: 0.655444	valid_1's f1: 0.682738
[400]	training's binary_logloss: 0.653957	training's f1: 0.683625	valid_1's binary_logloss: 0.653687	valid_1's f1: 0.683524
[500]	training's binary_logloss: 0.652434	training's f1: 0.684512	valid_1's binary_logloss: 0.652283	valid_1's f1: 0.684014
[600]	training's binary_logloss: 0.651188	training's f1: 0.685228	valid_1's binary_logloss: 0.651147	valid_1's f1: 0.68462
[700]	training's binary_logloss: 0.650127	training's f1: 0.68578	valid_1's binary_logloss: 0.65025	valid_1's f1: 0.685296
[800]	training's binary_logloss: 0.649269	training's f1: 0.68641

[1800]	training's binary_logloss: 0.642412	training's f1: 0.690294	valid_1's binary_logloss: 0.647749	valid_1's f1: 0.687452
[1900]	training's binary_logloss: 0.641967	training's f1: 0.690551	valid_1's binary_logloss: 0.647538	valid_1's f1: 0.687609
[2000]	training's binary_logloss: 0.64153	training's f1: 0.690828	valid_1's binary_logloss: 0.647325	valid_1's f1: 0.687831
[2100]	training's binary_logloss: 0.641111	training's f1: 0.690982	valid_1's binary_logloss: 0.647114	valid_1's f1: 0.687841
Early stopping, best iteration is:
[2009]	training's binary_logloss: 0.641497	training's f1: 0.690829	valid_1's binary_logloss: 0.647308	valid_1's f1: 0.687893
	Fold 5

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.663642	training's f1: 0.679651	valid_1's binary_logloss: 0.662718	valid_1's f1: 0.679047
[200]	training's binary_logloss: 0.658609	training's f1: 0.682066	valid_1's binary_logloss: 0.657762	valid_1's f1: 0.681502
[300]	training's binar

UnboundLocalError: local variable 'f1_score' referenced before assignment

In [None]:
submission = pd.read_csv("../input/jobcare-recommendation/sample_submission.csv")
submission["target"] = lgbm_preds
submission.to_csv("first_submission.csv", index=False)