# XGBoost 분류 문제 예시

XGBoost는 보통 `import xgboost as xgb`처럼 `xgb`라는 이름으로 불러와서 사용합니다.

In [None]:
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import requests

import pandas as pd
import seaborn as sns
import sklearn.feature_selection
import sklearn.metrics
import sklearn.model_selection
import xgboost as xgb
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from xgboost import XGBClassifier

from sklearn.experimental import enable_hist_gradient_boosting  # isort:skip
from sklearn.ensemble import RandomForestClassifier  # isort:skip

# Notebook display setup: draw figures inline, using the png / retina formats.
%matplotlib inline
%config InlineBackend.figure_formats = {"png", "retina"}
# Global matplotlib defaults: 150 dpi figures with constrained layout turned on.
mpl.rcParams["figure.dpi"] = 150
mpl.rcParams["figure.constrained_layout.use"] = True
# Register the pandas converters with matplotlib.
pd.plotting.register_matplotlib_converters()

In [None]:
# "data" directory relative to the current working directory; holds downloaded files.
data_dir = Path() / "data"
# Create it (and any missing parents); no error if it already exists.
data_dir.mkdir(parents=True, exist_ok=True)

## 데이터 준비

[타이타닉 데이터셋](https://github.com/alexisperrier/packt-aml/blob/master/ch4/titanic.csv)을 이용합니다.

In [None]:
# Download the Titanic CSV once and cache it under data_dir.
titanic_path = data_dir / "titanic.csv"

if not titanic_path.exists():
    response = requests.get(
        "https://raw.githubusercontent.com/alexisperrier/packt-aml/master/ch4/titanic.csv",
        # Bound the wait so a stalled connection cannot hang the notebook.
        timeout=30,
    )
    # Fail loudly on HTTP errors so an error page is never cached as titanic.csv.
    response.raise_for_status()
    titanic_path.write_bytes(response.content)

In [None]:
# Load the whole CSV with default dtypes for a first look at the data.
titanic_temp = pd.read_csv(titanic_path)

# Trailing expression: the notebook displays the DataFrame.
titanic_temp

In [None]:
# Print the DataFrame summary (pandas `info()`).
titanic_temp.info()

In [None]:
# Descriptive statistics (pandas `describe()`).
titanic_temp.describe()

In [None]:
# Distinct values found in the pclass column.
titanic_temp["pclass"].unique()

In [None]:
# Number of rows per pclass value.
titanic_temp["pclass"].value_counts()

In [None]:
# Histogram of passenger class, with the values cast to strings before plotting.
fig, ax = plt.subplots()

# Draw on the axes created above instead of relying on the implicit current axes.
sns.histplot(titanic_temp["pclass"].astype(str), ax=ax)
pass  # no trailing expression, so the cell does not echo the plot's return value

In [None]:
# Distinct values found in the sex column.
titanic_temp["sex"].unique()

In [None]:
# Number of rows per sex value.
titanic_temp["sex"].value_counts()

In [None]:
# Histogram of the age column drawn with matplotlib directly.
fig, ax = plt.subplots()

ax.hist(titanic_temp["age"])
pass  # no trailing expression, so the cell does not echo the return value of hist()

In [None]:
# Frequency of each distinct fare value.
titanic_temp["fare"].value_counts()

In [None]:
# Count of missing values per column.
titanic_temp.isna().sum()

피처 중 `fare`와 `embarked`는 결손치가 각각 1개, 2개가 있는데, 결손치가 있는 데이터는 아예 제외하겠습니다.
`age`에는 결손치가 많이 있지만, XGBoost는 결손치를 자체적으로 처리할 수 있으므로 따로 처리하지 않고 그대로 사용합니다.

In [None]:
# Column groups used throughout the notebook.
numeric_feature_names = ["age", "sibsp", "parch", "fare"]
categorical_feature_names = ["pclass", "sex", "embarked"]
feature_names = numeric_feature_names + categorical_feature_names
target_name = "survived"

# Features first, target last; used both to select the columns and to order them.
selected_columns = feature_names + [target_name]

titanic_raw = (
    pd.read_csv(
        titanic_path,
        usecols=selected_columns,
        dtype={
            # Categorical features are read with the pandas "category" dtype.
            **{k: "category" for k in categorical_feature_names},
            # Keyed by target_name so the dtype follows the configured target column.
            target_name: float,
        },
    )
    # Rows missing fare or embarked are dropped; rows missing age are kept.
    .dropna(subset=["fare", "embarked"])
    .reset_index(drop=True)
    .reindex(columns=selected_columns)
)

titanic_raw.info()
titanic_raw.head()

피처의 타입별로 데이터를 확인할 수 있습니다.

In [None]:
# Only the numeric columns.
titanic_raw.select_dtypes("number")

In [None]:
# Everything that is not numeric.
titanic_raw.select_dtypes(exclude="number")

카테고리를 확인합니다.

In [None]:
# Print the category levels of every column stored with the "category" dtype.
categorical_columns = titanic_raw.select_dtypes("category")
for column_name in categorical_columns.columns:
    print(column_name, categorical_columns[column_name].cat.categories)

학습 세트와 평가 세트로 분할합니다.

In [None]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle
# so the split is the same on every run.
(
    x_train_raw,
    x_test_raw,
    y_train_raw,
    y_test_raw,
) = sklearn.model_selection.train_test_split(
    titanic_raw[feature_names],
    titanic_raw[target_name],
    test_size=0.2,
    random_state=78,
)

In [None]:
# Shapes of the four pieces produced by the split.
x_train_raw.shape, x_test_raw.shape, y_train_raw.shape, y_test_raw.shape

## 데이터 전처리

트리 모델이라 수치 데이터의 정규화는 하지 않고, 범주형 데이터의 one-hot 인코딩만 진행합니다.

Scikit-learn에서 제공하는 `ColumnTransformer`, [`OneHotEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) 등을 쓸 수도 있지만, 열 이름을 다 떼어 버리기 때문에, 여기서는 `pd.get_dummies`를 이용하겠습니다.

원래는 fit-transform 과정을 거쳐야 하지만, 간단하게 하기 위해 이미 정보를 안다고 가정하고 수동으로 처리하겠습니다.

- `pclass`: 1, 2, 3
- `sex`: `female`, `male` 2가지이지만, 2열로 하지 않고 `male`은 떼어 버림
- `embarked`: C, Q, S

In [None]:
def titanic_one_hot_encoder(x, y=None):
    """One-hot encode the categorical Titanic features and keep the numeric ones.

    Column selection uses the module-level ``numeric_feature_names`` and
    ``categorical_feature_names`` lists.

    Args:
        x: DataFrame containing the numeric and categorical feature columns.
        y: Unused.

    Returns:
        DataFrame with the numeric columns followed by the dummy columns,
        whose names are lower-cased and from which ``sex_male`` is removed.
    """
    dummies = pd.get_dummies(x[categorical_feature_names])
    # Drop one of the "sex" indicator columns so only `sex_female` remains.
    dummies = dummies.drop(columns="sex_male")
    dummies = dummies.rename(columns=str.lower)
    return pd.concat((x[numeric_feature_names], dummies), axis=1)

In [None]:
# Encode the features of both sets; labels are copied unchanged.
x_train = titanic_one_hot_encoder(x_train_raw)
x_test = titanic_one_hot_encoder(x_test_raw)
y_train = y_train_raw.copy()
y_test = y_test_raw.copy()

# Show the first rows of each encoded set.
display(x_train.head())
display(x_test.head())

⚠️ [Min-max scale](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html), [정규화](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) 등을 할 때엔, 학습 세트와 평가 세트를 각각 따로 `fit`하지 않도록 조심해야 합니다. 학습 세트로 `fit`한 변환기를 평가 세트에는 `transform`만 적용해야 합니다.

In [None]:
# Seeded generator so the toy data is identical on every run.
rng = np.random.default_rng(seed=42)

# Toy 2-D data for the scaling demo: rng.random() values scaled by 5 and shifted by 5.
example_train_set = 5 * rng.random((20, 2)) + 5
example_test_set = 5 * rng.random((3, 2)) + 5

fig, ax = plt.subplots()

# Train points first, then test points, on the same axes.
ax.plot(example_train_set[:, 0], example_train_set[:, 1], ".")
ax.plot(example_test_set[:, 0], example_test_set[:, 1], ".")

In [None]:
# Wrong! A second scaler is fitted on the test set, so the test points are
# rescaled by their own min/max instead of the training set's.
scaled_train_set = MinMaxScaler().fit_transform(example_train_set)
scaled_test_set = MinMaxScaler().fit_transform(example_test_set)

In [None]:
# Plot the scaled train and test points on the same axes.
fig, ax = plt.subplots()

ax.plot(scaled_train_set[:, 0], scaled_train_set[:, 1], ".")
ax.plot(scaled_test_set[:, 0], scaled_test_set[:, 1], ".")

In [None]:
# Correct: one scaler, fitted on the training set only.
scaler = MinMaxScaler()

scaled_train_set = scaler.fit_transform(example_train_set)
# The test set is only transformed, reusing what was fitted on the training set.
scaled_test_set = scaler.transform(example_test_set)

In [None]:
# Plot the scaled train and test points on the same axes.
fig, ax = plt.subplots()

ax.plot(scaled_train_set[:, 0], scaled_train_set[:, 1], ".")
ax.plot(scaled_test_set[:, 0], scaled_test_set[:, 1], ".")

## 학습 (기본 XGBoost API 이용)

[XGBoost Parameters](https://xgboost.readthedocs.io/en/latest/parameter.html)

In [None]:
# Parameters for the native `xgb.train` API.
# The value in {braces} in each trailing comment is the library default.
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,  # {0.3} learning rate
    "gamma": 1.0,  # {0} Minimum loss reduction
    "max_depth": 5,  # {6}
    "subsample": 0.5,  # {1} Row (sample) subsample ratio
    "colsample_bytree": 0.5,  # {1} Column (feature) subsample ratio
    "lambda": 1,  # {1} L2 regularization term
    "verbosity": 0,
}

In [None]:
def get_scores(y_true, y_pred):
    """Compute accuracy, log loss and ROC AUC for predicted scores.

    Args:
        y_true: True labels.
        y_pred: Predicted scores; values above 0.5 count as the positive
            class when computing accuracy.

    Returns:
        Dict with the keys "accuracy", "log_loss" and "roc_auc".
    """
    metrics = sklearn.metrics
    # Accuracy needs hard labels, so threshold the scores at 0.5.
    predicted_labels = y_pred > 0.5
    scores = {}
    scores["accuracy"] = metrics.accuracy_score(y_true, predicted_labels)
    scores["log_loss"] = metrics.log_loss(y_true, y_pred)
    scores["roc_auc"] = metrics.roc_auc_score(y_true, y_pred)
    return scores

XGBoost는 자체적으로 `DMatrix` 형식의 데이터를 씁니다. ([Core Data Structure](https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.core))

In [None]:
# Wrap features and labels in XGBoost's DMatrix containers.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [None]:
# Number of boosting rounds for this training run.
num_round = 200

# Train a booster on the training DMatrix with `params`.
bst = xgb.train(params, dtrain, num_boost_round=num_round)

In [None]:
# Booster predictions for the train and test sets; these are passed to get_scores below.
y_pred_train = bst.predict(dtrain)
y_pred_test = bst.predict(dtest)

In [None]:
# Compare the scores on the training set with those on the held-out test set.
print("Train", get_scores(y_train, y_pred_train))
print("Test", get_scores(y_test, y_pred_test))

In [None]:
# Random guessing score: baseline metrics for coin-flip predictions on the
# training labels, once with hesitant scores (0.49 / 0.51) and once with
# confident scores (0.01 / 0.99).
# A seeded Generator keeps the baseline numbers reproducible across runs.
baseline_rng = np.random.default_rng(seed=42)
print(
    "0.49, 0.51 random guess",
    get_scores(y_train, 0.49 + 0.02 * baseline_rng.integers(2, size=y_train.shape)),
)
print(
    "0.01, 0.99 random guess",
    get_scores(y_train, 0.01 + 0.98 * baseline_rng.integers(2, size=y_train.shape)),
)

XGBoost에서 기본으로 제공하는 `plot_importance`를 이용해 피처 중요도(total gain 기준)를 확인해 봅니다.

In [None]:
# Feature importance plot using the "total_gain" importance type.
fig, ax = plt.subplots()

xgb.plot_importance(bst, importance_type="total_gain", ax=ax)
pass  # no trailing expression, so the cell does not echo the returned object

부스터(학습된 모델)를 저장하고 불러오거나, 기존 부스터에서 학습을 이어서 할 수 있습니다.

In [None]:
# Build the DMatrix objects again from the train / test sets.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [None]:
# Train in chunks of `num_round_per_loop` rounds, continuing from the previous
# booster each time, and record train/test scores after every chunk.
num_round = 200
num_round_per_loop = 5

bst = None
train_scores = []
test_scores = []
checkpoints = range(num_round_per_loop, num_round + 1, num_round_per_loop)
for i in checkpoints:
    # xgb_model=bst continues from the previous chunk's booster (None on the first pass).
    bst = xgb.train(params, dtrain, xgb_model=bst, num_boost_round=num_round_per_loop)

    y_pred_train = bst.predict(dtrain)
    y_pred_test = bst.predict(dtest)

    # One row per checkpoint, with metric names prefixed by the data set.
    train_row = {"iteration": i}
    for metric_name, metric_value in get_scores(y_train, y_pred_train).items():
        train_row[f"train_{metric_name}"] = metric_value
    train_scores.append(train_row)

    test_row = {"iteration": i}
    for metric_name, metric_value in get_scores(y_test, y_pred_test).items():
        test_row[f"test_{metric_name}"] = metric_value
    test_scores.append(test_row)

# Index both tables by the cumulative number of boosting rounds.
train_scores_df = pd.DataFrame(train_scores).set_index("iteration")
test_scores_df = pd.DataFrame(test_scores).set_index("iteration")

In [None]:
# Train and test scores side by side, aligned on the iteration index.
pd.concat((train_scores_df, test_scores_df), axis=1)

In [None]:
# Learning curves: train scores with the default line style, test scores dotted.
fig, ax = plt.subplots(figsize=(12, 8))

train_scores_df.plot(marker=".", ax=ax)
ax.set_prop_cycle(None)  # reset color cycle
test_scores_df.plot(marker=".", ls=":", ax=ax)
ax.grid()

### Early stopping

검증 세트를 주면, 검증 세트의 점수가 일정 라운드 동안 더 나아지지 않을 때 학습을 일찍 끊을 수 있습니다.

In [None]:
# Carve a validation set out of the training data for early stopping.
# random_state is fixed so the split (and the early-stopping result that
# depends on it) is the same on every run.
(
    x_train_train,
    x_train_valid,
    y_train_train,
    y_train_valid,
) = sklearn.model_selection.train_test_split(x_train, y_train, random_state=78)

# DMatrix containers for the three data sets.
dtrain = xgb.DMatrix(x_train_train, label=y_train_train)
dvalid = xgb.DMatrix(x_train_valid, label=y_train_valid)
dtest = xgb.DMatrix(x_test, label=y_test)

In [None]:
# Upper bound on the number of boosting rounds.
num_round = 200

# Train with the validation set as the evaluation set and early stopping
# configured with a patience of 20 rounds.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=[(dvalid, "validset")],
    early_stopping_rounds=20,
)

## Scikit-learn wrapper API

XGBoost에서는 scikit-learn의 pipeline 등에 넣어서 scikit-learn의 estimator처럼 쓸 수 있는 wrapper 인터페이스를 제공합니다.

In [None]:
# Wrap the encoder function as a scikit-learn transformer.
titanic_transformer = FunctionTransformer(titanic_one_hot_encoder)

In [None]:
titanic_transformer.fit(x_train_raw)

# Transform the training features once; the resulting column names are kept
# so they can be used later to label features.
encoded_train = titanic_transformer.transform(x_train_raw)
one_hot_encoded_column_names = encoded_train.columns

# Show the first rows of the encoded train and test features.
display(encoded_train.head())
display(titanic_transformer.transform(x_test_raw).head())

In [None]:
# Parameters for the scikit-learn wrapper (`XGBClassifier`).
# The value in {braces} in each trailing comment is the library default.
params = {
    "objective": "binary:logistic",
    "n_estimators": 200,
    "learning_rate": 0.3,  # {0.3} learning rate
    "gamma": 1.0,  # {0} Minimum loss reduction
    "max_depth": 5,  # {6}
    "subsample": 0.5,  # {1} Row (sample) subsample ratio
    "colsample_bytree": 1.0,  # {1} Column (feature) subsample ratio
    "reg_lambda": 1,  # {1} L2 regularization term
    "verbosity": 0,
}

In [None]:
# Pipeline: one-hot encoding followed by the XGBoost classifier.
# The commented-out steps are optional alternatives left disabled.
pipe = Pipeline(
    [
        ("one_hot", FunctionTransformer(titanic_one_hot_encoder)),
        # ("mean_imputer", SimpleImputer()),
        # ("dim_reducer", SelectKBest(sklearn.feature_selection.chi2)),
        ("clf", XGBClassifier(**params)),
        # ("clf", HistGradientBoostingClassifier()),
    ]
)
# Trailing expression: the notebook displays the pipeline.
pipe

In [None]:
# Split the raw training data into train / validation parts for the pipeline.
# The pipeline does its own encoding, so the raw (un-encoded) features are used.
# random_state is fixed so the split is the same on every run.
x_train, x_valid, y_train, y_valid = sklearn.model_selection.train_test_split(
    x_train_raw, y_train_raw, random_state=78
)

# The test set is used as-is.
x_test = x_test_raw.copy()
y_test = y_test_raw.copy()

In [None]:
# First rows of the raw training features passed to the pipeline.
x_train.head()

In [None]:
# Fit the whole pipeline on the training part.
pipe.fit(x_train, y_train)
pass  # no trailing expression, so the cell does not echo the fitted pipeline

In [None]:
# Test-set scores; column 1 of predict_proba is passed as the predicted score.
get_scores(y_test, pipe.predict_proba(x_test)[:, 1])

### 파라미터 최적화

In [None]:
# Search grid; keys use the `<step name>__<parameter>` form to address pipeline steps.
param_grid = {
    # "dim_reducer__k": [5, 7],
    "clf__learning_rate": [0.01, 0.03, 0.1],
    "clf__colsample_bytree": [0.3, 0.5, 1.0],
}

In [None]:
# Grid search over param_grid, scored by negative log loss.
gs = GridSearchCV(pipe, param_grid, scoring="neg_log_loss")

gs.fit(x_train, y_train)
pass  # no trailing expression, so the cell does not echo the fitted search object

In [None]:
# Test-set predictions from the fitted grid search.
gs.predict(x_test)

In [None]:
# Test-set scores for the grid search; column 1 of predict_proba is passed as the predicted score.
get_scores(y_test, gs.predict_proba(x_test)[:, 1])

In [None]:
# The search's cv_results_ shown as a table.
pd.DataFrame(gs.cv_results_)

In [None]:
# Parameter combination selected by the search.
gs.best_params_

In [None]:
# The classifier step of the best pipeline.
gs.best_estimator_["clf"]

In [None]:
# Total-gain feature importance of the best classifier found by the grid search.
best_pipeline = gs.best_estimator_
bst = best_pipeline["clf"].get_booster()
importance = pd.Series(bst.get_score(importance_type="total_gain"))

# Restore feature names: when a "dim_reducer" step is present, the "f<i>"
# labels are mapped back to the one-hot column names via the selector's
# support indices.
if "dim_reducer" in best_pipeline.named_steps:
    selected_indices = best_pipeline["dim_reducer"].get_support(indices=True)
    feature_name_map = {}
    for position, column_index in enumerate(selected_indices):
        feature_name_map[f"f{position}"] = one_hot_encoded_column_names[column_index]
    importance = importance.rename(feature_name_map)

# Horizontal bar chart, sorted by ascending importance value.
fig, ax = plt.subplots()
importance.sort_values().plot.barh(ax=ax)
ax.set(title="Feature importance (total gain)")
pass