# CatBoost Example

Visualizing the learning curves.

Author: https://github.com/deburky

## Toy example (Titanic dataset)

In [3]:
from catboost import Pool, cv, datasets

# Load the Titanic train/test frames through catboost.datasets.
train_df, test_df = datasets.titanic()

# Keep only the rows where every model input and the target are present.
feature_cols = ["Pclass", "Age", "Fare"]
target_col = "Survived"
train_df = train_df.dropna(subset=[*feature_cols, target_col])

# Wrap the numeric features and the label in a CatBoost Pool.
train = Pool(data=train_df[feature_cols], label=train_df[target_col])

# Training parameters handed to cv(); the loss function has to be given explicitly.
params = dict(
    iterations=50,
    learning_rate=0.1,
    depth=3,
    loss_function="Logloss",
    verbose=False,
)

# 3-fold cross-validation; plot=True renders the interactive learning curves.
cv_data = cv(pool=train, params=params, fold_count=3, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 0.5586036909
bestIteration = 47

Training on fold [1/3]

bestTest = 0.5836710212
bestIteration = 25

Training on fold [2/3]

bestTest = 0.5646608486
bestIteration = 43



## Real world example

In [4]:
from pathlib import Path
import pandas as pd

# Data lives in a "data" directory that is a sibling of the current working directory.
data_dir = Path.cwd().parent / "data"

# Load the raw case-study table.
dataset = pd.read_csv(data_dir / "BankCaseStudyData.csv")

# Binary target: "Accept" -> 0, "Decline" -> 1 (any other value maps to NaN).
label = "Final_Decision"
dataset[label] = dataset[label].map({"Accept": 0, "Decline": 1})

# Numeric model inputs.
num_features = [
    "Application_Score",
    "Bureau_Score",
    "Loan_Amount",
    "Time_with_Bank",
    "Time_in_Employment",
    "Loan_to_income",
    "Gross_Annual_Income",
]

# Categorical model inputs (passed to CatBoost as cat_features).
cat_features = [
    "Loan_Payment_Frequency",
    "Residential_Status",
    "Cheque_Card_Flag",
    "Existing_Customer_Flag",
    "Home_Telephone_Number",
]

features = cat_features + num_features

# Row masks for the predefined split column.
ix_train = dataset["split"] == "Development"
ix_test = dataset["split"] == "Validation"

# Take explicit copies so the edits below never write into (a slice of) `dataset`
# and cannot trigger SettingWithCopyWarning.
X_train = dataset.loc[ix_train, features].copy()
y_train = dataset.loc[ix_train, label]
X_test = dataset.loc[ix_test, features].copy()
y_test = dataset.loc[ix_test, label]

# Categorical columns must be strings without missing values.
# Fill BEFORE casting: casting to str first would turn NaN into the literal
# string "nan", leaving nothing for fillna to replace. Assigning whole columns
# (rather than .loc[:, cols] = ...) lets the column dtype change cleanly.
X_train[cat_features] = X_train[cat_features].fillna("NA").astype(str)
X_test[cat_features] = X_test[cat_features].fillna("NA").astype(str)

In [5]:
import catboost as cb

# Wrap each split in a Pool, declaring which columns are categorical.
train_pool = cb.Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = cb.Pool(data=X_test, label=y_test, cat_features=cat_features)

# Classifier settings, collected in one place so they are easy to scan and tune.
model_params = {
    "random_state": 42,
    "loss_function": "Logloss",
    "eval_metric": "Logloss",
    "verbose": 100,  # log every 100th iteration
    "cat_features": cat_features,
    "allow_writing_files": True,
    # Overfitting detector: stop once the eval metric has not improved
    # for 100 consecutive iterations.
    "od_type": "Iter",
    "od_wait": 100,
}
cb_model = cb.CatBoostClassifier(**model_params)

# Train against the validation pool; plot=True renders the live learning curves.
cb_model.fit(train_pool, eval_set=test_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.066322
0:	learn: 0.5952337	test: 0.5954666	best: 0.5954666 (0)	total: 12.4ms	remaining: 12.4s
100:	learn: 0.1555173	test: 0.1752337	best: 0.1752337 (100)	total: 964ms	remaining: 8.58s
200:	learn: 0.1426872	test: 0.1739138	best: 0.1737967 (168)	total: 1.7s	remaining: 6.75s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.17379668
bestIteration = 168

Shrink model to first 169 iterations.


<catboost.core.CatBoostClassifier at 0x12ab86710>