## Testing CatBoost algorithm

Note that I am using a separate virtual environment to avoid numpy conflicts.

In [32]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostRegressor

In [33]:
# Read both splits in one pass; each file is a CSV whose "index" column
# becomes the DataFrame index.
train, test = (
    pd.read_csv(csv_path, index_col="index")
    for csv_path in ('../data/train.gz', '../data/test.gz')
)

In [34]:
# Discard every training row that has at least one missing value.
train = train.dropna(axis=0, how='any')

In [40]:
categorical_features = ['chassisType', 'color', 'driveWheels', 'fuelType', 'manufacturer', 'model']

# Initialize Pool objects.
# The target column is removed from BOTH feature matrices so the model is
# trained and scored on the same set of columns and the label is never
# handed to the model as an input feature.
train_pool = Pool(data=train.drop(columns=['asking_price']),
                  label=train['asking_price'],
                  cat_features=categorical_features)

# `test` still carries 'asking_price' (it is read as the ground truth in a
# later cell), so it has to be dropped here as well.
test_pool = Pool(data=test.drop(columns=['asking_price']),
                 cat_features=categorical_features)

# Evaluation metric (not a training loss): mean absolute percentage error
def custom_mape(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (list, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Notes
    -----
    A zero in ``y_true`` leads to a division by zero, so the result becomes
    ``inf`` or ``nan``; filter such rows out beforehand if they can occur.
    """
    # Coerce to plain float arrays: this makes Python lists work and ensures
    # pandas objects are compared element-by-element by position instead of
    # being aligned on their index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    diff = np.abs((y_true - y_pred) / y_true)
    return np.mean(diff) * 100

# Configure the gradient-boosted regressor, then fit it on the training pool
model = CatBoostRegressor(
    loss_function='RMSE',   # objective optimised during training
    iterations=250,         # number of boosting rounds
    learning_rate=0.05,
    depth=6,                # tree depth
    verbose=10,             # report progress every 10th iteration
)

model.fit(train_pool)


0:	learn: 125390.8983991	total: 46.5ms	remaining: 11.6s
10:	learn: 91273.1957117	total: 574ms	remaining: 12.5s
20:	learn: 72288.0983057	total: 1.19s	remaining: 12.9s
30:	learn: 61478.0454295	total: 1.72s	remaining: 12.1s
40:	learn: 55446.9876507	total: 2.24s	remaining: 11.4s
50:	learn: 51734.3211032	total: 2.75s	remaining: 10.7s
60:	learn: 49147.1486358	total: 3.31s	remaining: 10.3s
70:	learn: 47072.4633672	total: 3.83s	remaining: 9.65s
80:	learn: 45743.4856479	total: 4.35s	remaining: 9.07s
90:	learn: 44720.2054974	total: 4.87s	remaining: 8.52s
100:	learn: 43670.7542447	total: 5.45s	remaining: 8.05s
110:	learn: 43084.8528493	total: 5.98s	remaining: 7.48s
120:	learn: 42443.7693821	total: 6.47s	remaining: 6.9s
130:	learn: 42034.6413183	total: 6.95s	remaining: 6.32s
140:	learn: 41794.5048189	total: 7.47s	remaining: 5.78s
150:	learn: 41302.4967433	total: 7.96s	remaining: 5.22s
160:	learn: 40980.6073257	total: 8.47s	remaining: 4.68s
170:	learn: 40643.7961557	total: 8.93s	remaining: 4.12s
18

<catboost.core.CatBoostRegressor at 0x1f1e2e7f2b0>

In [41]:
# Ground-truth prices for each split
train_labels, test_labels = train['asking_price'], test['asking_price']

# Model predictions for the same rows, taken from the prepared pools
y_train_pred = model.predict(train_pool)
y_test_pred = model.predict(test_pool)

In [43]:
# Per-row absolute percentage error, computed once for each split
train_ape = np.abs((train_labels - y_train_pred) / train_labels)
test_ape = np.abs((test_labels - y_test_pred) / test_labels)

# MAPE: mean of the absolute percentage errors
train_mape = np.mean(train_ape) * 100
test_mape = np.mean(test_ape) * 100

# MAPE*: median of the absolute percentage errors
train_mape_ = np.median(train_ape) * 100
test_mape_ = np.median(test_ape) * 100

# Report the median-based figures first, then the mean-based ones
print(f"Training MAPE*: {train_mape_:.2f}%")
print(f"Validation MAPE*: {test_mape_:.2f}%")

print(f"Training MAPE: {train_mape:.2f}%")
print(f"Validation MAPE: {test_mape:.2f}%")

Training MAPE*: 18.20%
Validation MAPE*: 18.44%
Training MAPE: 31.99%
Validation MAPE: 34.44%
