In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [2]:
import catboost
from catboost import *

import numpy as np
import pandas as pd

In [36]:
train_dataset = pd.read_csv('/content/train.csv')
test_dataset = pd.read_csv('/content/test.csv')

test_ids = test_dataset['Id']

train_dataset.drop(columns='Id', inplace=True)
test_dataset.drop(columns='Id', inplace=True)

In [37]:
X = train_dataset.loc[:, train_dataset.columns.drop('SalePrice')]
Y = train_dataset.loc[:, 'SalePrice']
X.shape, Y.shape

((1460, 79), (1460,))

In [38]:
numerical_features_ids = np.where(X.dtypes != 'object')[0]
categorical_features_ids = np.where(X.dtypes == 'object')[0]
len(numerical_features_ids), len(categorical_features_ids)

(36, 43)

In [39]:
null_value_stats = X.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [40]:
fillna_values = {}
for i in range(len(X.columns)):
    if i in categorical_features_ids:
        fillna_values[X.columns[i]] = '-9999'
    else:
        fillna_values[X.columns[i]] = -9999

In [41]:
X.fillna(fillna_values, inplace=True)
test_dataset.fillna(fillna_values, inplace=True)

In [42]:
null_value_stats = X.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

Series([], dtype: int64)

In [43]:
null_value_stats = test_dataset.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

Series([], dtype: int64)

In [46]:
train_pool = Pool(data=X, label=Y, cat_features=categorical_features_ids)
test_pool = Pool(data=test_dataset, cat_features=categorical_features_ids)

In [47]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, Y, train_size=0.8, random_state=1234)

In [48]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.3,
    early_stopping_rounds=100,
    eval_metric='MSLE'
)
model.fit(
    X_train, y_train,
    cat_features=categorical_features_ids,
    eval_set=(X_validation, y_validation),
    verbose=True,
    plot=False
)

0:	learn: 0.1076193	test: 0.1300566	best: 0.1300566 (0)	total: 33.3ms	remaining: 33.3s
1:	learn: 0.0769245	test: 0.0988455	best: 0.0988455 (1)	total: 57.7ms	remaining: 28.8s
2:	learn: 0.0588258	test: 0.0795822	best: 0.0795822 (2)	total: 82.3ms	remaining: 27.3s
3:	learn: 0.0457882	test: 0.0654548	best: 0.0654548 (3)	total: 104ms	remaining: 25.8s
4:	learn: 0.0385799	test: 0.0583046	best: 0.0583046 (4)	total: 127ms	remaining: 25.2s
5:	learn: 0.0329835	test: 0.0520243	best: 0.0520243 (5)	total: 150ms	remaining: 24.9s
6:	learn: 0.0298808	test: 0.0480195	best: 0.0480195 (6)	total: 175ms	remaining: 24.8s
7:	learn: 0.0269425	test: 0.0449519	best: 0.0449519 (7)	total: 201ms	remaining: 24.9s
8:	learn: 0.0243955	test: 0.0415916	best: 0.0415916 (8)	total: 225ms	remaining: 24.8s
9:	learn: 0.0224364	test: 0.0382603	best: 0.0382603 (9)	total: 258ms	remaining: 25.5s
10:	learn: 0.0217830	test: 0.0375816	best: 0.0375816 (10)	total: 281ms	remaining: 25.3s
11:	learn: 0.0199712	test: 0.0350125	best: 0.0350

<catboost.core.CatBoostRegressor at 0x7ca8165e0790>

In [57]:
best_model = CatBoostRegressor(
    iterations=int(model.tree_count_ * 1),
    learning_rate=0.3,
    eval_metric='MSLE'
)
best_model.fit(
    train_pool,
    verbose=100
)

0:	learn: 0.1114886	total: 10.8ms	remaining: 2.03s
100:	learn: 0.0052751	total: 932ms	remaining: 821ms
189:	learn: 0.0026431	total: 1.74s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7ca8038fa7a0>

In [58]:
best_model.tree_count_

190

In [59]:
Y_pred = best_model.predict(test_pool)
print('Predictoins:')
print(Y_pred)

Predictoins:
[117607.44348596 156970.25134855 185240.10542912 ... 169060.39970464
 112166.50628668 223562.33635485]


In [60]:
submission = pd.DataFrame()
submission['Id'] = test_ids
submission['SalePrice'] = Y_pred

In [61]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,117607.443486
1,1462,156970.251349
2,1463,185240.105429
3,1464,196496.915797
4,1465,190661.610962


In [62]:
submission.to_csv('/content/catboost.csv', index=False)