# Catboost Yandex

In [21]:
import pandas as pd

train_path = 'data/Housing Prices Competition/train.csv'
test_path = 'data/Housing Prices Competition/test.csv'
sample_submission_path = 'data/Housing Prices Competition/sample_submission.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
sample_submission_data = pd.read_csv(sample_submission_path)


target_variable = 'SalePrice'

X = train_data.drop(columns=[target_variable])
y = train_data[target_variable]


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.to_list()

for col in categorical_features:
    X_train[col] = X_train[col].fillna('Unknown')
    X_test[col] = X_test[col].fillna('Unknown')


from catboost import CatBoostRegressor

model = CatBoostRegressor(
    iterations=1500,                        # Количество итераций
    learning_rate=0.1,                      # Скорость обучения
    depth=6,                                # Глубина деревьев
    cat_features=categorical_features,      # Категориальные признаки
    verbose=100                             # Показывать процесс обучения каждые 100 итераций
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")

0:	learn: 72182.6596632	total: 16.2ms	remaining: 24.3s
100:	learn: 16284.4692477	total: 1.56s	remaining: 21.6s
200:	learn: 11780.9827710	total: 3.12s	remaining: 20.1s
300:	learn: 9605.4763388	total: 4.82s	remaining: 19.2s
400:	learn: 7916.0581708	total: 6.49s	remaining: 17.8s
500:	learn: 6760.4991409	total: 8.06s	remaining: 16.1s
600:	learn: 5842.7157361	total: 9.49s	remaining: 14.2s
700:	learn: 5093.2345437	total: 11s	remaining: 12.5s
800:	learn: 4504.0141400	total: 12.4s	remaining: 10.8s
900:	learn: 3958.5655940	total: 13.9s	remaining: 9.24s
1000:	learn: 3467.3857194	total: 15.4s	remaining: 7.66s
1100:	learn: 3100.9401594	total: 16.9s	remaining: 6.12s
1200:	learn: 2802.9068778	total: 18.4s	remaining: 4.59s
1300:	learn: 2505.0245218	total: 20s	remaining: 3.06s
1400:	learn: 2242.6364330	total: 21.5s	remaining: 1.52s
1499:	learn: 2035.1521272	total: 23s	remaining: 0us
RMSE: 25463.36708549864


In [22]:
for col in categorical_features:
    test_data[col] = test_data[col].fillna('Unknown')

test_pred = model.predict(test_data)

submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': test_pred
})

submission.to_csv('pred_catboost.csv', index=False)

print(submission.head())
print(submission.info())

     Id      SalePrice
0  1461  121663.780546
1  1462  169497.574258
2  1463  192815.845409
3  1464  199556.252914
4  1465  191372.435714
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         1459 non-null   int64  
 1   SalePrice  1459 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 22.9 KB
None


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error

# Загрузка данных
train_path = 'data/Housing Prices Competition/train.csv'
test_path = 'data/Housing Prices Competition/test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Целевая переменная и признаки
target_variable = 'SalePrice'
X = train_data.drop(columns=[target_variable])
y = train_data[target_variable]

# Разделение на тренировочные и тестовые данные
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обработка категориальных данных
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.to_list()

for col in categorical_features:
    X_train[col] = X_train[col].fillna('Unknown')
    X_test[col] = X_test[col].fillna('Unknown')

# Создание объектов Pool
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)

# Создание и обучение модели
model = CatBoostRegressor(
    iterations=2000,                        # Количество итераций
    learning_rate=0.1,                      # Скорость обучения
    depth=6,                                # Глубина деревьев
    eval_metric='RMSE',                     # Метрика для оценки
    verbose=100                             # Показывать процесс обучения каждые 100 итераций
)

model.fit(train_pool, eval_set=test_pool) # , early_stopping_rounds=50

# Предсказания
y_pred = model.predict(test_pool)

# Оценка модели
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")


0:	learn: 72182.6596632	test: 82377.1308432	best: 82377.1308432 (0)	total: 11ms	remaining: 22s
100:	learn: 16284.4692477	test: 26807.8758392	best: 26807.8758392 (100)	total: 1.49s	remaining: 28.1s
200:	learn: 11780.9827710	test: 25887.6359189	best: 25874.9486234 (197)	total: 2.84s	remaining: 25.5s
300:	learn: 9605.4763388	test: 25685.0681388	best: 25644.1716705 (289)	total: 4.21s	remaining: 23.8s
400:	learn: 7916.0581708	test: 25536.3671255	best: 25522.5705186 (397)	total: 5.61s	remaining: 22.4s
500:	learn: 6760.4991409	test: 25537.9824591	best: 25484.4424441 (470)	total: 6.97s	remaining: 20.8s
600:	learn: 5842.7157361	test: 25565.7213127	best: 25484.4424441 (470)	total: 8.38s	remaining: 19.5s
700:	learn: 5093.2345437	test: 25594.2788533	best: 25484.4424441 (470)	total: 9.75s	remaining: 18.1s
800:	learn: 4504.0141400	test: 25548.1621751	best: 25484.4424441 (470)	total: 11.1s	remaining: 16.6s
900:	learn: 3958.5655940	test: 25523.2463559	best: 25484.4424441 (470)	total: 12.4s	remaining: 

In [19]:
for col in categorical_features:
    test_data[col] = test_data[col].fillna('Unknown')

test_pred = model.predict(test_data)

submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': test_pred
})

submission.to_csv('pred_catboost.csv', index=False)

print(submission.head())
print(submission.info())

     Id      SalePrice
0  1461  121755.738844
1  1462  169499.878899
2  1463  191904.849352
3  1464  199268.412985
4  1465  190920.186945
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         1459 non-null   int64  
 1   SalePrice  1459 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 22.9 KB
None


In [26]:
import numpy as np

train_data_gb = pd.get_dummies(train_data)

columns = train_data_gb.columns.to_list()
has_nulls = train_data_gb.isnull().any().to_list()

for col, has_null in zip(columns, has_nulls):
    if has_null:
        if '_' in col:
            train_data_gb[col].fillna(train_data_gb[col].mode(), inplace=True)
        else:
            train_data_gb[col].fillna(train_data_gb[col].mean(), inplace=True)

correlation_matrix = train_data_gb.corr()

threshold = 0.2
target_variable = 'SalePrice'  # Замените на ваше название целевой переменной

high_correlation_features = correlation_matrix[abs(correlation_matrix[target_variable]) > threshold]
high_correlation_features = high_correlation_features[high_correlation_features.index != target_variable] #исключаем саму целевую переменную

features = ['BsmtUnfSF', 'TotRmsAbvGrd', 'YearRemodAdd', 'LotFrontage', 'FullBath', 'YearBuilt', 'LotArea', 
'GarageArea', 'GarageCars', '1stFlrSF', 'BsmtFinSF1', '2ndFlrSF', 'TotalBsmtSF', 'GrLivArea', 'OverallQual', 
'Fireplaces', 'OpenPorchSF', 'GarageYrBlt', 'MasVnrArea', 'WoodDeckSF', 'HalfBath']

features_with_important = list(set(features + high_correlation_features.index.to_list()))

features_for_model = features_with_important
X_gb = train_data_gb[features_for_model]
y_gb = train_data_gb[target_variable]

X_train_gb, X_test_gb, y_train_gb, y_test_gb = train_test_split(X_gb, y_gb, test_size=0.2, random_state=42)

y_log_gb = np.log1p(train_data_gb.SalePrice)

y_gb = np.expm1(y_log_gb)

from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
gb.fit(X_train_gb, y_train_gb)

y_pred_gb_train = gb.predict(X_train_gb)
y_pred_gb_test = gb.predict(X_test_gb)

mse_train_gb = mean_squared_error(y_train_gb, y_pred_gb_train)
mse_test_gb = mean_squared_error(y_test_gb, y_pred_gb_test)

print(f"Train RMSE: {np.sqrt(mse_train_gb)}")
print(f"Test RMSE: {np.sqrt(mse_test_gb)}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data_gb[col].fillna(train_data_gb[col].mean(), inplace=True)


Train RMSE: 14935.732159268142
Test RMSE: 28679.201959367612


In [None]:
from catboost import Pool
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

# Создание Pool для CatBoost
# train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
# test_pool = Pool(data=X_test, cat_features=categorical_features)

# Обновление стекинга
stacking_model = StackingRegressor(
    estimators=[
        ('catboost', model),
        ('gradient_boosting_regressor', gb)
        # ('random_forest', RandomForestRegressor(n_estimators=100, random_state=42)),
        # ('linear', LinearRegression())
    ],
    final_estimator=LinearRegression()
)

# Передача остальных данных в стекинг
stacking_model.fit(X_train, y_train)

# Итоговые предсказания
final_predictions = stacking_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")

In [23]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Базовые модели
model_random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
model_linear = LinearRegression()

# Стекинг с использованием преобразованных данных
stacking_model = StackingRegressor(
    estimators=[
        ('catboost', model),
        ('random_forest', model_random_forest),
        ('linear', model_linear)
    ],
    final_estimator=LinearRegression()
)

# Обучение стекинга
stacking_model.fit(transformed_X_train, y_train)

# Предсказания стекинга
final_predictions = stacking_model.predict(transformed_X_test)

print("Предсказания стекинга:", final_predictions[:5])

CatBoostError: 'data' is numpy array of floating point numerical type, it means no categorical features, but 'cat_features' parameter specifies nonzero number of categorical features

In [None]:
from catboost import Pool, CatBoostRegressor
import pandas as pd

# Создание объектов Pool
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, cat_features=categorical_features)

# Создание и обучение модели
model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    cat_features=categorical_features,
    verbose=100
)
model.fit(train_pool)

# Получение "преобразованных" данных с помощью predict
transformed_X_train = model.predict(train_pool, prediction_type='RawFormulaVal')
transformed_X_test = model.predict(test_pool, prediction_type='RawFormulaVal')

# Преобразованные данные в виде DataFrame
transformed_X_train_df = pd.DataFrame(
    transformed_X_train,
    columns=[f"feature_{i}" for i in range(transformed_X_train.shape[1])]
)
transformed_X_test_df = pd.DataFrame(
    transformed_X_test,
    columns=[f"feature_{i}" for i in range(transformed_X_test.shape[1])]
)

print("Первые строки преобразованных данных:")
print(transformed_X_train_df.head())


# from sklearn.ensemble import StackingRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
# from catboost import CatBoostRegressor

# # Базовые модели
# model_catboost = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, cat_features=categorical_features, verbose=0)
# model_random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
# model_linear = LinearRegression()

# # Стекинг
# stacking_model = StackingRegressor(
#     estimators=[
#         ('catboost', model_catboost),
#         ('random_forest', model_random_forest),
#         ('linear', model_linear)
#     ],
#     final_estimator=LinearRegression()
# )

# # Обучение
# stacking_model.fit(X_train, y_train)

0:	learn: 74687.8203090	total: 26.6ms	remaining: 13.3s
100:	learn: 21631.0438606	total: 1.41s	remaining: 5.55s
200:	learn: 16677.1183424	total: 2.82s	remaining: 4.19s
300:	learn: 14011.3145261	total: 4.31s	remaining: 2.85s
400:	learn: 11966.7917048	total: 5.77s	remaining: 1.42s
499:	learn: 10553.6423047	total: 7.39s	remaining: 0us


IndexError: tuple index out of range

In [18]:
from catboost import Pool, CatBoostRegressor
import pandas as pd

# Создание объектов Pool
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, cat_features=categorical_features)

# Создание и обучение модели
model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    cat_features=categorical_features,
    verbose=100
)
model.fit(train_pool)

# Получение "преобразованных" данных с помощью predict
transformed_X_train = model.predict(train_pool, prediction_type='RawFormulaVal')
transformed_X_test = model.predict(test_pool, prediction_type='RawFormulaVal')

# Преобразованные данные в виде DataFrame
transformed_X_train_df = pd.DataFrame(
    transformed_X_train,
    columns=[f"feature_{i}" for i in range(transformed_X_train.shape[1])]
)
transformed_X_test_df = pd.DataFrame(
    transformed_X_test,
    columns=[f"feature_{i}" for i in range(transformed_X_test.shape[1])]
)

print("Первые строки преобразованных данных:")
print(transformed_X_train_df.head())


# from sklearn.ensemble import StackingRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
# from catboost import CatBoostRegressor

# # Базовые модели
# model_catboost = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, cat_features=categorical_features, verbose=0)
# model_random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
# model_linear = LinearRegression()

# # Стекинг
# stacking_model = StackingRegressor(
#     estimators=[
#         ('catboost', model_catboost),
#         ('random_forest', model_random_forest),
#         ('linear', model_linear)
#     ],
#     final_estimator=LinearRegression()
# )

# # Обучение
# stacking_model.fit(X_train, y_train)

0:	learn: 74687.8203090	total: 26.6ms	remaining: 13.3s
100:	learn: 21631.0438606	total: 1.41s	remaining: 5.55s
200:	learn: 16677.1183424	total: 2.82s	remaining: 4.19s
300:	learn: 14011.3145261	total: 4.31s	remaining: 2.85s
400:	learn: 11966.7917048	total: 5.77s	remaining: 1.42s
499:	learn: 10553.6423047	total: 7.39s	remaining: 0us


IndexError: tuple index out of range