In [1]:
import pandas as pd

train_path = 'data/Housing Prices Competition/train.csv'
test_path = 'data/Housing Prices Competition/test.csv'
sample_submission_path = 'data/Housing Prices Competition/sample_submission.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
sample_submission_data = pd.read_csv(sample_submission_path)

target_variable = 'SalePrice'

X = train_data.drop(columns=[target_variable])
y = train_data[target_variable]


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.to_list()

for col in categorical_features:
    X_train[col] = X_train[col].fillna('Unknown')
    X_test[col] = X_test[col].fillna('Unknown')
    
    
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    iterations=1000,          # Количество итераций
    learning_rate=0.1,        # Скорость обучения
    depth=6,                  # Глубина деревьев
    cat_features=categorical_features,  # Категориальные признаки
    verbose=100               # Показывать процесс обучения каждые 100 итераций
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")

0:	learn: 72182.6596632	total: 111ms	remaining: 1m 51s
100:	learn: 16284.4692477	total: 1.53s	remaining: 13.7s
200:	learn: 11780.9827710	total: 3.07s	remaining: 12.2s
300:	learn: 9605.4763388	total: 4.53s	remaining: 10.5s
400:	learn: 7916.0581708	total: 6.08s	remaining: 9.08s
500:	learn: 6760.4991409	total: 7.46s	remaining: 7.43s
600:	learn: 5842.7157361	total: 8.77s	remaining: 5.82s
700:	learn: 5093.2345437	total: 10.3s	remaining: 4.37s
800:	learn: 4504.0141400	total: 11.6s	remaining: 2.89s
900:	learn: 3958.5655940	total: 12.9s	remaining: 1.42s
999:	learn: 3472.9254371	total: 14.1s	remaining: 0us
RMSE: 25513.20785721853


In [7]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error

train_path = 'data/Housing Prices Competition/train.csv'
test_path = 'data/Housing Prices Competition/test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

target_variable = 'SalePrice'
X = train_data.drop(columns=[target_variable])
y = train_data[target_variable]

# Разделение данных
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Заполнение пропусков
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    X_train[col] = X_train[col].fillna('Unknown')
    X_val[col] = X_val[col].fillna('Unknown')

# Преобразование имен признаков в список
feature_names = list(X_train.columns)

# Создание Pool с явным указанием feature_names
train_pool = Pool(X_train, y_train, cat_features=categorical_features, feature_names=feature_names)
val_pool = Pool(X_val, y_val, cat_features=categorical_features, feature_names=feature_names)

# Создание и обучение модели
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    verbose=100
)
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)

CatBoostError: feature names should be a sequence, but got Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [5]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error

train_path = 'data/Housing Prices Competition/train.csv'
test_path = 'data/Housing Prices Competition/test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

target_variable = 'SalePrice'
X = train_data.drop(columns=[target_variable])
y = train_data[target_variable]

# Разделение данных
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Заполнение пропусков
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    X_train[col] = X_train[col].fillna('Unknown')
    X_val[col] = X_val[col].fillna('Unknown')

# Подготовка данных
train_pool = Pool(X_train, label=y_train, cat_features=categorical_features)
val_pool = Pool(X_val, label=y_val, cat_features=categorical_features)

# Список гиперпараметров для перебора
param_grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
}

# Поиск лучших параметров
best_params = None
best_score = float('inf')

for iterations in param_grid['iterations']:
    for learning_rate in param_grid['learning_rate']:
        for depth in param_grid['depth']:
            for l2_leaf_reg in param_grid['l2_leaf_reg']:
                params = {
                    'iterations': iterations,
                    'learning_rate': learning_rate,
                    'depth': depth,
                    'l2_leaf_reg': l2_leaf_reg,
                    'verbose': 0
                }
                model = CatBoostRegressor(**params)
                model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)
                
                # Оценка модели
                y_pred = model.predict(val_pool)
                mse = mean_squared_error(y_val, y_pred)
                
                print(f"Params: {params}, MSE: {mse}")
                
                if mse < best_score:
                    best_score = mse
                    best_params = params

print("Лучшие параметры:", best_params)


CatBoostError: feature names should be a sequence, but got Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [3]:
train_path = 'data/Housing Prices Competition/train.csv'
test_path = 'data/Housing Prices Competition/test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

target_variable = 'SalePrice'
X = train_data.drop(columns=[target_variable])
y = train_data[target_variable]

# Разделение данных
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Заполнение пропусков
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    X_train[col] = X_train[col].fillna('Unknown')
    X_val[col] = X_val[col].fillna('Unknown')
    
# Определение модели
model = CatBoostRegressor(cat_features=categorical_features, verbose=0)

# Сетка гиперпараметров
param_grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
}

from sklearn.model_selection import GridSearchCV
# GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=3
)

# Поиск лучших параметров
grid_search.fit(X_train, y_train)
print("Лучшие параметры:", grid_search.best_params_)

best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=100)


Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV 1/3] END depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=nan total time=   0.0s
[CV 2/3] END depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=nan total time=   0.0s
[CV 3/3] END depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=nan total time=   0.0s
[CV 1/3] END depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.05;, score=nan total time=   0.0s
[CV 2/3] END depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.05;, score=nan total time=   0.0s
[CV 3/3] END depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.05;, score=nan total time=   0.0s
[CV 1/3] END depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.1;, score=nan total time=   0.0s
[CV 2/3] END depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.1;, score=nan total time=   0.0s
[CV 3/3] END depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.1;, score=nan total time=   0.0s
[CV 1/3] END d

ValueError: 
All the 162 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
162 fits failed with the following error:
Traceback (most recent call last):
  File "/home/repos/jupiter/venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/repos/jupiter/venv/lib/python3.12/site-packages/catboost/core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/repos/jupiter/venv/lib/python3.12/site-packages/catboost/core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/repos/jupiter/venv/lib/python3.12/site-packages/catboost/core.py", line 2275, in _prepare_train_params
    train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs, graph,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/repos/jupiter/venv/lib/python3.12/site-packages/catboost/core.py", line 1513, in _build_train_pool
    train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, graph=graph, weight=sample_weight, group_id=group_id,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/repos/jupiter/venv/lib/python3.12/site-packages/catboost/core.py", line 855, in __init__
    self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, graph, weight,
  File "/home/repos/jupiter/venv/lib/python3.12/site-packages/catboost/core.py", line 1438, in _init
    cat_features = _get_features_indices(cat_features, feature_names)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/repos/jupiter/venv/lib/python3.12/site-packages/catboost/core.py", line 316, in _get_features_indices
    raise CatBoostError("feature names should be a sequence, but got " + repr(features))
_catboost.CatBoostError: feature names should be a sequence, but got Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [None]:
# Заполнение пропусков в тестовых данных
for col in categorical_features:
    test_data[col] = test_data[col].fillna('Unknown')

# Предсказания
predictions = final_model.predict(test_data)

# Сохранение в файл CSV
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': predictions
})
submission.to_csv('submission.csv', index=False)
print("Файл submission.csv сохранен.")

In [2]:
import numpy as np

train_data_gb = pd.get_dummies(train_data)

columns = train_data_gb.columns.to_list()
has_nulls = train_data_gb.isnull().any().to_list()

for col, has_null in zip(columns, has_nulls):
    if has_null:
        if '_' in col:
            train_data_gb[col].fillna(train_data_gb[col].mode(), inplace=True)
        else:
            train_data_gb[col].fillna(train_data_gb[col].mean(), inplace=True)

correlation_matrix = train_data_gb.corr()

threshold = 0.2
target_variable = 'SalePrice'  # Замените на ваше название целевой переменной

high_correlation_features = correlation_matrix[abs(correlation_matrix[target_variable]) > threshold]
high_correlation_features = high_correlation_features[high_correlation_features.index != target_variable] #исключаем саму целевую переменную

features = ['BsmtUnfSF', 'TotRmsAbvGrd', 'YearRemodAdd', 'LotFrontage', 'FullBath', 'YearBuilt', 'LotArea', 
'GarageArea', 'GarageCars', '1stFlrSF', 'BsmtFinSF1', '2ndFlrSF', 'TotalBsmtSF', 'GrLivArea', 'OverallQual', 
'Fireplaces', 'OpenPorchSF', 'GarageYrBlt', 'MasVnrArea', 'WoodDeckSF', 'HalfBath']

features_with_important = list(set(features + high_correlation_features.index.to_list()))

features_for_model = features_with_important
X_gb = train_data_gb[features_for_model]
y_gb = train_data_gb[target_variable]

X_train_gb, X_test_gb, y_train_gb, y_test_gb = train_test_split(X_gb, y_gb, test_size=0.2, random_state=42)

y_log_gb = np.log1p(train_data_gb.SalePrice)

y_gb = np.expm1(y_log_gb)

from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
gb.fit(X_train_gb, y_train_gb)

y_pred_gb_train = gb.predict(X_train_gb)
y_pred_gb_test = gb.predict(X_test_gb)

mse_train_gb = mean_squared_error(y_train_gb, y_pred_gb_train)
mse_test_gb = mean_squared_error(y_test_gb, y_pred_gb_test)

print(f"Train RMSE: {np.sqrt(mse_train_gb)}")
print(f"Test RMSE: {np.sqrt(mse_test_gb)}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data_gb[col].fillna(train_data_gb[col].mean(), inplace=True)


Train RMSE: 14935.732159268142
Test RMSE: 29063.776885532312
