In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Загрузка данных
train_path = 'data/Housing Prices Competition/train.csv'
train_data = pd.read_csv(train_path)

# Целевая переменная и признаки
target_variable = 'SalePrice'
X = train_data.drop(columns=[target_variable])
y = train_data[target_variable]

# Разделение на тренировочные и тестовые данные
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обработка данных для CatBoost
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.to_list()
for col in categorical_features:
    X_train[col] = X_train[col].fillna('Unknown')
    X_test[col] = X_test[col].fillna('Unknown')

# Обработка данных для Gradient Boosting
def preprocess_data_for_gb(data):
    data = pd.get_dummies(data)
    
    columns = data.columns.to_list()
    has_nulls = data.isnull().any().to_list()
    
    for col, has_null in zip(columns, has_nulls):
        if has_null:
            if '_' in col:
                data[col].fillna(data[col].mode(), inplace=True)
            else:
                data[col].fillna(data[col].mean(), inplace=True)
    return data

def get_features_with_important_for_gb(data):
    correlation_matrix = data.corr()

    threshold = 0.2
    target_variable = 'SalePrice'

    high_correlation_features = correlation_matrix[abs(correlation_matrix[target_variable]) > threshold]
    high_correlation_features = high_correlation_features[high_correlation_features.index != target_variable]
    
    features = ['BsmtUnfSF', 'TotRmsAbvGrd', 'YearRemodAdd', 'LotFrontage', 'FullBath', 'YearBuilt', 'LotArea', 
    'GarageArea', 'GarageCars', '1stFlrSF', 'BsmtFinSF1', '2ndFlrSF', 'TotalBsmtSF', 'GrLivArea', 'OverallQual', 
    'Fireplaces', 'OpenPorchSF', 'GarageYrBlt', 'MasVnrArea', 'WoodDeckSF', 'HalfBath']
    
    features_with_important = list(set(features + high_correlation_features.index.to_list()))
    return features_with_important

def get_split(data, features):
    features_for_model = features
    X = data[features_for_model]
    y = train_data[target_variable]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    y_log = np.log1p(train_data.SalePrice)

    y = np.expm1(y_log)
    
    return X_train, X_test, y_train, y_test, y_log, y

# X_train_gb = preprocess_data_for_gb(X_train)
# X_test_gb = preprocess_data_for_gb(X_test)

data_gb = preprocess_data_for_gb(train_data)
features_gb = get_features_with_important_for_gb(data_gb)
(X_train_gb, X_test_gb, y_train_gb, y_test_gb, y_log_gb, y_gb) = get_split(data_gb, features_gb)

# X_train

# Gradient Boosting
# gradient_boosting_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
gradient_boosting_model = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

print("Обучение Gradient Boosting...")
gradient_boosting_model.fit(X_train_gb, y_train)



# CatBoost
catboost_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    verbose=100,
    cat_features=list(categorical_features)
)


# Обучение базовых моделей
print("Обучение CatBoost...")
catboost_model.fit(X_train, y_train)

# Предсказания базовых моделей на тестовых данных
catboost_pred = catboost_model.predict(X_test)
gradient_boosting_pred = gradient_boosting_model.predict(X_test_gb)

# Создание метапризнаков (объединение предсказаний)
meta_features_train = np.column_stack([
    catboost_model.predict(X_train),               # Предсказания CatBoost на тренировочных данных
    gradient_boosting_model.predict(X_train_gb)    # Предсказания Gradient Boosting на тренировочных данных
])

meta_features_test = np.column_stack([
    catboost_pred,                                 # Предсказания CatBoost на тестовых данных
    gradient_boosting_pred                         # Предсказания Gradient Boosting на тестовых данных
])


from sklearn.linear_model import LinearRegression

# Метамодель
meta_model = LinearRegression()

# Обучение метамодели
print("Обучение метамодели...")
meta_model.fit(meta_features_train, y_train)

# Предсказания метамодели
final_predictions = meta_model.predict(meta_features_test)


# Оценка качества модели
mse = mean_squared_error(y_test, final_predictions)
rmse = mse ** 0.5
print(f"RMSE для стекинга: {rmse}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


Обучение Gradient Boosting...
Обучение CatBoost...
0:	learn: 72182.6596632	total: 13.8ms	remaining: 6.91s
100:	learn: 16284.4692477	total: 1.48s	remaining: 5.87s
200:	learn: 11780.9827710	total: 2.89s	remaining: 4.3s
300:	learn: 9605.4763388	total: 4.34s	remaining: 2.87s
400:	learn: 7916.0581708	total: 5.74s	remaining: 1.42s
499:	learn: 6770.1610563	total: 7.12s	remaining: 0us
Обучение метамодели...
RMSE для стекинга: 27319.13270393849


In [None]:
test_path = 'data/Housing Prices Competition/test.csv'
test_data = pd.read_csv(test_path)

for col in categorical_features:
    test_data[col] = test_data[col].fillna('Unknown')

test_pred = model.predict(test_data)

submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': test_pred
})

submission.to_csv('pred_catboost.csv', index=False)

print(submission.head())
print(submission.info())

In [3]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Пути к данным
train_path = 'data/Housing Prices Competition/train.csv'
test_path = 'data/Housing Prices Competition/test.csv'

# Загрузка данных
train_data = pd.read_csv(train_path)

# Целевая переменная и признаки
target_variable = 'SalePrice'
X = train_data.drop(columns=[target_variable])
y = train_data[target_variable]

# Разделение на тренировочные и тестовые данные
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Функция обработки данных для Gradient Boosting
def preprocess_data_for_gb(data):
    # One-hot encoding для категориальных признаков
    data = pd.get_dummies(data)

    # Заполнение пропусков
    columns = data.columns.to_list()
    has_nulls = data.isnull().any().to_list()

    for col, has_null in zip(columns, has_nulls):
        if has_null:
            if '_' in col:  # Если это one-hot-кодированный признак
                data[col].fillna(data[col].mode()[0], inplace=True)
            else:  # Если это числовой признак
                data[col].fillna(data[col].mean(), inplace=True)
    return data

# Gradient Boosting обработка
X_train_gb = preprocess_data_for_gb(X_train)
X_test_gb = preprocess_data_for_gb(X_test)

# CatBoost обработка категориальных данных
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.to_list()
for col in categorical_features:
    X_train[col] = X_train[col].fillna('Unknown')
    X_test[col] = X_test[col].fillna('Unknown')

# CatBoost модель
catboost_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    verbose=100,
    cat_features=list(categorical_features)
)

# Gradient Boosting модель
gradient_boosting_model = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

# Обертки для обработки данных
catboost_transformer = FunctionTransformer(lambda X: X)  # Необработанные данные для CatBoost
gradient_boosting_transformer = FunctionTransformer(preprocess_data_for_gb)  # Обработанные данные для Gradient Boosting

# Стекинг с обработкой данных
stacking_model = StackingRegressor(
    estimators=[
        ('catboost', Pipeline([
            ('transform', catboost_transformer),
            ('model', catboost_model)
        ])),
        ('gradient_boosting', Pipeline([
            ('transform', gradient_boosting_transformer),
            ('model', gradient_boosting_model)
        ]))
    ],
    final_estimator=GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
)

# Обучение стекинга
print("Обучение стекинга...")
stacking_model.fit(X_train, y_train)

# Предсказания
print("Предсказания стекинга...")
y_pred = stacking_model.predict(X_test)

# Оценка качества
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"RMSE для стекинга: {rmse}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


Обучение стекинга...
0:	learn: 72182.6596632	total: 17.2ms	remaining: 8.59s
100:	learn: 16284.4692477	total: 1.56s	remaining: 6.18s
200:	learn: 11780.9827710	total: 2.98s	remaining: 4.43s
300:	learn: 9605.4763388	total: 4.58s	remaining: 3.03s
400:	learn: 7916.0581708	total: 6.15s	remaining: 1.52s
499:	learn: 6770.1610563	total: 7.78s	remaining: 0us


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


0:	learn: 71426.6671766	total: 14.8ms	remaining: 7.36s
100:	learn: 16426.5061042	total: 1.61s	remaining: 6.37s
200:	learn: 11883.1293289	total: 3.03s	remaining: 4.51s
300:	learn: 9594.9970112	total: 4.38s	remaining: 2.89s
400:	learn: 7658.1430854	total: 5.75s	remaining: 1.42s
499:	learn: 6398.8552749	total: 7.12s	remaining: 0us
0:	learn: 73451.9481633	total: 13.3ms	remaining: 6.66s
100:	learn: 15642.9247526	total: 1.45s	remaining: 5.73s
200:	learn: 12090.2733266	total: 2.84s	remaining: 4.23s
300:	learn: 9291.4721687	total: 4.28s	remaining: 2.83s
400:	learn: 7682.8462856	total: 5.81s	remaining: 1.44s
499:	learn: 6455.3868136	total: 7.35s	remaining: 0us
0:	learn: 70138.6760800	total: 12.4ms	remaining: 6.2s
100:	learn: 16522.0031760	total: 1.49s	remaining: 5.89s
200:	learn: 11626.6637859	total: 3.05s	remaining: 4.54s
300:	learn: 8946.2552018	total: 4.54s	remaining: 3s
400:	learn: 7064.9192299	total: 6.09s	remaining: 1.5s
499:	learn: 5698.7419763	total: 7.46s	remaining: 0us
0:	learn: 73729

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Condition2_PosA
- Condition2_RRNn
- Exterior1st_ImStucc
Feature names seen at fit time, yet now missing:
- BsmtCond_Po
- Condition1_RRNe
- Condition2_Artery
- Condition2_PosN
- Condition2_RRAe
- ...
