In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import probplot
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model  import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import KNNImputer

In [2]:
train_path = 'data/Housing Prices Competition/train.csv'
home_data = pd.read_csv(train_path)
y = home_data.SalePrice

In [4]:
# Create X (After completing the exercise, you can return to modify this line!)
features = ['BsmtUnfSF', 'TotRmsAbvGrd', 'YearRemodAdd', 'LotFrontage', 'FullBath', 'YearBuilt', 'LotArea', 
'GarageArea', 'GarageCars', '1stFlrSF', 'BsmtFinSF1', '2ndFlrSF', 'TotalBsmtSF', 'GrLivArea', 'OverallQual', 
'Fireplaces', 'OpenPorchSF', 'GarageYrBlt', 'MasVnrArea', 'WoodDeckSF', 'HalfBath']

X = home_data[features]
X.head()

object_columns = home_data.select_dtypes(include=['object'])
object_columns['MSSubClass'] = home_data['MSSubClass']
obj_cols = object_columns.drop(columns=['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'])

imputer = KNNImputer(n_neighbors=5)

obj_imputer = SimpleImputer(strategy='most_frequent')

obj_cols_imputed = obj_imputer.fit_transform(obj_cols)

obj_cols = pd.DataFrame(obj_cols_imputed, columns=obj_cols.columns)

X_imputed = imputer.fit_transform(X)

X = pd.DataFrame(X_imputed, columns=features)

merge_x_obj_cols = pd.concat([X, obj_cols.reset_index(drop=True)], axis=1)

In [5]:
# Применяем One-Hot Encoding для категориальных колонок
X = pd.get_dummies(merge_x_obj_cols)

y_log = np.log1p(home_data.SalePrice)

X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2)

y = np.expm1(y_log)

lr = LinearRegression()
lr.fit(X_train,  y_train)

clf = RandomForestRegressor(n_estimators=100,  max_features=0.3)
clf.fit(X_train, y_train)

gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
gb.fit(X_train, y_train)

# Предсказания моделей в логарифмированной шкале
lr_predictions_log = lr.predict(X_test)
clf_predictions_log = clf.predict(X_test)

min_mse = mean_squared_error(y_test, lr.predict(X_test))
for i in range(100):
    mse = mean_squared_error(y_test, (i*lr.predict(X_test) + (100 - i)*clf.predict(X_test))/100.0)
    if mse < min_mse:
        min_mse = mse
        best_i = i

# Комбинирование предсказаний (в логарифмированной шкале)
combined_predictions_log = (best_i * lr_predictions_log + (100 - best_i) * clf_predictions_log) / 100.0

# Обратное преобразование предсказаний (возврат к исходному масштабу)
combined_predictions = np.exp(combined_predictions_log)

# Вычисление MSE в исходном масштабе
mse = mean_squared_error(y_test, combined_predictions)
print(f"MSE после обратного преобразования: {mse}")

MSE после обратного преобразования: 36961789664.383934
