In [1]:
import sklearn
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

## Датасет

In [2]:
df = pd.read_csv('Housing.csv')

In [3]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [6]:
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

## Кодирование категориальных признаков

In [7]:
import sklearn
import category_encoders as ce

In [8]:
bin_enc = ce.BinaryEncoder(drop_invariant=True)
df = bin_enc.fit_transform(df)

In [9]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad_0,mainroad_1,guestroom_0,guestroom_1,basement_0,basement_1,hotwaterheating_0,hotwaterheating_1,airconditioning_0,airconditioning_1,parking,prefarea_0,prefarea_1,furnishingstatus_0,furnishingstatus_1
0,13300000,7420,4,2,3,0,1,0,1,0,1,0,1,0,1,2,0,1,0,1
1,12250000,8960,4,4,4,0,1,0,1,0,1,0,1,0,1,3,1,0,0,1
2,12250000,9960,3,2,2,0,1,0,1,1,0,0,1,1,0,2,0,1,1,0
3,12215000,7500,4,2,2,0,1,0,1,1,0,0,1,0,1,3,0,1,0,1
4,11410000,7420,4,1,2,0,1,1,0,1,0,0,1,0,1,2,1,0,0,1


## Разделение выборки на обучающую и тестовую

In [10]:
from sklearn.model_selection import train_test_split

y = df["price"]
X = df.drop(["price"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size= 0.33, random_state=42)

## Случайный лес

In [11]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
prediction = rfr.predict(X_test)

In [12]:
params = {
    'max_depth' : [10, 15, 20],
}

grid = GridSearchCV(estimator=RandomForestRegressor(),
                    param_grid=params, 
                    cv=5)
grid.fit(X_train, y_train)

In [13]:
grid.best_score_, grid.best_params_

(0.5722652910127637, {'max_depth': 10})

In [14]:
rfr = RandomForestRegressor(max_depth=15, random_state=42)
rfr.fit(X_train, y_train)
prediction = rfr.predict(X_test)
r2_score(y_test, prediction)

0.5936798681743574

## Бустинг

In [15]:
params = {
    'n_estimators': [500, 800],
    'max_depth': [5, 8],
    'min_samples_split': [2, 5],
    'learning_rate': [0.01, 0.1]
}
grid = GridSearchCV(estimator=GradientBoostingRegressor(),
                    param_grid=params,
                    cv=5,
                    n_jobs=-1)
grid.fit(X_train, y_train)

In [16]:
grid.best_score_, grid.best_params_

(0.5739527909981744,
 {'learning_rate': 0.01,
  'max_depth': 5,
  'min_samples_split': 2,
  'n_estimators': 500})

In [17]:
gbr = GradientBoostingRegressor(**grid.best_params_)
gbr.fit(X_train, y_train)
prediction = gbr.predict(X_test)
r2_score(y_test, prediction)

0.5406658530650441

## Стекинг

In [18]:
rf = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)
lr = LinearRegression()

sr = StackingCVRegressor(regressors=(rf, gb),
                         meta_regressor=lr)

params = {
    'randomforestregressor__max_depth': [2, 5, 8]
}

grid = GridSearchCV(estimator=sr,
                    param_grid=params,
                    cv=5,
                    n_jobs=-1)
grid.fit(X_train, y_train)

In [19]:
grid.best_score_, grid.best_params_

(0.5865938026918748, {'randomforestregressor__max_depth': 8})