# Прогнозирование цен объектов недвижимости #

Датасет:

*Kaggle House Prices*

[https://www.kaggle.com/c/house-prices-advanced-regression-techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques)

Решаем задачу регрессии: прогнозирование стоимости объекта недвижимости в зависимости от его характеристик

Используем ансамблирование моделей для повышения точности результатов

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
kaggle_train = pd.read_csv('train.csv')
kaggle_test = pd.read_csv('test.csv')

In [3]:
print(kaggle_train.shape)
print(kaggle_test.shape)

(1460, 81)
(1459, 80)


In [4]:
kaggle_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
kaggle_train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [6]:
kaggle_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [7]:
kaggle_test.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### Разделяем обучающую и валидационную выборки, выделяем hold-out датасет ###

In [8]:
# выделяем из датасета значения целевой переменной
from sklearn.model_selection import train_test_split
X = kaggle_train.copy()
X.drop(columns=['SalePrice'], inplace=True)
y = pd.DataFrame(kaggle_train['SalePrice'])

In [9]:
X.shape, y.shape

((1460, 80), (1460, 1))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
X_train.shape, y_train.shape

((1168, 80), (1168, 1))

In [12]:
X_test.shape, y_test.shape

((292, 80), (292, 1))

### Выделяем числовые данные ###

Так как решаем задачу регрессии, а фичей много, то оставим только числовые фичи и с ними будем работать дальше.

In [13]:
X_train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
                  ...   
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
KitchenQual       object
TotRmsAbvGrd       int64
Functional        object
Fireplaces         int64
FireplaceQu       object
GarageType        object


In [14]:
X_train_num = X_train.select_dtypes(exclude='object')

In [15]:
X_train_num.dtypes

Id                 int64
MSSubClass         int64
LotFrontage      float64
LotArea            int64
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
MasVnrArea       float64
BsmtFinSF1         int64
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
1stFlrSF           int64
2ndFlrSF           int64
LowQualFinSF       int64
GrLivArea          int64
BsmtFullBath       int64
BsmtHalfBath       int64
FullBath           int64
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
TotRmsAbvGrd       int64
Fireplaces         int64
GarageYrBlt      float64
GarageCars         int64
GarageArea         int64
WoodDeckSF         int64
OpenPorchSF        int64
EnclosedPorch      int64
3SsnPorch          int64
ScreenPorch        int64
PoolArea           int64
MiscVal            int64
MoSold             int64
YrSold             int64
dtype: object

In [16]:
# параметр Id лучше убрать, т.к. это просто порядковый номер объекта
# параметр MsSubClass представляет собой числовой код, это не количественная характеристика, убираем
# пареметр MoSold - месяц, в котором была совершена сделка, убираем эту фичу
cols_to_drop = ['Id', 'MSSubClass', 'MoSold']
X_train_num.drop(columns=cols_to_drop, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [17]:
X_train_num.dtypes

LotFrontage      float64
LotArea            int64
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
MasVnrArea       float64
BsmtFinSF1         int64
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
1stFlrSF           int64
2ndFlrSF           int64
LowQualFinSF       int64
GrLivArea          int64
BsmtFullBath       int64
BsmtHalfBath       int64
FullBath           int64
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
TotRmsAbvGrd       int64
Fireplaces         int64
GarageYrBlt      float64
GarageCars         int64
GarageArea         int64
WoodDeckSF         int64
OpenPorchSF        int64
EnclosedPorch      int64
3SsnPorch          int64
ScreenPorch        int64
PoolArea           int64
MiscVal            int64
YrSold             int64
dtype: object

### Очищаем данные от пропусков ###

Мы работаем только с числовыми данными, поэтому пропуски можно заменить на средние значения (mean).

In [18]:
y_train.isnull().any().any()

False

In [19]:
X_train_num.isnull().any().any()

True

In [20]:
X_train_num_notnull = X_train_num.fillna(X_train_num.mean())

In [21]:
X_train_num_notnull.isnull().any().any()

False

### Нормализуем данные ###

Используем для нормализации StandardScaler

In [22]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_num_notnull_scaled = scaler.fit_transform(X_train_num_notnull)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [23]:
X_train_num_notnull_scaled.shape

(1168, 34)

### Процедура обработки данных ###
Напишем функцию для обработки данных, которую затем можно будет применить к тестовому/валидационному датасету, а также к выборке для kaggle

In [24]:
def preprocess_data(x):
    ret = x.copy()
    ret = ret.select_dtypes(exclude='object')
    ret.drop(columns=cols_to_drop, inplace=True)
    ret = ret.fillna(x.mean())
    ret = scaler.transform(ret)
    return ret

In [25]:
X_test_preprocessed = preprocess_data(X_test)

  


### Одноуровневые модели ###
Решаем задачу с помощью моделей для алгоритмов регрессии

In [26]:
# в данный словарь будем сохранять результаты работы всех моделей, чтобы потом сравнить
scores = {}

In [27]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_num_notnull_scaled, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [28]:
scores['LinearRegression'] = lr.score(X_test_preprocessed, y_test)
scores['LinearRegression']

0.7489773199510132

In [29]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr.fit(X_train_num_notnull_scaled, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [30]:
scores['DecisionTreeRegression'] = dtr.score(X_test_preprocessed, y_test)
scores['DecisionTreeRegression']

0.7609900778259717

In [31]:
from sklearn.linear_model import Lasso
lsr = Lasso()
lsr.fit(X_train_num_notnull_scaled, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [32]:
scores['Lasso'] = lsr.score(X_test_preprocessed, y_test)
scores['Lasso']

0.7492008022068484

In [33]:
from sklearn.linear_model import Ridge
rr = Ridge()
rr.fit(X_train_num_notnull_scaled, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [34]:
scores['Ridge'] = rr.score(X_test_preprocessed, y_test)
scores['Ridge']

0.7492640942082089

In [35]:
scores

{'LinearRegression': 0.7489773199510132,
 'DecisionTreeRegression': 0.7609900778259717,
 'Lasso': 0.7492008022068484,
 'Ridge': 0.7492640942082089}

**Используем кросс-валидацию для обучения и оценки качества моделей**

In [36]:
from sklearn.model_selection import cross_val_score

In [37]:
scores['LinearRegression_cv'] = np.mean(cross_val_score(LinearRegression(), preprocess_data(X), y, cv=10))
scores['LinearRegression_cv']

  


0.7903740771376524

In [38]:
scores['DecisionTreeRegressor_cv'] = np.mean(cross_val_score(DecisionTreeRegressor(), preprocess_data(X), y, cv=10))
scores['DecisionTreeRegressor_cv']

  


0.7513985666638767

In [39]:
scores['Lasso_cv'] = np.mean(cross_val_score(Lasso(), preprocess_data(X), y, cv=10))
scores['Lasso_cv']

  


0.7903952087108608

In [40]:
scores['Ridge_cv'] = np.mean(cross_val_score(Ridge(), preprocess_data(X), y, cv=10))
scores['Ridge_cv']

  


0.7904577642763879

**Строим случайный лес и выводим важность признаков**

In [41]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=50)
rf.fit(X_train_num_notnull_scaled, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [42]:
scores['RandomForestRegressor'] = rf.score(X_test_preprocessed, y_test)
scores['RandomForestRegressor']

0.869386816694792

In [43]:
feat_importances = pd.Series(rf.feature_importances_, index=X_train_num_notnull.columns)
feat_importances.nlargest(20).plot(kind='barh')

<matplotlib.axes._subplots.AxesSubplot at 0x205f02befd0>

In [44]:
scores['RandomForestRegressor_cv'] = np.mean(cross_val_score(RandomForestRegressor(n_estimators=50), preprocess_data(X), y, cv=10))
scores['RandomForestRegressor_cv']

  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.8625057185902969

In [45]:
sorted(scores.items(), key=lambda val: val[1], reverse=True)

[('RandomForestRegressor', 0.869386816694792),
 ('RandomForestRegressor_cv', 0.8625057185902969),
 ('Ridge_cv', 0.7904577642763879),
 ('Lasso_cv', 0.7903952087108608),
 ('LinearRegression_cv', 0.7903740771376524),
 ('DecisionTreeRegression', 0.7609900778259717),
 ('DecisionTreeRegressor_cv', 0.7513985666638767),
 ('Ridge', 0.7492640942082089),
 ('Lasso', 0.7492008022068484),
 ('LinearRegression', 0.7489773199510132)]

### Стекинг моделей ###

In [46]:
from sklearn.model_selection import KFold
stack_cv = KFold(n_splits=10)

In [47]:
def get_meta_features(clf, X_train, y_train, X_test, stack_cv):
    meta_train = np.zeros_like(y_train, dtype=float).reshape(-1)
    meta_test = np.zeros_like(y_test, dtype=float).reshape(-1)
    
    for i, (train_ind, test_ind) in enumerate(stack_cv.split(X_train, y_train)):
        clf.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
        meta_train[test_ind] = clf.predict(X_train.iloc[test_ind]).reshape(-1)
        meta_test += clf.predict(X_test).reshape(-1)
    
    return meta_train, (meta_test / stack_cv.n_splits)

In [48]:
X_train_df = pd.DataFrame(X_train_num_notnull_scaled, columns=X_train_num_notnull.columns)
meta_train, meta_test, col_names = [], [], []

print('Ridge features...')
meta_tr, meta_te = get_meta_features(rr, X_train_df, y_train, X_test_preprocessed, stack_cv)
meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('ridge_pred')

print('Lasso features...')
meta_tr, meta_te = get_meta_features(lsr, X_train_df, y_train, X_test_preprocessed, stack_cv)
meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('lasso_pred')

print('RandomForest features...')
meta_tr, meta_te = get_meta_features(rf, X_train_df, y_train, X_test_preprocessed, stack_cv)
meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('rf_pred')

Ridge features...
Lasso features...
RandomForest features...


  
  
  
  
  
  
  
  
  
  


In [49]:
X_meta_train = pd.DataFrame(columns=col_names)
X_meta_train['ridge_pred'] = meta_train[0]
X_meta_train['lasso_pred'] = meta_train[1]
X_meta_train['rf_pred'] = meta_train[2]
X_meta_test = pd.DataFrame(columns=col_names)
X_meta_test['ridge_pred'] = meta_test[0]
X_meta_test['lasso_pred'] = meta_test[1]
X_meta_test['rf_pred'] = meta_test[2]

In [50]:
X_meta_train.head()

Unnamed: 0,ridge_pred,lasso_pred,rf_pred
0,372963.133054,373026.547971,461419.44
1,226638.770745,226668.577358,248247.16
2,117392.315924,117367.888623,117453.2
3,162036.598736,162095.805043,154008.04
4,144874.411734,144876.256314,140944.66


In [51]:
X_meta_test.head()

Unnamed: 0,ridge_pred,lasso_pred,rf_pred
0,97775.649357,97751.528828,126490.55
1,207542.356381,207534.146133,184004.124
2,210845.301354,210815.620493,215610.788
3,254254.836778,254228.326626,259656.386
4,130689.603917,130690.143692,142052.6


In [52]:
meta_lr = LinearRegression()
meta_lr.fit(X_meta_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [53]:
scores['Stack_Ridge-Lasso-RF_LR'] = meta_lr.score(X_meta_test, y_test)
sorted(scores.items(), key=lambda val: val[1], reverse=True)

[('Stack_Ridge-Lasso-RF_LR', 0.8836823860891857),
 ('RandomForestRegressor', 0.869386816694792),
 ('RandomForestRegressor_cv', 0.8625057185902969),
 ('Ridge_cv', 0.7904577642763879),
 ('Lasso_cv', 0.7903952087108608),
 ('LinearRegression_cv', 0.7903740771376524),
 ('DecisionTreeRegression', 0.7609900778259717),
 ('DecisionTreeRegressor_cv', 0.7513985666638767),
 ('Ridge', 0.7492640942082089),
 ('Lasso', 0.7492008022068484),
 ('LinearRegression', 0.7489773199510132)]

### Выводы ###

- наилучшие результаты в решении данной задачи регрессии показал стек моделей и RandomForestRegressor
- стоит отметить, что при запуске скрипта и новом обучении моделей результаты меняются, и RandomForestRegressor зачастую даже превосходит стек
- для улучшения результатов RandomForestRegressor нужно подбирать гиперпараметры - max_depth, min_samples_leaf
- для улучшения результатов стека можно пробовать менять его состав и выбор модели для второго уровня
- результаты были бы другими, если бы использовался другой подход к предобработке данных (в нашем случае все нечисловые фичи были исключены из рассмотрения)

**Данные для сабмита на Kaggle**

In [54]:
X_test_kaggle = preprocess_data(kaggle_test)

  


In [56]:
y_pred_kaggle = rf.predict(X_test_kaggle)

In [58]:
len(y_pred_kaggle)

1459

In [60]:
kaggle_test.shape

(1459, 80)

In [61]:
kaggle_result = pd.DataFrame(columns=['Id', 'SalePrice'])
kaggle_result['Id'] = kaggle_test.Id
kaggle_result['SalePrice'] = y_pred_kaggle

In [62]:
kaggle_result.head()

Unnamed: 0,Id,SalePrice
0,1461,128632.0
1,1462,158732.0
2,1463,178949.6
3,1464,178867.3
4,1465,198131.14


In [64]:
kaggle_result.to_csv('kaggle_submit_1.csv', index=False)