# Построение моделей

In [2]:
from sklearn.linear_model import LinearRegression as LR
import numpy as np
import pandas as pd
# Read 'data_cleaned.csv' into a DataFrame and display its first five rows.
data = pd.read_csv(filepath_or_buffer='data_cleaned.csv')
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Bal,Brick,Dist,Floor,Kitsp,Livesp,Metrdist,New,Nfloor,Price,Rooms,Totsp,Walk
0,0,1,1,0.592859,4,3.135494,4.553877,7.0,0,8.0,71014225,3.0,4.770685,1.0
1,1,1,1,1.23963,6,3.621671,3.806662,9.0,0,14.0,85000000,3.0,4.737075,1.0
2,2,1,1,0.781791,7,2.564949,4.234107,6.0,0,8.0,55000000,3.0,4.820282,1.0
3,3,1,1,3.381916,2,2.079442,2.76001,20.0,1,4.0,2316750,1.0,3.430756,0.0
4,4,1,1,3.068028,2,2.4681,3.600048,15.0,1,19.0,3650000,2.0,4.301359,0.0


### Зависимости, в которых мы логарифмировали цену, очень неплохо походили на линейные. В связи с большим разбросом цен хорошей метрикой качества для наших моделей могла бы стать MAPE или MSPE. Принимая всё это во внимание, будем предсказывать логарифм цены.

In [3]:
# Drop the leftover 'Unnamed: 0' column and switch the target to log(Price).
# The axis is passed by keyword: the positional form drop(label, 1) is
# deprecated and rejected by pandas 2.x.
# NOTE: this cell is not idempotent - re-running it on the already transformed
# frame raises KeyError because 'Unnamed: 0' is gone.
data = data.drop('Unnamed: 0', axis=1)
# Vectorised natural log of the whole column.
data['Price'] = np.log(data['Price'])
data.head()

Unnamed: 0,Bal,Brick,Dist,Floor,Kitsp,Livesp,Metrdist,New,Nfloor,Price,Rooms,Totsp,Walk
0,1,1,0.592859,4,3.135494,4.553877,7.0,0,8.0,18.078391,3.0,4.770685,1.0
1,1,1,1.23963,6,3.621671,3.806662,9.0,0,14.0,18.258162,3.0,4.737075,1.0
2,1,1,0.781791,7,2.564949,4.234107,6.0,0,8.0,17.822844,3.0,4.820282,1.0
3,1,1,3.381916,2,2.079442,2.76001,20.0,1,4.0,14.655676,1.0,3.430756,0.0
4,1,1,3.068028,2,2.4681,3.600048,15.0,1,19.0,15.110238,2.0,4.301359,0.0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of
# `data`; Price has already been log-transformed at this point.
data.describe()

Unnamed: 0,Bal,Brick,Dist,Floor,Kitsp,Livesp,Metrdist,New,Nfloor,Price,Rooms,Totsp,Walk
count,5754.0,5754.0,5754.0,5754.0,5754.0,5754.0,5754.0,5754.0,5754.0,5754.0,5754.0,5754.0,5754.0
mean,0.918144,0.459854,2.431488,7.093848,2.159514,3.267184,10.987313,0.22732,13.825165,15.885618,1.739275,3.852422,0.696211
std,0.274169,0.498429,0.578211,5.788847,0.373936,0.427737,6.249938,0.419137,7.835282,0.532671,0.794763,0.349293,0.459932
min,0.0,0.0,0.087027,1.0,0.0,1.386294,1.0,0.0,1.0,14.151586,0.846768,2.406945,0.0
25%,1.0,0.0,2.187919,3.0,1.871802,2.944439,7.0,0.0,9.0,15.573368,1.0,3.637586,0.0
50%,1.0,0.0,2.563464,5.0,2.151762,3.258097,10.0,0.0,12.0,15.825769,2.0,3.804438,1.0
75%,1.0,1.0,2.798027,10.0,2.332144,3.526361,15.0,0.0,17.0,16.102982,2.0,4.043051,1.0
max,1.0,1.0,4.116403,45.0,3.73767,5.762051,54.0,1.0,77.0,19.191392,5.0,5.934894,1.0


### Начнём с классики - линейной регрессии.

In [6]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# The fallback keeps the cell working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

# Features are every column except the target (log price). The axis is passed
# by keyword because the positional form drop(label, 1) is rejected by pandas 2.x.
features = data.drop('Price', axis=1)
target = data['Price']

# Hold out 33% of the rows. A fixed seed makes the split - and every metric
# computed on it in the cells below - reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.33, random_state=42
)

# Baseline model: ordinary least squares.
lr = LR()
lr.fit(X_train, y_train)

# Predict once and reuse the result for both metrics; arguments follow
# scikit-learn's (y_true, y_pred) convention.
lr_pred = lr.predict(X_test)
print('RMSE: ', mse(y_test, lr_pred)**0.5)
print('MAE: ', mae(y_test, lr_pred))

RMSE:  0.220517030116
MAE:  0.153990762136


In [14]:
# Coefficient table for the fitted linear model. The labels come from the
# training matrix itself so they always line up with lr.coef_; this also
# avoids the positional-axis drop(label, 1) call that pandas 2.x rejects.
pd.DataFrame(lr.coef_, index=X_train.columns, columns=['coef'])

Unnamed: 0,coef
Bal,-0.083781
Brick,0.033047
Dist,-0.37124
Floor,0.004738
Kitsp,0.06765
Livesp,-0.044837
Metrdist,-0.006004
New,-0.189499
Nfloor,0.005153
Rooms,-0.068258


### Для ценообразования наиболее важными являются такие признаки, как полная площадь, время до метро и можно ли дойти до него пешком, первичный рынок или вторичка.

## MAE логарифма — 0.15, т.е. MAPE цены около 15% (при малых ошибках $|\Delta \ln p| \approx |\Delta p| / p$). Что ж, неплохо.

## Конечно же, попробуем xgboost.

In [8]:
from xgboost import XGBRegressor as XGB

In [9]:
%%time
xgb1 = XGB(min_child_weight=10, n_estimators=1000, max_depth=10, subsample=0.66)
xgb1.fit(X_train, y_train)
print('RMSE: ', mse(xgb1.predict(X_test), y_test)**0.5)
print('MAE: ', mae(xgb1.predict(X_test), y_test))

RMSE:  0.166778634905
MAE:  0.113442709423
CPU times: user 22.7 s, sys: 268 ms, total: 23 s
Wall time: 6.4 s


## Эксперименты показывают, что сильно меньше MAE мы уже не сделаем. MAPE около 11%.

In [10]:
%%time
xgb1 = XGB(min_child_weight=7, n_estimators=10000, max_depth=20, subsample=0.66, colsample_bylevel=0.66)
xgb1.fit(X_train, y_train)
print('RMSE: ', mse(xgb1.predict(X_test), y_test)**0.5)
print('MAE: ', mae(xgb1.predict(X_test), y_test))

RMSE:  0.167274436086
MAE:  0.112867210463
CPU times: user 1min 39s, sys: 1.26 s, total: 1min 40s
Wall time: 27.3 s


## Lasso и Ridge регрессии

In [23]:
from sklearn.linear_model import Ridge as R
from sklearn.linear_model import Lasso as L
# Lasso (l) and Ridge (r) regressors, both built with scikit-learn's default
# hyper-parameters. Only Lasso is fitted and scored in this cell.
l = L()
r = R()
l.fit(X_train, y_train)
# NOTE(review): the default regularisation strength looks too strong for a
# log-price target - the RMSE recorded for this cell is close to the target's
# standard deviation. Consider tuning alpha (e.g. with LassoCV) - TODO confirm.
# Predict once and reuse for both metrics, in (y_true, y_pred) order.
lasso_pred = l.predict(X_test)
print('RMSE: ', mse(y_test, lasso_pred)**0.5)
print('MAE: ', mae(y_test, lasso_pred))

RMSE:  0.556626172266
MAE:  0.379426297292


In [16]:
# Fit the Ridge regressor `r` on the training split and score it on the
# hold-out set. Predictions are computed once and shared by both metrics,
# with arguments in scikit-learn's (y_true, y_pred) order.
r.fit(X_train, y_train)
ridge_pred = r.predict(X_test)
print('RMSE: ', mse(y_test, ridge_pred)**0.5)
print('MAE: ', mae(y_test, ridge_pred))

RMSE:  0.220346014352
MAE:  0.153859392017
