In [11]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

dataset = load_boston()

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score

In [8]:

X = dataset.data
y = dataset.target

data = pd.DataFrame(np.c_[X, y], columns=np.hstack((dataset.feature_names, 'TARGET')))

In [9]:

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
TARGET     506 non-null float64
dtypes: float64(14)
memory usage: 55.4 KB


In [10]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,TARGET
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [13]:
X = data.drop('TARGET', axis=1)
y = data.TARGET

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
sc = StandardScaler()

X = sc.fit_transform(X_train)

## KNN

In [27]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

knn_param_grid = dict(n_neighbors=list(range(2, 20)),
                      weights=['uniform','distance'],
                      algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute'])
knn = KNeighborsRegressor()
knn_grid = GridSearchCV(knn, knn_param_grid, cv=10, scoring='neg_mean_absolute_error',n_jobs=-1)
knn_grid.fit(X_train, y_train)


print(knn_grid.best_score_)
print(knn_grid.best_params_)
print(knn_grid.best_estimator_)

-2.6495912558541876
{'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'distance'}
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='distance')




# Tree ensemble

In [30]:
from sklearn.ensemble import GradientBoostingRegressor
ensemble_param_grid = dict(loss = ['ls', 'lad', 'huber', 'quantile'],
                           learning_rate = list(np.arange(0.01, 0.32,0.02).round(2)),
                           subsample = list(np.arange(0.1, 1,0.1).round(2)))
ensemble = GradientBoostingRegressor()
ensemble_grid = GridSearchCV(ensemble, ensemble_param_grid, cv=10, scoring='neg_mean_absolute_error',n_jobs=-1)
ensemble_grid.fit(X_train, y_train)
print(ensemble_grid.best_score_)
print(ensemble_grid.best_params_)
print(ensemble_grid.best_estimator_)

-2.1263657669415563
{'learning_rate': 0.13, 'loss': 'ls', 'subsample': 0.6}
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.13, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=0.6, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)




# Linear Regression

In [31]:
from sklearn.linear_model import LinearRegression
lr_param_grid = dict(fit_intercept = [False, True],
                     normalize = [False, True],
                     copy_X  = [False, True])
lr = LinearRegression()
lr_grid = GridSearchCV(lr, lr_param_grid,  cv=10, scoring='neg_mean_absolute_error',n_jobs=-1)
lr_grid.fit(X_train, y_train)
print(lr_grid.best_score_)
print(lr_grid.best_params_)
print(lr_grid.best_estimator_)

-3.4335330398107473
{'copy_X': False, 'fit_intercept': True, 'normalize': False}
LinearRegression(copy_X=False, fit_intercept=True, n_jobs=None, normalize=False)




# RandomForest

In [37]:
from sklearn.ensemble import RandomForestRegressor
rf_param_grid = dict(n_estimators = range(1,15, 1), max_depth = range(1, 15))
rf = RandomForestRegressor()
rf_grid = GridSearchCV(rf, rf_param_grid,  cv=10, scoring='neg_mean_absolute_error',n_jobs=-1)
rf_grid.fit(X_train, y_train)
print(rf_grid.best_score_)
print(rf_grid.best_params_)
print(rf_grid.best_estimator_)

-2.304640358277972
{'max_depth': 10, 'n_estimators': 10}
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)




Видим что лучшие результаты у ансамбля деревья решений.

Теперь проверим результаты наих моделей на тестовой выборке

In [None]:
#knn

In [43]:
mean_absolute_error(knn_grid.best_estimator_.predict(X_test), y_test)

2.7507862681094863

In [None]:
# Tree Ensemble

In [44]:
mean_absolute_error(ensemble_grid.best_estimator_.predict(X_test), y_test)

1.995830335176924

In [None]:
# Linear Regression

In [45]:
mean_absolute_error(lr_grid.best_estimator_.predict(X_test), y_test)

3.2675857989097947

In [None]:
# Random Forest

In [46]:
mean_absolute_error(rf_grid.best_estimator_.predict(X_test), y_test)

2.155779854174389

На первом месте по качеству у нас получился ансамбль деревьев а после него с небольшим отрывом Random Forest

Такие же места по качеству получились и на трейновой выборке
