In [89]:
# This assignment uses the Boston housing dataset: the task is to predict
# house prices from characteristics of the location
# (air pollution, proximity to roads, etc.).
# https://archive.ics.uci.edu/ml/machine-learning-databases/housing/

from sklearn.datasets import load_boston
boston = load_boston()
# Show the feature-matrix shape, the first feature row and the first five targets.
for preview in (boston.data.shape, boston.data[:1], boston.target[:5]):
    print(preview)

(506, 13)
[[  6.32000000e-03   1.80000000e+01   2.31000000e+00   0.00000000e+00
    5.38000000e-01   6.57500000e+00   6.52000000e+01   4.09000000e+00
    1.00000000e+00   2.96000000e+02   1.53000000e+01   3.96900000e+02
    4.98000000e+00]]
[ 24.   21.6  34.7  33.4  36.2]


In [90]:
# Bring all features to a common scale with sklearn.preprocessing.scale.
from sklearn import preprocessing
scaled = preprocessing.scale(boston.data, axis=0)  # axis=0 is the default, spelled out here
scaled[:1]

array([[-0.41771335,  0.28482986, -1.2879095 , -0.27259857, -0.14421743,
         0.41367189, -0.12001342,  0.1402136 , -0.98284286, -0.66660821,
        -1.45900038,  0.44105193, -1.0755623 ]])

In [91]:
# With scikit-learn 0.18.1 and later, scoring='neg_mean_squared_error' must be specified.
import sklearn
version_message = 'The scikit-learn version is {}.'.format(sklearn.__version__)
print(version_message)

The scikit-learn version is 0.18.1.


In [92]:
# Try different values of the metric parameter p on a grid from 1 to 10,
# with a step chosen so that 200 values are tested in total.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor

size = 200
indexes = np.linspace(start=1.0, stop=10.0, num=size)  # candidate p values
cv_score = np.zeros(shape=size)  # one slot per candidate p, initialised to zero

# 5-fold shuffled split with a fixed random_state.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Preview the first 15 train/test indices of every fold.
for train_index, test_index in kf.split(scaled):
    print("TRAIN:", train_index[:15], "TEST:", test_index[:15])

TRAIN: [ 1  3  4  5  6  7  8 10 12 13 14 15 16 17 19] TEST: [ 0  2  9 11 18 22 30 33 39 46 55 63 68 69 70]
TRAIN: [ 0  1  2  4  6  8  9 10 11 12 13 14 18 20 21] TEST: [ 3  5  7 15 16 17 19 23 24 25 26 29 31 37 42]
TRAIN: [ 0  1  2  3  4  5  7  8  9 11 12 13 14 15 16] TEST: [  6  10  36  38  59  74  81  83  89  96  97 103 111 112 119]
TRAIN: [ 0  1  2  3  5  6  7  9 10 11 13 15 16 17 18] TEST: [ 4  8 12 14 27 28 32 35 40 41 44 47 51 61 62]
TRAIN: [ 0  2  3  4  5  6  7  8  9 10 11 12 14 15 16] TEST: [ 1 13 20 21 34 43 48 49 50 52 53 54 58 71 80]


In [93]:
# Score every candidate p by cross-validation with the kf splitter.
# cross_val_score clones the estimator and fits it on each training fold itself,
# so no separate fit on the full data set is needed beforehand.
for i, param in enumerate(indexes):
    neigh = KNeighborsRegressor(n_neighbors=5, weights='distance', metric='minkowski', p=param)
    # Mean negated MSE over the folds; higher (closer to zero) is better.
    cv_score[i] = cross_val_score(estimator=neigh, X=scaled, y=boston.target, cv=kf, scoring='neg_mean_squared_error').mean()

In [94]:
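# Negative MSE score
# https://stackoverflow.com/questions/21050110/sklearn-gridsearchcv-with-pipeline
# The scores returned for scoring='neg_mean_squared_error' are negated MSE values; negate them to get the MSE.
# The conversion is left disabled, so the scores stay negated and a higher score means a lower MSE:
# cv_score = cv_score*(-1)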

In [95]:
# Indices into the p grid, ordered from the lowest (worst) to the highest (best) mean score.
# Deliberately not named `sorted`: that would shadow the Python builtin for the rest of the session.
score_order = cv_score.argsort()
score_order

array([188, 187, 186, 185, 184, 199, 183, 182, 181, 180, 179, 191, 156,
       154, 198, 155, 197, 153, 196, 190, 152, 195, 189, 194, 193, 192,
       151, 148, 178, 157, 150, 149, 146, 177, 165, 170, 167, 145, 166,
       144, 169, 168, 164, 143, 147, 142, 163, 162, 176, 173, 175, 171,
       172, 174, 141, 140, 139, 159, 158, 161, 160, 138, 135, 134, 133,
       137, 136, 132, 121, 123, 131, 127, 130, 128, 129, 122, 124, 126,
       125, 120, 119, 118, 108, 107, 109, 111, 110, 112, 106, 113, 117,
       115, 116, 114, 104, 103, 105, 100, 102, 101,  88,  99,  85,  87,
        86,  98,  97,  84,  94,  90,  83,  89,  92,  96,  95,  93,  91,
        82,  80,  81,  79,  75,  73,  74,  72,  78,  76,  77,  67,  66,
        68,  70,  69,  65,  71,  63,  64,  62,  59,  58,  61,  60,  57,
        56,  54,  55,  52,  53,  49,  50,  48,  47,  51,  46,  45,  44,
        43,  42,  25,  40,  41,  32,  31,  30,  26,  29,  38,  39,  28,
        35,  37,  36,  20,  24,  21,  23,  34,  27,  22,  19,  3

In [96]:
# Position of the highest mean CV score in the p grid.
np.argmax(cv_score)

0

In [98]:
# Answer 1: the p with the highest mean negated-MSE score (p = 1.0 in the recorded run).
# Looked up from the scores rather than hard-coded, so it stays correct if the scores change.
indexes[cv_score.argmax()]

1.0