In [1]:
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the Boston house-prices dataset and print its built-in description.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell presumably requires scikit-learn < 1.2 — confirm the
# pinned version or switch to another dataset.
boston_data = load_boston()
print(boston_data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town

In [3]:
# Target vector: taken as-is from the dataset object.
y = boston_data.target
# Feature matrix: wrap the raw array in a DataFrame so columns carry their names.
X = pd.DataFrame(data=boston_data.data, columns=boston_data.feature_names)

In [4]:
# Preview the first ten feature rows to sanity-check column names and values.
X.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1


In [5]:
# (rows, columns) of the feature matrix.
X.shape

(506, 13)

In [6]:
# First ten target values, in the same row order as X.
y[:10]

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9])

In [7]:
# Length of the target vector; its first dimension should match X.shape[0].
y.shape

(506,)

In [8]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=41,
)

In [9]:
# Report the shapes of all four pieces of the split on a single line.
print(
    f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, ',
    f'y_train shape: {y_train.shape}, y_test shape: {y_test.shape}',
    sep='',
)

X_train shape: (379, 13), X_test shape: (127, 13), y_train shape: (379,), y_test shape: (127,)


In [10]:
# Fit a linear regression on the training split only; fit() returns the
# estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

In [11]:
# Predictions on the rows the model was fitted on (used for the training-set metrics below).
preds_train = model.predict(X_train)

In [12]:
# Predictions on the held-out rows (used for the test-set metrics below).
preds_test = model.predict(X_test)

In [13]:
# Side-by-side table of training predictions and their true targets, built in
# one constructor call instead of adding the second column afterwards.
preds_train_df = pd.DataFrame({
    'prediction': preds_train,
    'target': y_train,
})
preds_train_df.head(10)

Unnamed: 0,prediction,target
0,28.961183,28.6
1,25.60451,29.6
2,25.443472,25.0
3,16.99158,10.2
4,38.242076,43.1
5,17.706788,14.9
6,29.794092,24.6
7,20.979059,19.7
8,28.157681,22.8
9,17.152752,22.5


In [14]:
# One prediction per training row is expected here.
preds_train.shape

(379,)

In [15]:
# Side-by-side table of test predictions and their true targets, built in one
# constructor call instead of adding the second column afterwards.
preds_test_df = pd.DataFrame({
    'prediction': preds_test,
    'target': y_test,
})
preds_test_df.head(10)

Unnamed: 0,prediction,target
0,34.704252,41.3
1,24.177285,22.7
2,15.40839,23.1
3,32.087011,29.0
4,31.998783,31.1
5,20.551023,21.4
6,15.631726,18.4
7,30.742747,30.5
8,37.583024,36.0
9,-0.278141,17.9


In [16]:
# One prediction per held-out row is expected here.
preds_test.shape

(127,)

$MSE$

In [17]:
# Mean squared error on each split, computed once and then reported rounded
# to four decimal places.
mse_train = mean_squared_error(y_train, preds_train)
mse_test = mean_squared_error(y_test, preds_test)

print(f'Средняя квадратичная ошибка (MSE) модели на обучающей выборке {round(mse_train, 4)}')
print(f'Средняя квадратичная ошибка (MSE) модели на тестовой выборке {round(mse_test, 4)}')

Средняя квадратичная ошибка (MSE) модели на обучающей выборке 23.1456
Средняя квадратичная ошибка (MSE) модели на тестовой выборке 21.1398


$R^2$

In [18]:
# Coefficient of determination on each split, computed once and then reported
# rounded to four decimal places.
r2_train = r2_score(y_train, preds_train)
r2_test = r2_score(y_test, preds_test)

print(f'Коэффициент детерминации (R^2) модели на обучающей выборке {round(r2_train, 4)}')
print(f'Коэффициент детерминации (R^2) модели на тестовой выборке {round(r2_test, 4)}')

Коэффициент детерминации (R^2) модели на обучающей выборке 0.7605
Коэффициент детерминации (R^2) модели на тестовой выборке 0.5469
