In [11]:
!pip install boston
import pandas as pd
from boston import load_boston
import numpy as np



## Read Data

In [19]:
# Load the dataset object returned by load_boston() and list the keys it exposes.
data = load_boston()
data.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module'])

In [44]:
# Feature matrix and target vector, plus the column labels for the features.
X, y = data.data, data.target
features = data.feature_names

In [29]:
# Tabular view: one column per feature, with the target appended as the last column.
df = pd.DataFrame(X, columns=features).assign(target=y)
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


## Обучение модели

In [52]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the full dataset; fit() returns the estimator itself.
lr = LinearRegression().fit(X, y)
# In-sample predictions: the model is evaluated on the same rows it was fitted on.
prediction = lr.predict(X)

## Метрики

In [77]:
# Side-by-side table of observed targets and model predictions.
pred_df = pd.DataFrame(dict(truth=y, prediction=prediction))
# Signed error per row: positive when the prediction exceeds the true value.
pred_df['difference'] = pred_df['prediction'].sub(pred_df['truth'])
pred_df.head()

Unnamed: 0,truth,prediction,difference
0,24.0,30.003843,6.003843
1,21.6,25.025562,3.425562
2,34.7,30.567597,-4.132403
3,33.4,28.607036,-4.792964
4,36.2,27.943524,-8.256476


In [82]:
# Mean signed error: sum of the signed differences divided by the number of rows.
pred_df['difference'].sum() / len(pred_df)

9.113482915183893e-15

## MAE - Mean Absolute Error
## $$MAE = \frac{1}{n}\sum_i^n{|y - y_{pred}|}$$

In [167]:
# MAE by hand: absolute error per row, then its average over all rows.
pred_df['absolute'] = (pred_df['prediction'] - pred_df['truth']).abs()
pred_df['absolute'].sum() / len(pred_df)

3.270862810900318

In [106]:
from sklearn.metrics import mean_absolute_error

# Same metric via scikit-learn, for comparison with the manual computation.
mean_absolute_error(y_true=pred_df['truth'], y_pred=pred_df['prediction'])

3.270862810900318

## MSE - Mean Square Error
## $$MSE = \frac{1}{n}\sum_i^n{(y - y_{pred})^2}$$

In [129]:
# MSE by hand: squared error per row, then its mean.
pred_df['squared'] = (pred_df['prediction'] - pred_df['truth']).pow(2)
pred_df['squared'].mean()

21.8948311817292

In [133]:
from sklearn.metrics import mean_squared_error

# Same metric via scikit-learn, for comparison with the manual computation.
mean_squared_error(y_true=pred_df['truth'], y_pred=pred_df['prediction'])

21.8948311817292

## RMSE - Root Mean Square Error
## $$RMSE = \sqrt{\frac{1}{n}\sum_i^n{(y - y_{pred})^2}}$$

In [135]:
# Peek at the first two rows to see the error columns added so far.
pred_df.head(n=2)

Unnamed: 0,truth,prediction,difference,absolute,squared
0,24.0,30.003843,6.003843,6.003843,36.046135
1,21.6,25.025562,3.425562,3.425562,11.734478


In [137]:
# RMSE: square root of the mean of the 'squared' error column.
np.sqrt(pred_df['squared'].mean())

4.679191295697281

## $R^2$
## $$R^2 = 1 - \frac{\frac{1}{n}\sum^{n}_{i}{(y - y_{pred})^2}}{\frac{1}{n}\sum^{n}_{i}{(y - \bar{y})^2}}$$

In [143]:
# Baseline "model": predict the mean of the observed target for every row
# (the scalar is broadcast to the whole column).
pred_df['constant'] = pred_df['truth'].mean()
pred_df

Unnamed: 0,truth,prediction,difference,absolute,squared,constant
0,24.0,30.003843,6.003843,6.003843,36.046135,22.532806
1,21.6,25.025562,3.425562,3.425562,11.734478,22.532806
2,34.7,30.567597,-4.132403,4.132403,17.076757,22.532806
3,33.4,28.607036,-4.792964,4.792964,22.972499,22.532806
4,36.2,27.943524,-8.256476,8.256476,68.169392,22.532806
...,...,...,...,...,...,...
501,22.4,23.533341,1.133341,1.133341,1.284461,22.532806
502,20.6,22.375719,1.775719,1.775719,3.153178,22.532806
503,23.9,27.627426,3.727426,3.727426,13.893705,22.532806
504,22.0,26.127967,4.127967,4.127967,17.040110,22.532806


In [151]:
# MSE of the constant-mean baseline against the true targets.
mse_constant = mean_squared_error(
    y_true=pred_df['truth'],
    y_pred=pred_df['constant'],
)
mse_constant

84.41955615616556

In [155]:
# MSE of the fitted model's predictions against the true targets.
mse_model = mean_squared_error(
    y_true=pred_df['truth'],
    y_pred=pred_df['prediction'],
)
mse_model

21.8948311817292

In [157]:
# R^2 by hand: 1 minus the ratio of the model's MSE to the constant-mean baseline's MSE.
1 - mse_model / mse_constant

0.7406426641094095

In [161]:
from sklearn.metrics import r2_score

# Same coefficient via scikit-learn, for comparison with the manual computation.
r2_score(y_true=pred_df['truth'], y_pred=pred_df['prediction'])

0.7406426641094095

## Summary
<table>

<tr>
<td>
Метрика
</td>

<td>
Формула
</td>

<td>
Диапазоны значений
</td>

<td>
Идеальное значение
</td>
</tr>

<tr>
<td>
MAE (mean absolute error, средне-абсолютная ошибка)
</td>

<td>
$$MAE = \frac{1}{n}\sum_i^n{|y - y_{pred}|}$$

</td>

<td>
[0, +$\infty$)
</td>

<td>
0
</td>
</tr>

<tr>
<td>
MSE (mean squared error, средне-квадратичная ошибка)

</td>

<td>
$$MSE = \frac{1}{n}\sum_i^n{(y - y_{pred})^2}$$

</td>

<td>
[0, +$\infty$)
</td>

<td>
0
</td>
</tr>

<tr>
<td>
RMSE (root mean squared error, корень из средне-квадратичной ошибки)

</td>

<td>
$$RMSE = \sqrt{\frac{1}{n}\sum_i^n{(y - y_{pred})^2}}$$

</td>

<td>
[0, +$\infty$)
</td>

<td>
0
</td>
</tr>

<tr>
<td>
Коэффициент детерминации $R^{2}$

</td>

<td>
$$R^2 = 1 - \frac{\frac{1}{n}\sum^{n}_{i}{(y - y_{pred})^2}}{\frac{1}{n}\sum^{n}_{i}{(y - \bar{y})^2}}$$

</td>

<td>
(-$\infty$, 1]

</td>

<td>
1
</td>
</tr>
</table>