<a href="https://colab.research.google.com/github/danamyergaliyeva/ml_labs/blob/master/BostonLinear.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from numpy.linalg import inv
from sklearn.datasets import load_boston
from statsmodels.regression.linear_model import OLS

In [0]:
# Fetch the Boston housing bunch bundled with scikit-learn.
# NOTE(review): load_boston appears to be deprecated/removed in newer
# scikit-learn releases -- confirm the installed version still ships it.
boston = load_boston()

# Feature matrix and target vector, both taken straight from the bunch.
X, y = boston.data, boston.target

# Sanity check: (number of samples, number of features).
print(X.shape)

(506, 13)


In [0]:
# Column labels of the feature matrix, in the same order as boston.data.
feature_names = boston.feature_names

# Show the labels followed by the dataset's own description text,
# each on its own line.
print(feature_names, boston.DESCR, sep="\n")

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial hig

In [0]:
# Build the design matrix: a leading column of ones lets the first
# coefficient act as the intercept term.
# The column is named `intercept` (not `int`) so the Python builtin `int`
# is not shadowed for the rest of the kernel session.
intercept = np.ones((len(y), 1))

# Prepend the ones to the raw feature matrix. Starting from `boston.data`
# rather than the current `X` keeps this cell idempotent: re-running it
# does not stack an additional column of ones onto X.
X = np.concatenate((intercept, boston.data), axis=1)

In [0]:
# Closed-form ordinary least squares via the normal equations:
#     (X^T X) beta = X^T y
# The linear system is solved directly instead of forming inv(X^T X) and
# multiplying through; this yields the same estimator while avoiding the
# extra rounding error and work of an explicit matrix inverse.
coeffs = np.linalg.solve(X.T.dot(X), X.T.dot(y))

In [0]:
# Row labels for the coefficient table: 'INT' for the intercept column,
# followed by the dataset's own feature names.
feature_names = np.concatenate((['INT'], boston.feature_names))

# One labelled column holding the closed-form estimates.
results = pd.Series(coeffs, index=feature_names, name='coeffs').to_frame()

# Two decimals are enough for a side-by-side comparison.
print(results.round(2))

         coeffs
INT       36.46
CRIM      -0.11
ZN         0.05
INDUS      0.02
CHAS       2.69
NOX      -17.77
RM         3.81
AGE        0.00
DIS       -1.48
RAD        0.31
TAX       -0.01
PTRATIO   -0.95
B          0.01
LSTAT     -0.52


In [0]:
# Fit the same regression with statsmodels as a reference implementation.
# X is passed as the regressors (exog) and y as the response (endog).
lm_fit = OLS(endog=y, exog=X).fit()

# Keep only the estimated parameters from the fitted model.
coeffs_lm = lm_fit.params

In [0]:
# Put the statsmodels estimates next to the closed-form ones so the two
# columns can be compared row by row.
results = results.assign(coeffs_lm=coeffs_lm)

print(results.round(2))

         coeffs  coeffs_lm
INT       36.46      36.46
CRIM      -0.11      -0.11
ZN         0.05       0.05
INDUS      0.02       0.02
CHAS       2.69       2.69
NOX      -17.77     -17.77
RM         3.81       3.81
AGE        0.00       0.00
DIS       -1.48      -1.48
RAD        0.31       0.31
TAX       -0.01      -0.01
PTRATIO   -0.95      -0.95
B          0.01       0.01
LSTAT     -0.52      -0.52
