In [1]:
import numpy as np
from numpy.linalg import pinv
import pandas as pd

In [2]:
from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [3]:
# Load the house-price table from disk and preview its first five rows.
datasets = pd.read_csv(
    '../../dataset/regression/california_house_price.csv',
)
datasets.head(n=5)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Unnamed: 6
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0,
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0,
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0,
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0,
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5,


In [4]:
# 'Unnamed: 6' shows no values in the preview of the loaded frame
# (presumably an artifact of a trailing delimiter in the CSV -- TODO confirm),
# so it is removed before modelling.
# Rebinding the name instead of using inplace=True, together with
# errors='ignore', keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
datasets = datasets.drop(columns='Unnamed: 6', errors='ignore')

In [5]:
# Re-display the first rows to check the frame after 'Unnamed: 6' was dropped.
datasets.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


In [6]:
# (rows, columns) of the frame after the drop.
datasets.shape

(5000, 6)

In [7]:
# Separate the regression target ('Price') from the predictor columns.
y = datasets.loc[:, 'Price']
X = datasets.drop(columns=['Price'])

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [8]:
# Linear-regression estimator constructed with scikit-learn's default settings.
lr = LinearRegression()

In [9]:
# Fit on the training partition only.
lr.fit(X_train, y_train)

# Predict on both partitions so the train and test fit can be compared.
pred_train = lr.predict(X_train)
pred_test = lr.predict(X_test)

# R^2 for each partition, computed first and then reported.
train_r2 = r2_score(y_train, pred_train)
test_r2 = r2_score(y_test, pred_test)
print("train r2 score: ", train_r2)
print("test r2 score: ", test_r2)

train r2 score:  0.9186953001974186
test r2 score:  0.9165334724211642


In [10]:
# Coefficients learned by the fitted model (one value per feature column).
lr.coef_

array([2.15898874e+01, 1.66102501e+05, 1.19895936e+05, 1.90107101e+03,
       1.52315025e+01])

In [11]:
# Intercept (bias term) learned by the fitted model.
lr.intercept_

-2638142.1104272725

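### Find the coefficients manually

Center the training features, then solve the least-squares problem for the slopes: $\hat{\beta} = X_c^{+} y = (X_c^\top X_c)^{+} X_c^\top y$, where $X_c$ is the mean-centered training matrix and $^{+}$ denotes the pseudo-inverse.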

In [12]:
# Center the training features: with_std=False turns off the variance
# scaling, so only the per-column mean is subtracted.
# NOTE(review): this rebinds X, which previously held the full feature
# DataFrame from the train/test split cell, to the centered training matrix.
X = scale(X_train, with_std=False)

In [13]:
# Least-squares slopes from the centered design matrix: coef = pinv(X) @ y.
# pinv(X) is mathematically the same operator as pinv(X.T @ X) @ X.T (the
# normal-equation form), but applying the pseudo-inverse to X directly avoids
# forming X.T @ X, whose condition number is the square of X's.
# y_train does not need centering: the columns of X sum to zero, so
# X.T @ y is unchanged by subtracting mean(y).
# .to_numpy() makes the result an explicit ndarray regardless of how pandas
# dispatches the @ operator.
coef = pinv(X) @ y_train.to_numpy()
coef

array([2.15898874e+01, 1.66102501e+05, 1.19895936e+05, 1.90107101e+03,
       1.52315025e+01])

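### Find the intercept manually

Because the features were centered before solving for the slopes, the intercept is recovered as $b = \bar{y} - \hat{\beta}^\top \bar{x}$, where $\bar{x}$ holds the training-feature means and $\bar{y}$ is the mean training target.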

In [14]:
# Means of the original (uncentered) training features; used below to
# recover the intercept from the manually computed coefficients.
XM = X_train.mean()

In [15]:
# Sanity check: there must be exactly one coefficient per feature mean.
coef.shape == XM.shape

True

In [16]:
# Undo the centering: intercept = mean(y) - dot(coef, mean(X)).
# np.dot replaces the hand-rolled zip/list/sum dot product; unlike zip, it
# raises on a length mismatch instead of silently truncating to the shorter
# operand. XM is converted to a plain array so the pairing is positional,
# exactly as it was with zip.
intercept = y_train.mean() - np.dot(coef, XM.to_numpy())
intercept

-2638142.1104282574

In [17]:
# scikit-learn's fitted intercept, shown for comparison with the manually
# computed value above.
lr.intercept_

-2638142.1104272725