In [1]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

### Insert dataset

In [2]:
# Load the real-estate data (columns: price, size, year) from a CSV file
# located in the current working directory.
dataset = pd.read_csv('real_estate_price_size_year.csv')
# Preview the first rows to confirm the file was parsed as expected.
dataset.head()

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
dataset.describe()

Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


### Define dependent and independent variables

In [4]:
# Independent variables (features): the `size` and `year` columns.
x = dataset.loc[:, ['size', 'year']]
# Dependent variable (target): the `price` column.
y = dataset.loc[:, 'price']

### Create multiple regression

In [5]:
# Multiple linear regression of price on size and year.
reg = LinearRegression()
# `fit` trains the estimator in place; the trailing `;` keeps the cell output empty.
reg.fit(x, y);

In [6]:
# Intercept (bias term) of the fitted multiple regression `reg`.
reg.intercept_

-5772267.017463278

In [7]:
# Fitted coefficients of `reg`, in the column order of x: [size, year].
reg.coef_

array([ 227.70085401, 2916.78532684])

In [8]:
# R-squared of `reg`, evaluated on the same data (x, y) it was fitted on.
reg.score(x,y)

0.7764803683276793

In [9]:
# Adjusted R-squared of `reg`:
#   adj_R2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# where n is the number of observations and p the number of features.
r2 = reg.score(x, y)
n, p = x.shape

# Share of unexplained variance, inflated by the degrees-of-freedom correction.
penalty = (1 - r2) * (n - 1) / (n - p - 1)
adj_R2 = 1 - penalty
adj_R2

0.77187171612825

### Example of prediction

In [10]:
# Predict the price for size = 750 and year = 2009.
# `reg` was fitted on a DataFrame, so the new observation is passed as a
# DataFrame with the same column names; a bare nested list makes current
# scikit-learn versions warn about missing feature names and silently relies
# on the positional order of the columns.
reg.predict(pd.DataFrame([[750, 2009]], columns=x.columns))

array([258330.34465995])

### Remove useless features

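$R^2$ (0.776) is only slightly higher than the adjusted $R^2$ (0.772), which means the model is barely penalized for the number of features it uses. Based on this comparison alone, we do NOT need to remove any features.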

### Finding the P-values

In [11]:
from sklearn.feature_selection import f_regression

# f_regression returns a pair (F-statistics, p-values) with one entry per feature.
# NOTE: these are univariate tests -- each feature is tested against y on its
# own, so the p-values do not account for the other feature in the model.
f_statistics, p_values = f_regression(x, y)
p_values

array([8.12763222e-31, 3.57340758e-01])

### Create a Summary Table

In [12]:
# One row per feature: its fitted coefficient next to its rounded p-value.
summary_table = pd.DataFrame({
    'Features': x.columns.values,
    'Coefficient': reg.coef_,
    'P-Values': p_values.round(4),
})
summary_table

Unnamed: 0,Features,Coefficient,P-Values
0,size,227.700854,0.0
1,year,2916.785327,0.3573


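Judging by its p-value (0.3573), the "year" feature is not statistically significant, so it looks useless and we could remove it. Keep in mind that `f_regression` tests each feature separately (a univariate test), so this p-value does not take the `size` feature into account.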

## Prediction Using Standardized Dataset

### Import relevant libraries

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
# Show the first rows of the raw (unscaled) data again before standardizing.
dataset.head()

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


### Standardize the independent variables (size and year)

In [17]:
# Learn each feature's mean and standard deviation from x and apply the
# standardization in a single step. The fitted `scaler` is kept so the same
# transformation can be applied to new observations later.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled

array([[-0.70816415,  0.51006137],
       [-0.66387316, -0.76509206],
       [-1.23371919,  1.14763808],
       [ 2.19844528,  0.51006137],
       [ 1.42498884, -0.76509206],
       [-0.937209  , -1.40266877],
       [-0.95171405,  0.51006137],
       [-0.78328682, -1.40266877],
       [-0.57603328,  1.14763808],
       [-0.53467702, -0.76509206],
       [ 0.69939906, -0.76509206],
       [ 3.33780001, -0.76509206],
       [-0.53467702,  0.51006137],
       [ 0.52699137,  1.14763808],
       [ 1.51100715, -1.40266877],
       [ 1.77668568, -1.40266877],
       [-0.54810263,  1.14763808],
       [-0.77276222, -1.40266877],
       [-0.58004747, -1.40266877],
       [ 0.58943055,  1.14763808],
       [-0.78365788,  0.51006137],
       [-1.02322731,  0.51006137],
       [ 1.19557293,  0.51006137],
       [-1.12884431,  0.51006137],
       [-1.10378093, -0.76509206],
       [ 0.84424715,  1.14763808],
       [-0.95171405,  1.14763808],
       [ 1.62279723,  0.51006137],
       [-0.58004747,

### Create regression

In [18]:
# Same multiple regression as before, now fitted on the standardized features.
scaled_reg = LinearRegression()
# `fit` trains the estimator in place; the trailing `;` keeps the cell output empty.
scaled_reg.fit(x_scaled, y);

In [20]:
# Intercept (bias) of the regression fitted on the standardized features.
scaled_reg.intercept_

292289.4701599997

In [21]:
# Coefficients (weights) on the standardized features, ordered [size, year].
scaled_reg.coef_

array([67501.57614152, 13724.39708231])

In [19]:
# R-squared of `scaled_reg`, evaluated on the same data it was fitted on.
scaled_reg.score(x_scaled,y)

0.7764803683276793

In [23]:
# Adjusted R-squared of `scaled_reg`:
#   adj_R2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# where n is the number of observations and p the number of features.
r2 = scaled_reg.score(x_scaled, y)
n, p = x.shape

# Share of unexplained variance, inflated by the degrees-of-freedom correction.
penalty = (1 - r2) * (n - 1) / (n - p - 1)
adj_R2 = 1 - penalty
adj_R2

0.77187171612825

The R-squared and adjusted R-squared values are almost the same.

### Make prediction

assume size = 750 and year = 2009

In [45]:
# New observation to score: size = 750, year = 2009.
# `scaler` was fitted on a DataFrame, so the raw input carries the same column
# names; a bare nested list makes current scikit-learn versions warn about
# missing feature names and relies on the positional order of the columns.
new_data_raw = pd.DataFrame([[750, 2009]], columns=x.columns)

# Apply the standardization learned from the training data. As before,
# `new_data` ends up holding the scaled array.
new_data = scaler.transform(new_data_raw)

# `scaled_reg` was fitted on a plain array, so the scaled array is passed as is.
scaled_reg.predict(new_data)

array([258330.34465995])

### Create a regression using only the size feature

In [26]:
# Simple regression on the standardized `size` column only.
# Slicing with `:1` keeps the 2-D (n_samples, 1) shape that scikit-learn expects.
simple_reg = LinearRegression()
simple_reg.fit(x_scaled[:, :1], y);

In [28]:
# R-squared of the size-only model, scored on the standardized `size` column
# (first column of x_scaled, kept 2-D by the `:1` slice).
simple_reg.score(x_scaled[:, :1], y)

0.7447391865847587

The R-squared of `simple_reg` (0.745) is lower than the R-squared of `scaled_reg` (0.776).

### Find P-values

In [41]:
# f_regression returns a pair (F-statistics, p-values) with one entry per feature.
# NOTE: these are univariate tests -- each feature is tested against y on its own.
scaled_f_statistics, scaled_p_values = f_regression(x_scaled, y)
scaled_p_values

array([8.12763222e-31, 3.57340758e-01])

### Create summary table

In [44]:
# Summary of the standardized model: the bias (intercept) followed by one row
# per feature. The bias has no p-value, so '-' is used as a placeholder.
rounded_p_values = scaled_p_values.round(4)
scaled_summary = pd.DataFrame({
    'Features': ['Bias', 'Size', 'Year'],
    'Weights': [scaled_reg.intercept_, scaled_reg.coef_[0], scaled_reg.coef_[1]],
    'P-Values': ['-', rounded_p_values[0], rounded_p_values[1]],
})
scaled_summary

Unnamed: 0,Features,Weights,P-Values
0,Bias,292289.47016,-
1,Size,67501.576142,0
2,Year,13724.397082,0.3573


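It seems that 'Year' is not even significant (p ≈ 0.357), therefore we should consider removing it from the model. Note, however, that the size-only regression above has a lower $R^2$ (0.745 vs. 0.776), and that these p-values come from univariate tests of each feature on its own.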

Note that this dataset is extremely clean and probably artificially created, therefore standardization does not really bring any value to it.