In [163]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import statsmodels as stat 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
%matplotlib inline 


## Importing the Dataset

In [165]:
houseprice = pd.read_csv(r"C:\Users\silen\Desktop\PROJE 1\housing_price_dataset.csv")
houseprice["Price"] = houseprice["Price"].astype('int')
houseprice

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355
1,2459,3,2,Rural,1980,195014
2,1860,2,1,Suburb,1970,306891
3,2294,2,1,Urban,1996,206786
4,2130,5,2,Suburb,2001,272436
...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080
49996,2854,2,2,Suburb,1988,374507
49997,2979,5,3,Suburb,1962,384110
49998,2596,5,2,Rural,1984,380512


## Pre-processing the Data

In [166]:
df = houseprice[["SquareFeet", "Bedrooms", "Bathrooms", "YearBuilt", "Price"]]
df

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price
0,2126,4,1,1969,215355
1,2459,3,2,1980,195014
2,1860,2,1,1970,306891
3,2294,2,1,1996,206786
4,2130,5,2,2001,272436
...,...,...,...,...,...
49995,1282,5,3,1975,100080
49996,2854,2,2,1988,374507
49997,2979,5,3,1962,384110
49998,2596,5,2,1984,380512


Let's define x and y for our Linear Regression Model.

In [168]:
x = np.asarray(df[["SquareFeet", "Bedrooms", "Bathrooms", "YearBuilt"]])
y = np.asarray(df[["Price"]])
print(x[0:4])
print(y[0:4])

[[2126    4    1 1969]
 [2459    3    2 1980]
 [1860    2    1 1970]
 [2294    2    1 1996]]
[[215355]
 [195014]
 [306891]
 [206786]]


Standardizing our x array because our features has different metrics.

In [169]:
x = preprocessing.StandardScaler().fit(x).transform(x)
x[0:5]

array([[ 0.20786058,  0.44906695, -1.22011292, -0.79175085],
       [ 0.78648033, -0.44673786,  0.00561383, -0.26084154],
       [-0.25434018, -1.34254267, -1.22011292, -0.74348636],
       [ 0.49977685, -1.34254267, -1.22011292,  0.51139019],
       [ 0.21481097,  1.34487176,  0.00561383,  0.7527126 ]])

### Train / Test Splitting

In [170]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=42)
print("Train size: ", x_train.shape)
print("Test size: ", x_test.shape)

Train size:  (40000, 4)
Test size:  (10000, 4)


## Modeling (Multiple Linear Regression)

In [171]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

LR = LinearRegression()
LR.fit(x_train, y_train)

print("Coefficients: ",LR.coef_)

Coefficients:  [[57181.49346196  5837.71213108  2417.68536483  -121.96873768]]


### Predicting and Evaluating the Model

In [172]:
predictions = LR.predict(x_test)
predictions = predictions.astype('int')
predictions[0:5]

array([[218605],
       [135875],
       [256129],
       [258703],
       [278272]])

In [177]:
print("R2 Score for test set : ",metrics.r2_score(y_test, predictions))

R2 Score for test set :  0.9555011603153223


In [176]:
whole_data_pred = LR.predict(x)
whole_data_pred = whole_data_pred.astype('int')
print("R2 Score : ",metrics.r2_score(y, whole_data_pred))
print("Mean Absolute Error : ", metrics.mean_absolute_error(y, whole_data_pred))
print("Mean Squared Error : ", metrics.mean_squared_error(y, whole_data_pred))

R2 Score :  0.9567815152737913
Mean Absolute Error :  39904.34198
Mean Squared Error :  250557614.71938


In [175]:
houseprice["Predicted Price"] = pd.DataFrame(whole_data_pred)
houseprice

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price,Predicted Price
0,2126,4,1,Rural,1969,215355,236462
1,2459,3,2,Rural,1980,195014,267217
2,1860,2,1,Suburb,1970,306891,199567
3,2294,2,1,Urban,1996,206786,242536
4,2130,5,2,Suburb,2001,272436,244864
...,...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080,163724
49996,2854,2,2,Suburb,1988,374507,301187
49997,2979,5,3,Suburb,1962,384110,332412
49998,2596,5,2,Rural,1984,380512,291265
