In [30]:
# Importing necessary packages
import pandas as pd # python's data handling package
import numpy as np # python's scientific computing package
import matplotlib.pyplot as plt # python's plotting package
from sklearn.metrics import mean_squared_error as mse

##### Loading data.

In [32]:
# Load the pre-scaled house-price table (features and target were standardised upstream).
# Over all rows the column means are close to 0 and the SDs close to 1, not exactly so.
data = pd.read_csv('Houseprice_data_scaled.csv')

In [33]:
# Summary statistics and pairwise correlations, computed over every row of `data`
# (the full file, before it is split into train / validation / test).
summary_stats = data.describe()
corr_matrix = data.corr()
print("\nSummary statistics\n", summary_stats)
print("\nCorrelation matrix\n", corr_matrix)


Summary statistics
            LotArea  OverallQual  OverallCond    YearBuilt  YearRemodAdd  \
count  2908.000000  2908.000000  2908.000000  2908.000000   2908.000000   
mean     -0.014297    -0.011672    -0.001960    -0.020343     -0.032348   
std       0.864624     1.012869     1.000959     1.011374      1.011740   
min      -0.992707    -3.698901    -4.115272    -3.337719     -1.691761   
25%      -0.308670    -0.797802    -0.512407    -0.598187     -0.965106   
50%      -0.089254    -0.072527    -0.512407     0.036583      0.391317   
75%       0.141143     0.652747     0.388309     0.972033      0.924198   
max      22.739771     2.828571     3.090457     1.272713      1.214860   

        BsmtFinSF1    BsmtUnfSF  TotalBsmtSF     1stFlrSF     2ndFlrSF  ...  \
count  2908.000000  2908.000000  2908.000000  2908.000000  2908.000000  ...   
mean     -0.016787    -0.010657    -0.024176    -0.009944    -0.007302  ...   
std       0.996796     0.991912     1.006134     1.003188     0.99

In [34]:
# Persist the pairwise correlation matrix of all columns so it can be inspected outside the notebook
corr_matrix = data.corr()
corr_matrix.to_csv('correlationmatrix.csv')

In [35]:
# Sequential (unshuffled) split by row position:
#   rows 0-1799    -> training set   (1800 rows)
#   rows 1800-2399 -> validation set (600 rows)
#   rows 2400-end  -> test set       (everything that is left)
# The test slice is open-ended so it really is "the rest": a hard-coded end row
# would silently drop data if the file ever held more than 2908 rows.
train = data.iloc[:1800]
val = data.iloc[1800:2400]
test = data.iloc[2400:]

In [36]:
# Separate predictors ("X") from the target ("y") for each split.
# X keeps every column except 'Sale Price'; y is kept as a one-column DataFrame
# (double brackets), not a Series.
X_train = train.drop(columns='Sale Price')
X_val = val.drop(columns='Sale Price')
X_test = test.drop(columns='Sale Price')

y_train = train[['Sale Price']]
y_val = val[['Sale Price']]
y_test = test[['Sale Price']]

###### Linear Regression

In [37]:
# Importing models
from sklearn.linear_model import LinearRegression

In [40]:
# Fit the linear regression on the training split and report its in-sample MSE
lr = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
pred = lr.predict(X_train)
print(mse(y_true=y_train, y_pred=pred))

0.1140152643124634


In [41]:
# MSE on the validation set.
# `lr` is already fitted on (X_train, y_train) by the training-MSE cell; refitting it
# here on the same data would reproduce the same model, so the fitted estimator
# is reused as-is.
pred = lr.predict(X_val)
print(mse(y_val, pred))

0.11702499460121657


In [39]:
# Table of the fitted parameters, indexed by name, with the intercept as the first row.
# y_train is a one-column DataFrame, so `lr.coef_` is indexed with [0] to get the
# per-feature coefficients and `lr.intercept_` is list-like.
param_names = ['intercept'] + list(X_train.columns)
param_values = list(lr.intercept_) + list(lr.coef_[0])
coeffs = pd.DataFrame([param_names, param_values]).T.set_index(0)

# Two-column feature -> coefficient table (no intercept), printed for inspection
lreg_coefficient = pd.DataFrame(
    {
        "Features": X_train.columns,
        'Coef Estimate': lr.coef_[0],
    }
)
print(lreg_coefficient)

         Features  Coef Estimate
0         LotArea       0.079000
1     OverallQual       0.214395
2     OverallCond       0.096479
3       YearBuilt       0.160799
4    YearRemodAdd       0.025352
5      BsmtFinSF1       0.091466
6       BsmtUnfSF      -0.033080
7     TotalBsmtSF       0.138199
8        1stFlrSF       0.152786
9        2ndFlrSF       0.132765
10      GrLivArea       0.161303
11       FullBath      -0.020808
12       HalfBath       0.017194
13   BedroomAbvGr      -0.083520
14   TotRmsAbvGrd       0.083220
15     Fireplaces       0.028258
16     GarageCars       0.037997
17     GarageArea       0.051809
18     WoodDeckSF       0.020834
19    OpenPorchSF       0.034098
20  EnclosedPorch       0.006822
21        Blmngtn      -0.018431
22        Blueste      -0.012921
23         BrDale      -0.024626
24        BrkSide       0.020762
25        ClearCr      -0.007378
26        CollgCr      -0.006754
27        Crawfor       0.036323
28        Edwards      -0.000690
29        

In [42]:
# Save the feature/coefficient table next to the notebook
lreg_coefficient.to_csv(path_or_buf='lreg_coefficient.csv')

In [43]:
# Coefficient of determination from lr.score(), scaled by 100 and rounded to
# 2 decimals, i.e. reported as a percentage, for the training and test splits.
r2_train_pct = round(lr.score(X_train, y_train) * 100, 2)
r2_test_pct = round(lr.score(X_test, y_test) * 100, 2)
print('R squared training set', r2_train_pct)
print('R squared test set', r2_test_pct)

R squared training set 88.59
R squared test set 87.86
