In [155]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import sklearn.model_selection as ms
from sklearn import ensemble
from sklearn import metrics
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go

In [156]:
import warnings
warnings.filterwarnings('ignore')

In [157]:
df = pd.read_csv('model_data.csv', index_col=0,low_memory=False)

In [158]:
df = df.fillna(0)
df = df.sort_index(axis=1)
# sort column names alphabetically

In [159]:
df.head()

Unnamed: 0_level_0,AllBathAbv,AllBathBsmt,BsmtCond,BsmtUnfSF,CentralAir,ExterQual,FireplaceQu,Foundation,GarageCars,GarageQual,GoodLivArea,HasPool,HeatingQC,KitchenQual,MSSubClass,Neighborhood,PavedDrive,PorchArea,SalePrice,YearBuilt
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
526301100,1.0,1.0,Gd,441.0,Y,TA,Gd,CBlock,2.0,TA,2295.0,0,Fa,TA,1Fl,NAmes,P,272,215000,1960
526302030,2.0,1.0,TA,171.0,Y,TA,,CBlock,2.0,TA,2300.0,0,Gd,TA,1Fl,NAmes,Y,280,149900,1954
526302040,1.0,1.0,TA,235.0,Y,TA,TA,CBlock,1.0,TA,1797.0,0,TA,Gd,1Fl,NAmes,Y,280,157500,1956
526302110,1.0,0.0,Fa,318.0,Y,TA,,CBlock,1.0,TA,1446.0,0,TA,TA,1Fl,NAmes,Y,0,124500,1956
526302120,1.5,0.0,TA,490.0,Y,TA,TA,CBlock,3.0,TA,2924.0,0,Fa,TA,1Fl,NAmes,Y,0,169000,1957


In [160]:
dummies = pd.get_dummies(df[['AllBathBsmt','AllBathAbv','BsmtCond','CentralAir','ExterQual','FireplaceQu','Foundation',
                          'GarageCars','GarageQual', 'HasPool','HeatingQC', 'KitchenQual','MSSubClass','Neighborhood','PavedDrive']],drop_first=True)

In [161]:
df_constants = df.drop(['AllBathBsmt','AllBathAbv','BsmtCond','CentralAir','ExterQual','FireplaceQu','Foundation',
           'GarageCars','GarageQual', 'HasPool','HeatingQC', 'KitchenQual','MSSubClass','Neighborhood','PavedDrive'],axis=1)

In [162]:
X = pd.concat([dummies,df_constants],axis=1)
y = X['SalePrice']
X = X.drop(['SalePrice'],axis=1)

In [163]:
if len(X.index) == len(y.index):
    print('There is an equal number of rows')
else : print('Unequal rows')

There is an equal number of rows


In [164]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)
mlr = LinearRegression()

In [165]:
mlr.fit(x_train,y_train)
y_predict =  mlr.predict(x_test)
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_predict})
mlr_diff.head()

Unnamed: 0_level_0,Actual value,Predicted value
PID,Unnamed: 1_level_1,Unnamed: 2_level_1
535457010,160000,125439.244936
527401130,133000,130699.446467
907126030,155000,159889.002607
528315080,300000,298352.639583
535457040,141000,142521.520271


In [166]:
meanAbErr = metrics.mean_absolute_error(y_test, y_predict)
meanSqErr = metrics.mean_squared_error(y_test, y_predict)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_predict))
print('R squared: {:.2f}'.format(mlr.score(X,y) * 100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 88.82
Mean Absolute Error: 17564.983216974302
Mean Square Error: 763251297.233679
Root Mean Square Error: 27627.003044732865
