In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import sklearn.metrics as metrics
import statsmodels.api as sm

In [2]:
df = pd.read_csv('./kaggle_data/train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# These are the manually picked columns that are all categorical
categorical_cols = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 
                    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 
                    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
                    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
                    'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 
                    'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 
                    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 
                    'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

# Category cols that will be used for the linear regression model
category_cols = ['Neighborhood', 'SaleType']

# Continuous cols that will be used for the linear regression model
continuous_cols = ['OverallQual', 'GrLivArea', 'YearBuilt']

In [4]:
# Select categorical columns and get dummy column values (different approach from what we did before with category type)
# Preferably pick columns that are not like rating columns (Bad, Okay, Great). We should handle those separately
dummy_df = pd.get_dummies(df[category_cols])

In [5]:
YVar = df[['SalePrice']]
XVar = pd.concat([df[continuous_cols], dummy_df], axis=1)

In [6]:
# Split data 80-20 so we can test how well the model works
msk = np.random.rand(len(df)) < 0.8

train_x = XVar[msk]
train_y = YVar[msk]

test_x = XVar[~msk]
test_y = YVar[~msk]

In [7]:
# Run linear regression with statsmodels to get statistics
sm_linear_model = sm.OLS(train_y, train_x).fit()
print(sm_linear_model.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.810
Model:                            OLS   Adj. R-squared:                  0.804
Method:                 Least Squares   F-statistic:                     138.0
Date:                Sat, 11 Apr 2020   Prob (F-statistic):               0.00
Time:                        23:43:42   Log-Likelihood:                -13928.
No. Observations:                1171   AIC:                         2.793e+04
Df Residuals:                    1135   BIC:                         2.811e+04
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
OverallQual           1.896e+04 

In [8]:
# Run linear regression with sklearn
regr = linear_model.LinearRegression()
regr.fit(train_x, train_y)

# Make predictions using the testing set, then convert to pandas series
saleprice_preds = regr.predict(test_x)
saleprice_preds = pd.Series([x for lst in saleprice_preds for x in lst])

# Print out coefficient information (coefficient of determination is perfect if it is 1)
#print('Coefficients: \n', regr.coef_)
explained_variance = metrics.explained_variance_score(test_y, saleprice_preds)
mean_absolute_error = metrics.mean_absolute_error(test_y, saleprice_preds) 
mse = metrics.mean_squared_error(test_y, saleprice_preds) 
mean_squared_log_error = metrics.mean_squared_log_error(test_y, saleprice_preds)
median_absolute_error = metrics.median_absolute_error(test_y, saleprice_preds)
r2 = metrics.r2_score(test_y, saleprice_preds)

print('explained_variance: ', round(explained_variance,4))    
print('mean_squared_log_error: ', round(mean_squared_log_error,4))
print('r2: ', round(r2,4))
print('MAE: ', round(mean_absolute_error,4))
print('MSE: ', round(mse,4))
print('RMSE: ', round(np.sqrt(mse),4))

explained_variance:  0.7211
mean_squared_log_error:  0.0373
r2:  0.7211
MAE:  23899.0467
MSE:  1433775633.8333
RMSE:  37865.2299


In [9]:
# Construct the results dataframe
actual_price = test_y.reset_index()['SalePrice']
pred_price = saleprice_preds.round()

res = pd.DataFrame({'actual_price': actual_price, 'predicted_price': pred_price})
res

Unnamed: 0,actual_price,predicted_price
0,208500,218105.0
1,129900,182107.0
2,129500,125715.0
3,345000,373034.0
4,279500,231762.0
...,...,...
284,149700,139849.0
285,191000,244207.0
286,129000,137049.0
287,136000,175525.0


In [10]:
res['diff'] = res.actual_price - res.predicted_price
res['percent_diff'] = (res['diff'] / res.actual_price * 100).round(decimals=2)
res

Unnamed: 0,actual_price,predicted_price,diff,percent_diff
0,208500,218105.0,-9605.0,-4.61
1,129900,182107.0,-52207.0,-40.19
2,129500,125715.0,3785.0,2.92
3,345000,373034.0,-28034.0,-8.13
4,279500,231762.0,47738.0,17.08
...,...,...,...,...
284,149700,139849.0,9851.0,6.58
285,191000,244207.0,-53207.0,-27.86
286,129000,137049.0,-8049.0,-6.24
287,136000,175525.0,-39525.0,-29.06


In [11]:
print('Average percent_difference: %.2f' % (res.percent_diff.abs().mean()))

Average percent_difference: 14.38


In [12]:
# Predicted price is actually really close to actual, but not when actual price is very very high
res.head(100).plot(kind='line', y=['actual_price', 'predicted_price'])

<matplotlib.axes._subplots.AxesSubplot at 0x1fcc121cfc8>