In [241]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import sklearn.model_selection as ms
from sklearn import ensemble
from sklearn import metrics
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

In [242]:
import warnings
warnings.filterwarnings('ignore')

In [243]:
ah = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0,low_memory=False)

<h1>01: Cleaning</h1>

In [244]:
ah = ah.fillna(0)

In [245]:
hoods = ah['Neighborhood']
new_hoods = []

for neigh in hoods :
    if neigh == 'Blueste' :
        new_hoods.append('SW')
    elif neigh == 'Blmngtn' :
        new_hoods.append('NO')
    elif neigh == 'BrDale' :
        new_hoods.append('NO')
    elif neigh == 'BrkSide' :
        new_hoods.append('DT')
    elif neigh == 'ClearCr' :
        new_hoods.append('NO')
    elif neigh == 'CollgCr' :
        new_hoods.append('SW')
    elif neigh == 'Crawfor' :
        new_hoods.append('SW')
    elif neigh == 'Edwards' :
        new_hoods.append('SW')
    elif neigh == 'Gilbert' :
        new_hoods.append('NO')
    elif neigh == 'IDOTRR' :
        new_hoods.append('DT')
    elif neigh == 'MeadowV' :
        new_hoods.append('SE')
    elif neigh == 'Mitchel' :
        new_hoods.append('SE')
    elif neigh == 'NAmes' :
        new_hoods.append('NO')
    elif neigh == 'NoRidge' :
        new_hoods.append('NW')
    elif neigh == 'NPkVill' :
        new_hoods.append('NO')
    elif neigh == 'NridgHt' :
        new_hoods.append('NW')
    elif neigh == 'NWAmes' :
        new_hoods.append('NO')
    elif neigh == 'OldTown' :
        new_hoods.append('DT')
    elif neigh == 'SWISU' :
        new_hoods.append('SW')
    elif neigh == 'Sawyer' :
        new_hoods.append('NW')
    elif neigh == 'SawyerW' :
        new_hoods.append('NW')
    elif neigh == 'Somerst' :
        new_hoods.append('NW')
    elif neigh == 'StoneBr' :
        new_hoods.append('NO')
    elif neigh == 'Timber' :
        new_hoods.append('SW')
    elif neigh == 'Veenker' :
        new_hoods.append('NW')
    elif neigh == 'Greens' :
        new_hoods.append('NW')
    elif neigh == 'GrnHill' :
        new_hoods.append('SO')
    elif neigh == 'Landmrk' :
        new_hoods.append('DT')
    else : print('no match')

ah['city_sec'] = new_hoods 

In [246]:
df_float = ah.select_dtypes(include='float')
df_float_to_int = df_float.astype(int)

<h5>❤️ Divide data into categorical and numerical, then check if those columns are equal</h5>

In [247]:
df_numerical = ah.select_dtypes(include='int64')
df_numerical = pd.concat([df_numerical,df_float_to_int],axis=1)
df_categorical = ah.select_dtypes(include='object')
print('There are',len(ah.columns) - (len(df_categorical.columns) + len(df_numerical.columns)),'missing columns')
if len(df_categorical.index) == len(df_numerical.index):
    print('There is an equal number of rows')
else : print('Unequal rows')
#82 columns total

There are 0 missing columns
There is an equal number of rows


In [287]:
dummies = pd.get_dummies(df_categorical,drop_first=True)
x = pd.concat([df_numerical,dummies],axis=1)
y = df_numerical['SalePrice']
# x = x.drop(columns=['SalePrice','1stFlrSF','2ndFlrSF','LowQualFinSF','BsmtFinSF1','BsmtFinSF2','MSZoning_I (all)',
#                     'Neighborhood_Blueste','Exterior2nd_PreCast','BsmtQual_Po','Electrical_SBrkr','MiscFeature_TenC','city_sec_SO',
#                    'GarageQual_Po','BsmtCond_Fa','GarageCond_Fa','Neighborhood_Greens','BsmtFinType2_BLQ','city_sec_NW',
#                    'Neighborhood_MeadowV','city_sec_SE','SaleCondition_Partial','city_sec_NO'])


<h1>02: Multiple Linear Regression</h1>

In [288]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.15, random_state = 2)
mlr = LinearRegression()

In [289]:
mlr.fit(x_train,y_train)
y_predict =  mlr.predict(x_test)
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_predict})
mlr_diff.head()

Unnamed: 0,Actual value,Predicted value
476,124900,158646.194583
516,129850,164535.99636
627,137000,135807.30698
719,118000,110043.703072
640,75190,66103.62919


In [290]:
meanAbErr = metrics.mean_absolute_error(y_test, y_predict)
meanSqErr = metrics.mean_squared_error(y_test, y_predict)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_predict))
print('R squared: {:.2f}'.format(mlr.score(x,y) * 100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 93.48
Mean Absolute Error: 14959.213412373652
Mean Square Error: 591852005.3151277
Root Mean Square Error: 24328.008659056493


In [291]:
vif = pd.DataFrame()
vif['var'] = x_train.columns
vif['VIF'] = [variance_inflation_factor(x_train.values, i) for i in range(x_train.shape[1])]

In [301]:
inf = vif['VIF'] > 5
vifs = vif.loc[inf]
vifs['var']

1                 GrLivArea
2                MSSubClass
3                   LotArea
4               OverallQual
5               OverallCond
               ...         
230               PoolQC_Gd
231               PoolQC_TA
247            SaleType_WD 
251    SaleCondition_Normal
252             city_sec_SW
Name: var, Length: 136, dtype: object