In [9]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import linear_model, kernel_ridge, svm

from dmba import adjusted_r2_score, AIC_score, BIC_score
from dmba import regressionSummary, exhaustive_search, backward_elimination, forward_selection, stepwise_selection

In [10]:
# importing dataset file
ds= pd.read_csv("F:/Junkyard/Studies/Data Science and Machine Learning/session_4/project_proposals/datasets/dataset2_houseprice/house_price.csv")

#ds.info()
#ds.describe()
#ds.head()


In [11]:
# extracting name of the columns (attributes) with differend data types

obj = (ds.dtypes == 'object')   # returns a dataframe that has column for index and corresponding
                        # True/False statement for second column (if data in the column is object)
object_cols = list(obj[obj].index) # obj[obj] returns rows with only True values and index method
                                    # return first column(name of the columns in original dataset)
print("Categorical variables:",len(object_cols))

#same for integers as for objects
int_ = (ds.dtypes == 'int64')
num_cols = list(int_[int_].index)
print("Integer variables:",len(num_cols))

#same for floats as for objects
fl = (ds.dtypes == 'float')
fl_cols = list(fl[fl].index)
print("Float variables:",len(fl_cols))


Categorical variables: 43
Integer variables: 35
Float variables: 3


In [12]:
#dummies (does not count NaN values if 'dummy_na' is false like it is by default)

ds2=ds[object_cols]
ds2d=pd.get_dummies(ds2,drop_first=True)            # drop_first=True parameter eliminates variables (we might not want to use that)
ds.drop(object_cols,axis=1,inplace=True)
ds=pd.concat([ds,ds2d],axis=1)


In [13]:
#filling NaN values (which are left after dummies), so that the model can work:

ds.fillna(value=0,inplace=True)


In [14]:
# Preparing dataset for feature engeneering and model selection:
from statistics import mean
predict='SalePrice'

X=(ds.drop([predict],axis=1))    #returns new dataframe, that doesn't have 'SalesPrice' in it
y=(ds[predict])             #returns true values for labels, that we want to predict or test

x_train,x_test,y_train,y_test=sklearn.model_selection.train_test_split(X,y,test_size=0.2)  #spliting into train/test samples

In [7]:
# Feature engineering
# 1 Backward elimination:
# General functions of ensemble:
def train_model(variables):
    model = linear_model.LinearRegression()
    model.fit(x_train[variables], y_train)
    return model
def score_model(model, variables):
    return AIC_score(y_train, model.predict(x_train[variables]), model)
# Assigning results:
allVariables = ds.drop(predict,axis=1).columns
best_model, best_variables = backward_elimination(allVariables, train_model, score_model, verbose=True)

Variables: Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, YearBuilt, YearRemodAdd, MasVnrArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces, GarageYrBlt, GarageCars, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, MiscVal, MoSold, YrSold, MSZoning_FV, MSZoning_RH, MSZoning_RL, MSZoning_RM, Street_Pave, Alley_Pave, LotShape_IR2, LotShape_IR3, LotShape_Reg, LandContour_HLS, LandContour_Low, LandContour_Lvl, Utilities_NoSeWa, LotConfig_CulDSac, LotConfig_FR2, LotConfig_FR3, LotConfig_Inside, LandSlope_Mod, LandSlope_Sev, Neighborhood_Blueste, Neighborhood_BrDale, Neighborhood_BrkSide, Neighborhood_ClearCr, Neighborhood_CollgCr, Neighborhood_Crawfor, Neighborhood_Edwards, Neighborhood_Gilbert, Neighborhood_IDOTRR, Neighborhood_MeadowV, Neighborhood_Mitchel, Neighborhood_NAmes, Neighborhood

In [8]:
# 1 Backward Elimination Results:
print(best_variables)
regressionSummary(y, best_model.predict(ds[best_variables]))

['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtUnfSF', 'TotalBsmtSF', 'GrLivArea', 'BedroomAbvGr', 'KitchenAbvGr', 'GarageArea', 'MSZoning_RL', 'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Condition1_Norm', 'Condition2_PosN', 'BldgType_Twnhs', 'BldgType_TwnhsE', 'RoofMatl_CompShg', 'RoofMatl_Metal', 'RoofMatl_Tar&Grv', 'RoofMatl_WdShake', 'RoofMatl_WdShngl', 'Exterior1st_BrkFace', 'Exterior1st_CemntBd', 'Exterior2nd_CmentBd', 'ExterQual_Gd', 'ExterQual_TA', 'BsmtQual_Fa', 'BsmtQual_Gd', 'BsmtQual_TA', 'BsmtExposure_Gd', 'KitchenQual_Fa', 'KitchenQual_Gd', 'KitchenQual_TA', 'Functional_Sev', 'Functional_Typ', 'SaleType_New', 'SaleCondition_Normal', 'SaleCondition_Partial']

Regression statistics

                      Mean Error (ME) : 936.4405
       Root Mean Squared Error (RM

In [None]:
# Feature engineering:
# 2 Forward selection:
# COMMON CODE FOR METHODS 2 and 3 
# The initial model is the constant model - this requires special handling in train_model and score_model
def train_model(variables):
    if len(variables) == 0:
        return None
    model = linear_model.LinearRegression()
    model.fit(x_train[variables], y_train)
    return model
def score_model(model, variables):
    if len(variables) == 0:
        return AIC_score(y_train, [y_train.mean()] * len(y_train), model, df=1)
    return AIC_score(y_train, model.predict(x_train[variables]), model)

In [16]:

# 2 Assign results for Forward Selection:
best_model, best_variables = forward_selection(x_train.columns, train_model, score_model, verbose=True)

Variables: Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, YearBuilt, YearRemodAdd, MasVnrArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces, GarageYrBlt, GarageCars, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, MiscVal, MoSold, YrSold, MSZoning_FV, MSZoning_RH, MSZoning_RL, MSZoning_RM, Street_Pave, Alley_Pave, LotShape_IR2, LotShape_IR3, LotShape_Reg, LandContour_HLS, LandContour_Low, LandContour_Lvl, Utilities_NoSeWa, LotConfig_CulDSac, LotConfig_FR2, LotConfig_FR3, LotConfig_Inside, LandSlope_Mod, LandSlope_Sev, Neighborhood_Blueste, Neighborhood_BrDale, Neighborhood_BrkSide, Neighborhood_ClearCr, Neighborhood_CollgCr, Neighborhood_Crawfor, Neighborhood_Edwards, Neighborhood_Gilbert, Neighborhood_IDOTRR, Neighborhood_MeadowV, Neighborhood_Mitchel, Neighborhood_NAmes, Neighborhood

In [17]:
# 2 Forward Selection results:
print(best_variables)
regressionSummary(y_test, best_model.predict(x_test[best_variables]))

['OverallQual', 'GrLivArea', 'BsmtFinSF1', 'GarageCars', 'Condition2_PosN', 'PoolQC_Gd', 'SaleType_New', 'MSSubClass', 'BsmtExposure_Gd', 'Neighborhood_NridgHt', 'Neighborhood_StoneBr', 'Neighborhood_NoRidge', 'YearBuilt', 'OverallCond', 'RoofMatl_WdShngl', 'LotArea', 'Neighborhood_Crawfor', 'BsmtQual_Gd', 'BsmtQual_TA', 'Condition1_Norm', 'Neighborhood_Somerst', 'RoofMatl_CompShg', 'PoolArea', 'PoolQC_Fa', 'TotalBsmtSF', 'BldgType_Duplex', 'Functional_Typ', 'LotConfig_CulDSac', 'BsmtFullBath', 'KitchenQual_Gd', 'KitchenQual_TA', 'KitchenQual_Fa', 'SaleCondition_Normal', 'BsmtExposure_No', 'RoofMatl_WdShake', 'RoofMatl_Tar&Grv', 'RoofMatl_Membran', 'RoofMatl_Roll', 'Neighborhood_BrkSide', '2ndFlrSF', 'LandContour_Low', 'Exterior1st_BrkFace', 'BedroomAbvGr', 'BsmtQual_Fa', 'Condition2_PosA', 'LotShape_IR2', 'MasVnrArea', 'WoodDeckSF', 'ScreenPorch', 'GarageYrBlt', 'BsmtFinType1_GLQ', '1stFlrSF', 'MasVnrType_BrkFace', 'Foundation_Wood', 'KitchenAbvGr', 'BldgType_2fmCon', 'Street_Pave', '

In [18]:
# Feature engineering
# 3 Stepwise Selection/?Regression? (just additional code to the forward selection):
best_model, best_variables = stepwise_selection(x_train.columns, train_model, score_model, verbose=True)

Variables: Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, YearBuilt, YearRemodAdd, MasVnrArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces, GarageYrBlt, GarageCars, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, MiscVal, MoSold, YrSold, MSZoning_FV, MSZoning_RH, MSZoning_RL, MSZoning_RM, Street_Pave, Alley_Pave, LotShape_IR2, LotShape_IR3, LotShape_Reg, LandContour_HLS, LandContour_Low, LandContour_Lvl, Utilities_NoSeWa, LotConfig_CulDSac, LotConfig_FR2, LotConfig_FR3, LotConfig_Inside, LandSlope_Mod, LandSlope_Sev, Neighborhood_Blueste, Neighborhood_BrDale, Neighborhood_BrkSide, Neighborhood_ClearCr, Neighborhood_CollgCr, Neighborhood_Crawfor, Neighborhood_Edwards, Neighborhood_Gilbert, Neighborhood_IDOTRR, Neighborhood_MeadowV, Neighborhood_Mitchel, Neighborhood_NAmes, Neighborhood

In [19]:
# 3 Stepwise Selection results:
print(best_variables)
regressionSummary(y_test, best_model.predict(x_test[best_variables]))

['OverallQual', 'GarageCars', 'Condition2_PosN', 'PoolQC_Gd', 'SaleType_New', 'MSSubClass', 'BsmtExposure_Gd', 'Neighborhood_NridgHt', 'Neighborhood_StoneBr', 'Neighborhood_NoRidge', 'YearBuilt', 'OverallCond', 'RoofMatl_WdShngl', 'LotArea', 'Neighborhood_Crawfor', 'BsmtQual_Gd', 'BsmtQual_TA', 'Condition1_Norm', 'RoofMatl_CompShg', 'PoolArea', 'PoolQC_Fa', 'TotalBsmtSF', 'Functional_Typ', 'LotConfig_CulDSac', 'KitchenQual_Gd', 'KitchenQual_TA', 'KitchenQual_Fa', 'SaleCondition_Normal', 'BsmtExposure_No', 'RoofMatl_WdShake', 'RoofMatl_Tar&Grv', 'RoofMatl_Membran', 'RoofMatl_Roll', 'Neighborhood_BrkSide', '2ndFlrSF', 'LandContour_Low', 'Exterior1st_BrkFace', 'BedroomAbvGr', 'BsmtQual_Fa', 'Condition2_PosA', 'LotShape_IR2', 'MasVnrArea', 'WoodDeckSF', 'ScreenPorch', 'GarageYrBlt', 'BsmtFinType1_GLQ', '1stFlrSF', 'MasVnrType_BrkFace', 'Foundation_Wood', 'KitchenAbvGr', 'BldgType_2fmCon', 'Street_Pave', 'TotRmsAbvGrd', 'HouseStyle_SLvl', 'Exterior2nd_Stone', 'Exterior2nd_BrkFace', 'MSZonin

# Exhaustive Search
#train
s=linear.fit(x_train,y_train)  
def score_model(model,):
    pred_y = model.predict(x_test)
    # we negate as score is optimized to be as low as possible
    return -adjusted_r2_score(y_test, pred_y,model)

allVariables = ds.drop(predict,axis=1).columns
def train_model(allVariables):
    model = linear_model.LinearRegression()
    model.fit(x_train, y_train)
    return model

def score_model(model,allVariables):
    pred_y = model.predict(x_test)
    # we negate as score is optimized to be as low as possible
    return -adjusted_r2_score(y_test, pred_y, model)


results = exhaustive_search(allVariables, train_model, score_model)
results

data = []
for result in results:
    model = result['model']
    allV = list(result['allV'])
    AIC = AIC_score(y_test, model.predict(x_test), model)
    d = {'n': result['n'], 'r2adj': -result['score'], 'AIC': AIC}
    d.update({var: var in result['allV'] for var in allVariables})
    data.append(d)
pd.DataFrame(data, columns=('n', 'r2adj', 'AIC') + tuple(sorted(allVariables)))

