In [138]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [139]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from mlxtend.regressor import StackingRegressor
from collections import defaultdict
import math
import re

# Load the data

In [140]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [141]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [142]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [143]:
train_df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


# Check categorical columns

In [144]:
def get_column_nans(df): 
    # Get columns and their nan values
    column_nans = {}
    for column in df.columns:
        num_nans = df[column].isnull().values.sum()
        if num_nans > 0:
            column_nans[column] = num_nans
    return column_nans


def print_columns_with_nans(df, name):
    # Print columns that have nan values
    columns_with_nans = get_column_nans(df)
    print("{}".format(name))
    print("This dataframe has {} columns with nans".format(len(columns_with_nans.keys())))
    for column, num_nans in columns_with_nans.items():
        print("- {} (type {}): {} nans".format(column, df[column].dtype, num_nans))
        

In [145]:
print_columns_with_nans(train_df, name="train_df")
print("\n")
print_columns_with_nans(test_df, name="test_df")

train_df
This dataframe has 19 columns with nans
- MasVnrArea (type float64): 8 nans
- PoolQC (type object): 1453 nans
- MasVnrType (type object): 8 nans
- GarageType (type object): 81 nans
- BsmtCond (type object): 37 nans
- LotFrontage (type float64): 259 nans
- MiscFeature (type object): 1406 nans
- GarageFinish (type object): 81 nans
- GarageYrBlt (type float64): 81 nans
- Alley (type object): 1369 nans
- GarageQual (type object): 81 nans
- Electrical (type object): 1 nans
- Fence (type object): 1179 nans
- BsmtFinType2 (type object): 38 nans
- BsmtQual (type object): 37 nans
- FireplaceQu (type object): 690 nans
- BsmtExposure (type object): 38 nans
- BsmtFinType1 (type object): 37 nans
- GarageCond (type object): 81 nans


test_df
This dataframe has 33 columns with nans
- MasVnrType (type object): 16 nans
- BsmtFullBath (type float64): 2 nans
- GarageQual (type object): 78 nans
- Exterior1st (type object): 1 nans
- GarageFinish (type object): 78 nans
- GarageYrBlt (type float64):

In [146]:

def fill_nas(df):
    columns_with_nas = (
        "Alley",
        "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
        "FireplaceQu", 
        "GarageType", "GarageYrBlt", 'GarageFinish', 'GarageQual', 'GarageCond',
        "PoolQC",
        "Fence",
        "MiscFeature",
        "MasVnrType"
    ) 
    for column in columns_with_nas:
        df[column] = df[column].fillna("None")

    # If there is no masonry veneer area, there is 0 square feet
    df["MasVnrArea"] = df["MasVnrArea"].fillna(0)
    
    # If there is no basement, there is no basement area square feet
    df["BsmtFinSF1"] = df["BsmtFinSF1"].fillna(0)
    df["BsmtFinSF2"] = df["BsmtFinSF2"].fillna(0)
    df["BsmtUnfSF"] = df["BsmtUnfSF"].fillna(0)
    df["TotalBsmtSF"] = df["TotalBsmtSF"].fillna(0)
    df["BsmtFullBath"] = df["BsmtFullBath"].fillna(0)
    df["BsmtHalfBath"] = df["BsmtFullBath"].fillna(0)
    
    # If there is no information about the number of cards, it is because there is no garage
    df["GarageCars"] = df["GarageCars"].fillna(0)
    df["GarageArea"] = df["GarageArea"].fillna(0)
    
    # Big assumption: you can't sell a house without electricity installation
    df["Utilities"] = df["Utilities"].fillna("ELO")

    # Set the nans lot frontages as the   
    neighborhoods = df["Neighborhood"].unique()
    median_lot_frontage_by_neighborhood = {}
    for neighborhood in neighborhoods:
        median_lot_frontage_by_neighborhood[neighborhood] = df[df.Neighborhood == neighborhood].LotFrontage.median() 

    df["LotFrontage"] =[
        median_lot_frontage_by_neighborhood[row.Neighborhood] if np.isnan(row.LotFrontage) else row.LotFrontage for _, row in df.iterrows()
    ]
    
    
def encode_quality_columns(df):
    categorical_quality_columns = [
        "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC", "KitchenQual",
        "FireplaceQu", "GarageQual", "GarageCond"
    ]
    quality_labels= {
        "Ex": 5,
        "Gd": 4,
        "TA": 3,
        "Fa": 2,
        "Po": 1,
        "None": 0
    }
    _encode_columns(quality_labels, categorical_quality_columns, df)

    
def encode_pool_quality(df):
    columns = [
        "PoolQC"
    ]
    pool_quality_labels= {
        "Ex": 4,
        "Gd": 3,
        "TA": 2,
        "Fa": 1,
        "None": 0
    }
    _encode_columns(pool_quality_labels, columns, df)
    
    
def encode_basement_rating_columns(df):
    categorical_basement_rating_columns = [
        "BsmtFinType1", "BsmtFinType2"
    ]
    basement_rating_label = {
        "GLQ": 6,
        "ALQ": 5,
        "BLQ": 4,
        "Rec": 3,
        "LwQ": 2,
        "Unf": 1,
        "None": 0
    }
    _encode_columns(basement_rating_label, categorical_basement_rating_columns, df)


def encode_garage_finish_column(df):
    garage_finish_columns = [
        "GarageFinish"
    ]
    garage_finish_label= {
        "Fin": 3,
        "RFn": 2,
        "Unf": 1,
        "None": 0
    }
    _encode_columns(garage_finish_label, garage_finish_columns, df)

    
def encode_utilities(df):
    columns = [
        "Utilities"
    ]
    utilities_labels= {
        "AllPub": 3,
        "NoSewr": 2,
        "NoSeWa": 1,
        "ELO": 0
    }
    _encode_columns(utilities_labels, columns, df)

    
def encode_central_air(df):
    encoding = {
        "N": 0,
        "Y": 1
    }
    _encode_columns(label_encoding_correspondence=encoding, columns=["CentralAir"], df=df)


def _encode_columns(label_encoding_correspondence, columns, df):
    for column in columns:
        df[column] = df[column].map(lambda cell: label_encoding_correspondence.get(cell, 0))
        df[column] = df[column].astype(int)


def cast_types(df):
    df["MSSubClass"] = df["MSSubClass"].astype(str)
    df["OverallQual"] = df["OverallQual"].astype(int)
    df["OverallCond"] = df["OverallCond"].astype(int)
    df["MoSold"] = df["MoSold"].astype(str)
    
    
def transform_columns(df):
    fill_nas(df)
    encode_quality_columns(df)
    encode_pool_quality(df)
    encode_basement_rating_columns(df)
    encode_garage_finish_column(df)
    encode_central_air(df)
    cast_types(df)
    



In [147]:
transform_columns(train_df)
transform_columns(test_df)

In [148]:
print_columns_with_nans(train_df, name="train_df")
print("\n")
print_columns_with_nans(test_df, name="test_df")

train_df
This dataframe has 1 columns with nans
- Electrical (type object): 1 nans


test_df
This dataframe has 5 columns with nans
- Exterior1st (type object): 1 nans
- Functional (type object): 2 nans
- Exterior2nd (type object): 1 nans
- SaleType (type object): 1 nans
- MSZoning (type object): 4 nans


In [149]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null object
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 no

In [150]:
def get_categorical_columns(df):
    categorical_columns = []
    for column in df.columns:
        if df[column].dtype == "object":
            categorical_columns.append(column)
    return categorical_columns

In [151]:
categorical_columns_train_df = get_categorical_columns(train_df)
print("Categorical columns in train_df ({}): {}".format(len(categorical_columns_train_df), categorical_columns_train_df))

categorical_columns_test_df = get_categorical_columns(test_df)
print("Categorical columns in test_df ({}): {}".format(len(categorical_columns_test_df), categorical_columns_test_df))

Categorical columns in train_df (32): ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtExposure', 'Heating', 'Electrical', 'Functional', 'GarageType', 'GarageYrBlt', 'PavedDrive', 'Fence', 'MiscFeature', 'MoSold', 'SaleType', 'SaleCondition']
Categorical columns in test_df (32): ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtExposure', 'Heating', 'Electrical', 'Functional', 'GarageType', 'GarageYrBlt', 'PavedDrive', 'Fence', 'MiscFeature', 'MoSold', 'SaleType', 'SaleCondition']


In [152]:
# Fill the categorical columns, creating dummy (1/0) columns
expanded_train_df = pd.get_dummies(train_df, dummy_na=True)
expanded_test_df = pd.get_dummies(test_df, dummy_na=True)

In [153]:
# In case any column of test is not present in train, set it to zero
all_columns = set(expanded_train_df.columns).union(set(expanded_test_df.columns)) - set(["SalePrice"])
for column in all_columns:
    if column not in expanded_train_df.columns:
        expanded_train_df[column] = 0
    if column not in expanded_test_df.columns:
        expanded_test_df[column] = 0

In [154]:
# For each NAN fill it to the median value of that column (this shouldn't happen)
if expanded_train_df.isnull().values.any():
    ready_train_df = expanded_train_df.fillna(expanded_train_df.median())
    print("NANs found in transformed training dataset!")
else:
    print("No NANs found in transformed training dataset!")
    
if expanded_test_df.isnull().values.any():
    ready_test_df = expanded_test_df.fillna(expanded_test_df.median())
    print("NANs found in transformed test dataset!")
else:
    print("No NANs found in transformed test dataset!")

No NANs found in transformed training dataset!
No NANs found in transformed test dataset!


# New attributes

In [155]:
def add_new_features(df):
    pass
    # Built area in sq. feet: LotArea - 1stFlrSF
    df["BuiltAreaSF"] = df["LotArea"] - df["1stFlrSF"]
    # Total home area: 1stFlrSF + 2stFlSF + TotalBsmtSF
    df["TotalHomeAreaSF"] = df["1stFlrSF"] + df["2ndFlrSF"] + df["TotalBsmtSF"]

In [156]:
# Create the new features for both data sets
add_new_features(expanded_train_df)
add_new_features(expanded_test_df)

In [157]:
ready_train_df = expanded_train_df
ready_test_df = expanded_test_df

# Correlations

In [158]:
correlation_matrix = ready_train_df.corr()
correlation_values = correlation_matrix["SalePrice"].sort_values(ascending=False)
print(correlation_values)

SalePrice                1.000000
OverallQual              0.790982
TotalHomeAreaSF          0.782260
GrLivArea                0.708624
ExterQual                0.682639
KitchenQual              0.659600
GarageCars               0.640409
GarageArea               0.623431
TotalBsmtSF              0.613581
1stFlrSF                 0.605852
BsmtQual                 0.585207
FullBath                 0.560664
GarageFinish             0.549247
TotRmsAbvGrd             0.533723
YearBuilt                0.522897
FireplaceQu              0.520438
YearRemodAdd             0.507101
Foundation_PConc         0.497734
MasVnrArea               0.472614
Fireplaces               0.466929
HeatingQC                0.427649
Neighborhood_NridgHt     0.402149
BsmtFinSF1               0.386420
MSSubClass_60            0.377197
SaleType_New             0.357509
SaleCondition_Partial    0.352060
LotFrontage              0.349876
GarageType_Attchd        0.335961
MasVnrType_Stone         0.330476
Neighborhood_N

In [159]:
feature_correlation_pairs = []
for feature, value in correlation_values.items():
    feature_correlation_pairs.append((feature, abs(value)))
    
sorted_feature_correlation_pairs = sorted(feature_correlation_pairs, key=lambda pair: pair[1], reverse=True)    

most_correlated_features = [
    feature_correlation_pair[0]
    for feature_correlation_pair in sorted_feature_correlation_pairs 
]
for sorted_feature_correlation_pair in sorted_feature_correlation_pairs:
    print(sorted_feature_correlation_pair)

('SalePrice', 1.0)
('OverallQual', 0.7909816005838047)
('TotalHomeAreaSF', 0.7822600527979842)
('GrLivArea', 0.7086244776126511)
('ExterQual', 0.6826392416562591)
('KitchenQual', 0.6595997207286572)
('GarageCars', 0.640409197258349)
('GarageArea', 0.6234314389183598)
('TotalBsmtSF', 0.6135805515591944)
('1stFlrSF', 0.6058521846919166)
('BsmtQual', 0.5852071991725148)
('FullBath', 0.5606637627484452)
('GarageFinish', 0.5492467563332154)
('TotRmsAbvGrd', 0.5337231555820238)
('YearBuilt', 0.5228973328794967)
('FireplaceQu', 0.5204376059504007)
('YearRemodAdd', 0.5071009671113867)
('Foundation_PConc', 0.4977337525869438)
('MasVnrArea', 0.47261449900457725)
('Fireplaces', 0.4669288367515242)
('HeatingQC', 0.4276487073988049)
('Neighborhood_NridgHt', 0.40214859817526666)
('BsmtFinSF1', 0.38641980624215627)
('MSSubClass_60', 0.37719706842810213)
('MasVnrType_None', 0.367456365193253)
('SaleType_New', 0.35750940508319695)
('GarageType_Detchd', 0.3541407884127012)
('SaleCondition_Partial', 0.35

In [176]:
number_of_best_features_to_keep = 230

def drop_worst_features(df, most_correlated_features):
    # First most correlated feature is allways SalePrice, so we have to add 1 to the selection
    # (we will drop SalePrice later)
    selected_features_to_drop = most_correlated_features[number_of_best_features_to_keep+1:]
    print("Droping {} columns".format(len(selected_features_to_drop)))
    resultant_df = df.drop(selected_features_to_drop, axis=1)
    if "Id" in list(resultant_df.columns):
        resultant_df = resultant_df.drop("Id", axis=1)
    return resultant_df

final_train_df = drop_worst_features(ready_train_df, most_correlated_features)
final_test_df = drop_worst_features(ready_test_df, most_correlated_features)

print("{} selected columns: {}".format(len(final_train_df.columns), list(final_test_df.columns)))

Droping 181 columns
Droping 181 columns
231 selected columns: ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'MSSubClass_120', 'MSSubClass_160', 'MSSubClass_180', 'MSSubClass_190', 'MSSubClass_20', 'MSSubClass_30', 'MSSubClass_45', 'MSSubClass_50', 'MSSubClass_60', 'MSSubClass_85', 'MSSubClass_90', 'MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'Street_Grvl', 'Street_Pave', 'Alley_Grvl', 'Alley_None', 'LotShape_IR1', 'LotShape_IR2', 'L

In [162]:
final_train_df.describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtQual,BsmtCond,...,SaleType_ConLD,SaleType_New,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,BuiltAreaSF,TotalHomeAreaSF
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,70.199658,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.117123,3.39589,3.489041,2.934932,...,0.006164,0.083562,0.867808,0.069178,0.00274,0.013699,0.820548,0.085616,9354.20137,2567.04863
std,22.431902,9981.264932,1.382997,1.112799,30.202904,20.645407,180.731373,0.57428,0.876478,0.552159,...,0.078298,0.276824,0.338815,0.253844,0.052289,0.116277,0.383862,0.279893,9872.384401,821.714421
min,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,735.0,334.0
25%,60.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,3.0,3.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,6476.0,2009.5
50%,70.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,3.0,4.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,8317.0,2474.0
75%,80.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,4.0,4.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,10312.75,3004.0
max,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5.0,5.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,213209.0,11752.0


In [163]:
final_test_df.describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtQual,BsmtCond,...,SaleType_New,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Exterior2nd_Other,BuiltAreaSF,TotalHomeAreaSF
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,...,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,68.955106,9819.161069,6.078821,5.553804,1971.357779,1983.662783,99.673749,3.397533,3.466073,2.901988,...,0.080192,0.862234,0.061001,0.005483,0.01782,0.825223,0.082248,0.0,8662.626456,2527.903358
std,20.999091,4955.517327,1.436812,1.11374,30.390071,21.130467,177.001792,0.586444,0.933697,0.596621,...,0.271683,0.344772,0.239414,0.073871,0.132344,0.379907,0.274837,0.0,4786.915262,787.961712
min,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,820.0,612.0
25%,60.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,3.0,3.0,3.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,6340.0,1985.0
50%,70.0,9399.0,6.0,5.0,1973.0,1992.0,0.0,3.0,3.0,3.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,8296.0,2428.0
75%,80.0,11517.5,7.0,6.0,2001.0,2004.0,162.0,4.0,4.0,3.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,10205.5,2976.0
max,200.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,5.0,5.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,55450.0,10190.0


In [164]:
# Prepare input data to regressors
y = final_train_df["SalePrice"].values

X = final_train_df.drop("SalePrice", axis=1).values

test_x = final_test_df.values

In [165]:
print y.shape
print X.shape
print test_x.shape

(1460,)
(1460, 220)
(1459, 220)


In [166]:
print("Checking there is no NANs or infinity values")

print("Checking if X is OK")
print(not np.any(np.isnan(X)))
print(np.all(np.isfinite(X)))

print("Checking test_x is OK")
print(not np.any(np.isnan(test_x)))
print(np.all(np.isfinite(test_x)))


Checking there is no NANs or infinity values
Checking if X is OK
True
True
Checking test_x is OK
True
True


In [167]:
# Usefull runner
class RegressorRunner(object):
    
    def __init__(self, pipeline, parameters, cv=5, debug=True):
        self.pipeline = pipeline
        self.parameters = parameters
        self.grid_search = GridSearchCV(self.pipeline, self.parameters, cv=cv)
        self.debug = debug
        self.prediction = None
        self.X_train = None
        self.y_train = None
        
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.grid_search.fit(X, y)

    @property
    def best_params(self):
        return self.grid_search.best_params_
    
    @property
    def best_estimator(self):
        return self.grid_search.best_estimator_
    
    def get_scores(self, X, y, num_folds=5):
        scores = cross_val_score(self.grid_search.best_estimator_, X, y, cv=num_folds)
        return scores
    
    @property
    def feature_importances(self):
        classifier_step_index = 0
        for step_name, step_process in self.grid_search.best_estimator_.steps:
            if step_name == "regressor":
                break
            classifier_step_index += 1
        feature_importances = self.grid_search.best_estimator_.steps[classifier_step_index][1].feature_importances_
        return sorted(zip(feature_importances, selected_features), reverse=True)
    
    def predict(self, X_test):
        if self.prediction is None:
            self.prediction = self.grid_search.predict(X_test)
        return self.prediction
    
    def mean_squared_error(self, X, y):
        y_predicted = self.grid_search.predict(X)
        return mean_squared_error(y, y_predicted)

    def apply_predicition_to_df(self, X_test, test_df, output_filename, estimator_for_negatives=None):
        if self.prediction is None:
            self.predict(X_test)
        # Add the prediction to the test dataset
        estimated_test_df = test_df.assign(SalePrice=list(self.prediction))
        
        # Count nevative prices
        if self.debug:
            number_of_negative_prices = 0
            for i, row in estimated_test_df.iterrows():
                    if row["SalePrice"] <= 0:
                        number_of_negative_prices += 1
            print("{} houses have negative prices of {}".format(number_of_negative_prices, estimated_test_df.shape[0]))
        
        # IMPORTANT PATCH: NO SALE PRICE MUST BE NEGATIVE
        
        #estimated_test_df = estimated_test_df.apply(
        #    lambda row: row.SalePrice if row.SalePrice > 0 else estimated_test_df[row.Neighborhood]["SalePrice"].median() 
        #)
        
        # In case there is any negative SalePrice, set it to 0
        estimated_test_df["SalePrice"] = estimated_test_df["SalePrice"].map(
            lambda sale_price: np.nan if sale_price < 0 else sale_price
        )

        estimated_test_df["SalePrice"].fillna(estimated_test_df["SalePrice"].median(), inplace=True)

        # Save 
        estimated_test_df.to_csv(output_filename, columns=["Id", "SalePrice"], index=False)

# Run the regressors

In [168]:
rmses = {}

## KNN

In [169]:
# KNN regressor
pipeline = Pipeline([
    ("regressor", KNeighborsRegressor())
])

parameters = { 
    'regressor__n_neighbors': [3, 5, 7, 10],
    'regressor__weights': ["uniform", "distance"],
    'regressor__algorithm': ["auto", "ball_tree", "kd_tree", "brute"],
    'regressor__n_jobs': [-1]
}


best_parameters = {'regressor__algorithm': ['auto'], 'regressor__n_jobs': [-1], 'regressor__weights': ['distance'], 'regressor__n_neighbors': [7]}
knn_runner = RegressorRunner(pipeline=pipeline, parameters=best_parameters)

knn_runner.fit(X, y)

print ("Best parameters found for KNN regression: ")
print (knn_runner.best_params)

knn_rmse = knn_runner.mean_squared_error(X, y)
print("RMSE of training data {}".format(knn_rmse))

rmses["knn"] = knn_rmse

knn_runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_knn.csv")


Best parameters found for KNN regression: 
{'regressor__algorithm': 'auto', 'regressor__n_jobs': -1, 'regressor__weights': 'distance', 'regressor__n_neighbors': 7}
RMSE of training data 0.0
0 houses have negative prices of 1459


## Linear methods

In [170]:
# Linear regressor
pipeline = Pipeline([
    ("regressor", LinearRegression())
])

parameters = { 
    'regressor__fit_intercept': [True, False],
    'regressor__n_jobs': [-1]
}

linear_runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

linear_runner.fit(X, y)

print ("Best parameters found for Linear regression: ")
print (linear_runner.best_params)

linear_rmse = linear_runner.mean_squared_error(X, y)
print("RMSE of training data {}".format(linear_rmse))

rmses["linear"] = linear_rmse

linear_runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_ln.csv")

Best parameters found for Linear regression: 
{'regressor__fit_intercept': False, 'regressor__n_jobs': -1}
RMSE of training data 628556978.201
130 houses have negative prices of 1459


In [171]:
# Ridge regressor
pipeline = Pipeline([
    ("regressor", Ridge())
])

parameters = { 
    'regressor__alpha': [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'regressor__fit_intercept': [True],
    'regressor__normalize': [True],
    'regressor__solver': ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"],
    'regressor__random_state': [1, 2, 3, 4, 5]
}

ridge_runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

ridge_runner.fit(X, y)

print ("Best parameters found for Ridge: ")
print (ridge_runner.best_params)

ridge_rmse = ridge_runner.mean_squared_error(X, y)
print("RMSE of training data {}".format(ridge_rmse))

rmses["ridge"] = ridge_rmse

ridge_runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_ridge.csv")

Best parameters found for Ridge: 
{'regressor__fit_intercept': True, 'regressor__solver': 'sag', 'regressor__random_state': 2, 'regressor__alpha': 0.5, 'regressor__normalize': True}
RMSE of training data 771974460.754
0 houses have negative prices of 1459


In [172]:
# LASSO regressor
pipeline = Pipeline([
    ("regressor", Lasso())
])

parameters = { 
    'regressor__alpha': [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'regressor__fit_intercept': [True],
    'regressor__normalize': [True],
    'regressor__selection': ["random"],
    'regressor__random_state': [1, 2, 3, 4, 5]
}

lasso_runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

lasso_runner.fit(X, y)

print ("Best parameters found for Lasso: ")
print (lasso_runner.best_params)

lasso_rmse = lasso_runner.mean_squared_error(X, y)
print("RMSE of training data {}".format(lasso_rmse))

rmses["lasso"] = lasso_rmse

lasso_runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_lasso.csv")

Best parameters found for Lasso: 
{'regressor__fit_intercept': True, 'regressor__selection': 'random', 'regressor__random_state': 3, 'regressor__alpha': 10, 'regressor__normalize': True}
RMSE of training data 674009505.995
0 houses have negative prices of 1459


In [173]:
# ElasticNet regressor
pipeline = Pipeline([
    ("regressor", ElasticNet())
])

parameters = { 
    'regressor__alpha': [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'regressor__fit_intercept': [True],
    'regressor__normalize': [True],
    'regressor__selection': ["random"],
    'regressor__random_state': [1, 2, 3, 4, 5]
}

elasticnet_runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

elasticnet_runner.fit(X, y)

print ("Best parameters found for ElasticNet: ")
print (elasticnet_runner.best_params)

elasticnet_rmse = elasticnet_runner.mean_squared_error(X, y)
print("RMSE of training data {}".format(elasticnet_rmse))

rmses["elasticnet"] = elasticnet_rmse

elasticnet_runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_elasticnet.csv")

Best parameters found for ElasticNet: 
{'regressor__fit_intercept': True, 'regressor__selection': 'random', 'regressor__random_state': 1, 'regressor__alpha': 0.5, 'regressor__normalize': True}
RMSE of training data 5900759332.87
0 houses have negative prices of 1459


In [174]:
# Get the best linear method with knn as base
import operator
sorted(rmses.items(), key=operator.itemgetter(1))

[('knn', 0.0),
 ('linear', 628556978.20088589),
 ('lasso', 674009505.99494207),
 ('ridge', 771974460.75432837),
 ('elasticnet', 5900759332.8737917)]

## Other methods

In [None]:
# Gradient boost regressor
pipeline = Pipeline([
    ("regressor", GradientBoostingRegressor())
])

parameters = { 
    'regressor__loss': ["ls", "lad", "huber", "quantile"],
    'regressor__learning_rate': [0.1],
    'regressor__n_estimators': [100, 200],
    'regressor__criterion': ["mse", "mae", "friedman_mse"],
    'regressor__random_state': [1, 2, 3, 4, 5],
    'regressor__max_depth': [100, 300, 500, 1000],
    'regressor__max_features': ['sqrt', 'auto', 'log2', None],
    'regressor__min_samples_split': [2, 3, 10],
    'regressor__min_samples_leaf': [1, 3, 10],
    'regressor__presort': [True, False]
}

runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

runner.fit(X, y)

print ("Best parameters found for Gradient boost: ")
print (runner.best_params)

rmse = runner.mean_squared_error(X, y)
print("RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_gradient_boost.csv")

In [None]:
# Polynomial regressor
for degree in [2, 3]:
    print("Polynomial regression {}: ".format(degree))
    pipeline = make_pipeline(PolynomialFeatures(degree), Ridge())
    runner = RegressorRunner(pipeline=pipeline, parameters={})

    runner.fit(X, y)

    print ("- Best parameters found for polynomial regression {}: {}".format(degree, runner.best_params))

    rmse = runner.mean_squared_error(X, y)
    print("- RMSE of training data {}".format(rmse))
    
    runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_poly_{}.csv".format(degree))

    print("")

In [None]:
# Defision tree regressor
pipeline = Pipeline([
    ("regressor", DecisionTreeRegressor())
])

parameters = { 
    'regressor__criterion': ["mse", "mae", "friedman_mse"],
    'regressor__random_state': [1, 2, 3, 4, 5],
    'regressor__max_depth': [100, 300, 500, 1000],
    'regressor__max_features': ['sqrt', 'auto', 'log2', None],
    'regressor__min_samples_split': [2, 3, 10],
    'regressor__min_samples_leaf': [1, 3, 10],
    'regressor__presort': [True, False]
}

runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

runner.fit(X, y)

print ("Best parameters found for Decision Tree regression: ")
print (runner.best_params)

rmse = runner.mean_squared_error(X, y)
print("- RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_dt.csv")

In [None]:
# Gaussian Process regressor
pipeline = Pipeline([
    ("regressor", GaussianProcessRegressor())
])


parameters = {
    'regressor__random_state': [1,2,3,4,5]
}

runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

runner.fit(X, y)

print ("Best parameters found for Gaussian Process regression: ")
print (runner.best_params)

rmse = runner.mean_squared_error(X, y)
print("- RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_gp.csv")

In [None]:
# Random forest regressor
pipeline = Pipeline([
    ("regressor", RandomForestRegressor())
])

best_parameters = {
    'regressor__n_estimators': [100], 
    'regressor__criterion': ["mae"],
    'regressor__max_features': ['sqrt'],
    'regressor__min_samples_split': [3],
    'regressor__min_samples_leaf': [1],
    'regressor__bootstrap': [False],
    'regressor__n_jobs': [-1]
}

parameters = {
    'regressor__n_estimators': [10, 20, 30, 40, 100], 
    'regressor__criterion': ["mse", "mae"],
    'regressor__max_features': ['sqrt', 'auto', 'log2', None],
    'regressor__min_samples_split': [2, 3, 10],
    'regressor__min_samples_leaf': [1, 3, 10],
    'regressor__bootstrap': [True, False],
    'regressor__n_jobs': [-1]
}

runner = RegressorRunner(pipeline=pipeline, parameters=best_parameters)

runner.fit(X, y)

print ("Best parameters found for RF regression: ")
print (runner.best_params)

rmse = runner.mean_squared_error(X, y)
print("- RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_rf.csv")

In [30]:
# MLPRegressor
mlp_pipeline = Pipeline([
    ("regressor", MLPRegressor())
])

mlp_parameters = {
    'regressor__hidden_layer_sizes': [(300, 300), (500, 500)], 
    'regressor__alpha': [0.1, 0.5, 1, 2, 3, 3.5, 4, 5],
    'regressor__activation': ['identity', 'relu', 'logistic', 'tanh'],
    'regressor__solver': ["lbfgs", "sgd", "adam"]
}

mlp_runner = RegressorRunner(pipeline=mlp_pipeline, parameters=mlp_parameters)

mlp_runner.fit(X, y)

print ("Best parameters found for MLP regression: ")
print (mlp_runner.best_params)

rmse = mlp_runner.mean_squared_error(X, y)
print("- RMSE of training data {}".format(rmse))

mlp_runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_mlp.csv")



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# SVM regressor
pipeline = Pipeline([
    ("regressor", SVR())
])

parameters = {
    'regressor__C': [1, 2], 
    'regressor__epsilon': [0.1, 0.05],
    'regressor__kernel': ['rbf', 'linear', 'poly', "sigmoid", "precomputed"]
}

runner = RegressorRunner(pipeline=pipeline, parameters=parameters)

runner.fit(X, y)

print ("Best parameters found for SVM regression: ")
print (runner.best_params)

rmse = runner.mean_squared_error(X, y)
print("- RMSE of training data {}".format(rmse))

runner.apply_predicition_to_df(test_x, test_df, output_filename="results/test_estimated_with_SVM.csv")

## Ensembled method

In [177]:
# Ensembleb inspired by https://www.kaggle.com/vhrique/simple-house-price-prediction-stacking

# Initialize models with gridsearch resultant hyperparameters
lr = LinearRegression(
    fit_intercept=False,
    n_jobs = -1
)

#rd = Ridge(
#    fit_intercept=False, solver='auto', random_state=1, alpha=2.0, normalize=True
#)

ls = Lasso(
    fit_intercept=True, selection="random", random_state=3, alpha=10, normalize=True
)

rf = RandomForestRegressor(
    max_depth=3,
    n_jobs = -1,
    min_samples_split=3, bootstrap=False,
    max_features='sqrt', min_samples_leaf=1, n_estimators=100,
    criterion='mae'
)

gb = GradientBoostingRegressor(
    n_estimators = 40,
    max_depth = 2
)

nn = MLPRegressor(
    hidden_layer_sizes = (300, 300),
    alpha = 0.5,
    activation="relu",
    solver="lbfgs"
)

# Initialize Ensemble
stacking_model = StackingRegressor(
    regressors=[rf, gb, nn, ls],
    meta_regressor=lr
)

# Fit the model on our data
stacking_model.fit(X, y)

training_y_prediction = stacking_model.predict(X)
mse = mean_squared_error(y, training_y_prediction)

print("- RMSE of training data {}".format(mse))

test_y_prediction = stacking_model.predict(test_x)
estimated_test_df = test_df.assign(SalePrice=list(test_y_prediction))

# Count nevative prices
number_of_negative_prices = 0
for i, row in estimated_test_df.iterrows():
        if row["SalePrice"] <= 0:
            number_of_negative_prices += 1
print("{} houses have negative prices of {}".format(number_of_negative_prices, estimated_test_df.shape[0]))
        
# In case there is any negative SalePrice, set it to 0
estimated_test_df["SalePrice"] = estimated_test_df["SalePrice"].map(
    lambda sale_price: np.nan if sale_price < 0 else sale_price
)

estimated_test_df["SalePrice"].fillna(estimated_test_df["SalePrice"].median(), inplace=True)

# Save 
estimated_test_df.to_csv("results/test_estimated_with_stacking_model.csv", columns=["Id", "SalePrice"], index=False)

- RMSE of training data 458375972.917
0 houses have negative prices of 1459
