In [352]:
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [353]:
# import dataset
df = pd.read_csv("Housing_Prices_Modeling.csv")
print(df.shape)
df.head()

(1198, 97)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,total_sqft,outdoor_living_sqft,Alley_simple_dummy,MasVnrType_simple_dummy,BsmtQual_simple_dummy,FireplaceQu_simple_dummy,GarageType_simple_dummy,PoolQC_simple_dummy,Fence_simple_dummy,MiscFeature_simple_dummy
0,0.0,2-STORY 1946 & NEWER,RL,0.207668,0.03342,Pave,Missing,Reg,Lvl,AllPub,...,0.347339,0.059396,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.000685,1-STORY 1946 & NEWER ALL STYLES,RL,0.255591,0.038795,Pave,Missing,Reg,Lvl,AllPub,...,0.340803,0.290166,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
2,0.001371,2-STORY 1946 & NEWER,RL,0.217252,0.046507,Pave,Missing,IR1,Lvl,AllPub,...,0.369125,0.040896,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
3,0.002742,2-STORY 1946 & NEWER,RL,0.268371,0.060576,Pave,Missing,IR1,Lvl,AllPub,...,0.468254,0.268744,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
4,0.003427,1-1/2 STORY FINISHED ALL AGES,RL,0.271565,0.059899,Pave,Missing,IR1,Lvl,AllPub,...,0.283847,0.379747,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


In [354]:
# import features transfer object
transfer_info_json = open("feature_lists_updated.json", 'r')
transfer_info_json = transfer_info_json.read()
transfer_info = json.loads(transfer_info_json)
print(transfer_info_json)

{
    "fullList": [
        "Id",
        "MSSubClass",
        "MSZoning",
        "LotFrontage",
        "LotArea",
        "Street",
        "Alley",
        "LotShape",
        "LandContour",
        "Utilities",
        "LotConfig",
        "LandSlope",
        "Neighborhood",
        "Condition1",
        "Condition2",
        "BldgType",
        "HouseStyle",
        "OverallQual",
        "OverallCond",
        "YearBuilt",
        "YearRemodAdd",
        "RoofStyle",
        "RoofMatl",
        "Exterior1st",
        "Exterior2nd",
        "MasVnrType",
        "MasVnrArea",
        "ExterQual",
        "ExterCond",
        "Foundation",
        "BsmtQual",
        "BsmtCond",
        "BsmtExposure",
        "BsmtFinType1",
        "BsmtFinSF1",
        "BsmtFinType2",
        "BsmtFinSF2",
        "BsmtUnfSF",
        "TotalBsmtSF",
        "Heating",
        "HeatingQC",
        "CentralAir",
        "Electrical",
        "1stFlrSF",
        "2ndFlrSF",
        "LowQualFinSF

In [355]:
def columns_to_remove(dont_remove=[]):
    """
    this function generates of features to remove and takes a list of features not to remove.
    If the list is empty no features are removed.
    """
    remove_columns = []

    for key in transfer_info.keys():
        if 'Group' in key:
            for second_key in transfer_info[key].keys():
                if 'reference' in second_key:
                    ref = second_key
                else:
                    new = second_key
            
            remove_columns += transfer_info[key][new]

    for i in dont_remove:
        if i in remove_columns:
            remove_columns.remove(i)
        
        
    return remove_columns

In [356]:
def model_prep(model_df):
    """
    This function prepares a dataframe for modeling.
    It will evaluate which options are submitted and clean the data accordingly.
    It will then convert the categorical features to dummies.  
    Finally it will generate a train/test split of the data.
    """
    # convert to string so these features do not get concerted to dummies
    model_df['MoSold'] = model_df['MoSold'].astype(str)
    model_df['YrSold'] = model_df['YrSold'].astype(str)

    model_df.dropna(inplace=True)
    dummies = pd.get_dummies(model_df, drop_first=False)

    
    # splitting on the year the home was sold because it doesn't make sense for the model to learn from home sales
    # in 2010 and hold out sales in previous years.  The assumption here is buyers & sellers look at historical data to 
    # inform the purchase/selling price
    train = dummies[dummies['YrSold_2010'] == 0]
    test = dummies[dummies['YrSold_2010'] == 1]

    y_train = train['SalePrice']
    y_test  = test['SalePrice']
    
    X_train = train.drop(columns=['SalePrice'])
    X_test = test.drop(columns=['SalePrice'])

    return X_train, X_test, y_train, y_test

In [357]:
def run_models(X_train, X_test, y_train, y_test, algo):
    """
    This function takes the train/test split data as well as a algorithm to train on.
    Then the function tests the model with the provided test data.
    """
    results = algo["model"].fit(X_train, y_train)

    y_pred = results.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)

    return pd.DataFrame({"Model Name": [algo["name"]],
                        "MAE": [mae]})

In [358]:
def process(df):
    """
    This function loops there the algoithms, whether to normalize the data, keep only "normal" sales or not,
    and whether to keep only residential homes or not.
    It takes model results, adds what options were submitted and a feature count, then returns a dataframe
    """
    algos = [{
            "name": "Linear Regression",
            "model": LinearRegression()
            },
            {
            "name": "Random Forset",
            "model": RandomForestRegressor()
            },
             {
            "name": "Gradient Boosting",
            "model": GradientBoostingRegressor()
            },
            {
            "name": "Stochastic Gradient Descent",
             "model": SGDRegressor()
            },
            {
            "name": "Support Vector Machine",
            "model": LinearSVR()
            }
              ]

    data_results = pd.DataFrame()
    
    for algo in algos:
        print(algo)
        X_train, X_test, y_train, y_test = model_prep(df)

        resutls_df = run_models(X_train, X_test, y_train, y_test, algo)

        resutls_df['Feature Amount'] = X_train.shape[1]

        data_results = pd.concat([data_results, resutls_df])
    
    return data_results

In [359]:
# The BIG LOOP
# loop through all testing options and gather results

all_results = pd.DataFrame()

for key in transfer_info.keys():
    print(key)
    if key == 'fullList':
        model_df = df[transfer_info[key]]
        results = process(model_df)
    elif key == 'qualities':
        remove = columns_to_remove() + transfer_info[key]
        model_df = df.drop(columns=remove)
        results = process(model_df)
    elif key == 'condistions':
        remove = columns_to_remove() + transfer_info[key]
        model_df = df.drop(columns=remove)
        results = process(model_df)
    elif key == 'removables':
        remove = columns_to_remove() + transfer_info[key]
        model_df = df.drop(columns=remove) 
        results = process(model_df)

    elif 'Group' in key:
        for second_key in transfer_info[key].keys():
            if 'reference' in second_key:
                ref = second_key
            else:
                new = second_key

        for contorl in [True, False]:
            if contorl:
                remove = columns_to_remove()
                model_df = df.drop(columns=remove)
            else:
                remove = columns_to_remove(dont_remove=transfer_info[key][new])
                model_df = df.drop(columns=remove)
                
            results = process(model_df)
            results["Datasets - 2"] = second_key
            results["Control"] = contorl
            results["Datasets - 1"] = key
            all_results = pd.concat([all_results, results])

    results["Datasets - 1"] = key
    all_results = pd.concat([all_results, results])

fullList
{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




missingGroup
{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




removables
{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




qualities
{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




condistions
{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




ageGroup
{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




bathGroup
{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




sqftGroup
{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




outdoorGroup
{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




{'name': 'Linear Regression', 'model': LinearRegression()}
{'name': 'Random Forset', 'model': RandomForestRegressor()}
{'name': 'Gradient Boosting', 'model': GradientBoostingRegressor()}
{'name': 'Stochastic Gradient Descent', 'model': SGDRegressor()}
{'name': 'Support Vector Machine', 'model': LinearSVR()}




In [360]:
# fix the normalzied MAEs
sale_price = pd.read_csv("Housing_Prices_EDA.csv")["SalePrice"]
sale_price_min = sale_price.min()
sale_price_max = sale_price.max()

def unnormalize(value):
    return (value * (sale_price_max - sale_price_min)) + sale_price_min

In [361]:
all_results["MAE"] = all_results["MAE"].apply(unnormalize)
all_results.head()    

Unnamed: 0,Model Name,MAE,Feature Amount,Datasets - 1,Datasets - 2,Control
0,Linear Regression,6833789000000000.0,327,fullList,,
0,Random Forset,50835.17,327,fullList,,
0,Gradient Boosting,50238.75,327,fullList,,
0,Stochastic Gradient Descent,55384.84,327,fullList,,
0,Support Vector Machine,52486.87,327,fullList,,


In [362]:
all_results.to_csv("Model_Results.csv", index=False)