In [138]:
import pandas as pd
import numpy as np
import plotly.express as px
import json

In [139]:
# import dataset
df = pd.read_csv("Housing_Prices_EDA.csv")
print(df.shape)
df.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,2-STORY 1946 & NEWER,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,...,0,Missing,Missing,Missing,0,2,2008,WD,Normal,208500
1,2,1-STORY 1946 & NEWER ALL STYLES,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,...,0,Missing,Missing,Missing,0,5,2007,WD,Normal,181500
2,3,2-STORY 1946 & NEWER,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,9,2008,WD,Normal,223500
3,4,2-STORY 1945 & OLDER,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,2,2006,WD,Abnorml,140000
4,5,2-STORY 1946 & NEWER,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,12,2008,WD,Normal,250000


In [140]:
# import features transfer object
transfer_info_json = open("feature_lists.json", 'r')
transfer_info = transfer_info_json.read()
transfer_info = json.loads(transfer_info)
print(transfer_info)

{'fullList': ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', '

Cleaning Ideas from EDA
1. create age feature, age of revno

2. total bathrooms

3. Work out Square footage features, do you need everything? maybe total sqft

4. condence porch/deck features

5. Only use dummy yes/no has feature, not quality, condition or description of item (simple dummy)

6. 1 Exterior feature

7. combine conditions/qualities, pick condition or quality feature, not both

8. Do you need house subclass?

9. how will you deal with outliers? Standardize? natural log?

10. If you're modeling "housing prices" you should remove non-houses rows in MSZoning. Also Maybe focus on "Normal" SaleCondition

In [141]:
def viz_data(feature_list):
    for feature in feature_list:
        fig = px.box(df, x=feature)
        fig.show()

In [142]:
# create age features
df['home_age_at_sale'] = df['YrSold'] - df['YearBuilt']
df['remodel_age_at_sale'] = df['YrSold'] - df['YearRemodAdd']
df['garage_age_at_sale'] = df['YrSold'] - df['GarageYrBlt']

# create comparing lists
age_ref_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
age_features = ['home_age_at_sale', 'remodel_age_at_sale', 'garage_age_at_sale']

# adding new groups to features list
transfer_info["ageGroup"] = {
    "newAge": age_features,
    "referenceAge": age_ref_features,
}

# visualize new features
viz_data(age_features)

In [143]:
# total bathrooms
df['total_full_bathrooms'] = df['BsmtFullBath'] + df['FullBath']
df['total_half_bathrooms'] = df['BsmtHalfBath'] = df['HalfBath']


bath_ref_features = ['BsmtFullBath', 'FullBath', 'BsmtHalfBath', 'HalfBath']
bath_features = ['total_full_bathrooms', 'total_half_bathrooms']

transfer_info["bathGroup"] = {
    "newBaths": bath_features,
    "referenceBaths": bath_ref_features,
}

viz_data(bath_features)

In [144]:
# square footage
df['total_finished_basement_sqft'] = df['BsmtFinSF1'] + df["BsmtFinSF2"]
df['total_living_sqft'] = df['total_finished_basement_sqft'] + df['GrLivArea']
df['total_sqft'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

sqft_ref_features = ['BsmtFinSF1', "BsmtFinSF2", 'GrLivArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF']
sqft_features = ['total_finished_basement_sqft', 'total_living_sqft', 'total_sqft']

transfer_info["sqftGroup"] = {
    "newSQFT": sqft_features,
    "referenceSQFT": bath_ref_features,
}

viz_data(sqft_features)

In [145]:
# porch/deck 
df['outdoor_living_sqft'] = df['WoodDeckSF'] + df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']

outdoor_ref_features =  ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
outdoor_features = ['outdoor_living_sqft']

transfer_info["outdoorGroup"] = {
    "newOutdoor": outdoor_features,
    "referenceOutdoor": outdoor_ref_features,
}

viz_data(outdoor_features)

In [146]:
# simplify dummies ("NA", "None")
dummies_ref_features = ['Alley', 'MasVnrType', "BsmtQual", "FireplaceQu", "GarageType", "PoolQC", "Fence", "MiscFeature"]

simple_dummies = []
for dummies in dummies_ref_features:
    new_name = "{}_simple_dummy".format(dummies)
    df[new_name] = df[dummies].map(lambda x: 0 if x == "Missing" else 1)
    simple_dummies.append(new_name)
    
transfer_info["missingGroup"]["newMissing"] = simple_dummies

In [147]:
def normalize(df, col):
    """
    This function will normalize a give column.
    """
    col_min = df[col].min()
    col_max = df[col].max()
    
    new_col = []
    for value in df[col]:
        try:
            new_value = (value - col_min)/(col_max - col_min)
        except ZeroDivisionError:
            new_value = 0
            
        new_col.append(new_value)
            
    return new_col

In [148]:
def normalize_df(df):
    """
    this function will convert some numeric features to strings then normalize the entire submitted dataframe.
    """
    num_feat = list(df.select_dtypes(exclude='object'))

    for feature in num_feat:
        df[feature] = normalize(df, feature)

    return df

In [149]:
# Data Cleaning

# filter to "normal sales"
df = df[df["SaleCondition"] == 'Normal']
df.drop(columns="SaleCondition", inplace=True)

# filter to only Residential homes
df = df[(df['MSZoning'] != 'A') |
       (df['MSZoning'] != 'C') |
       (df['MSZoning'] != 'FV') |
       (df['MSZoning'] != 'I')]

# convert to string so these features do not get normalized
df['MoSold'] = df['MoSold'].astype(str)
df['YrSold'] = df['YrSold'].astype(str)

# normalize data
df = normalize_df(df)

print(df.shape)
df.head()

(1198, 97)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,total_sqft,outdoor_living_sqft,Alley_simple_dummy,MasVnrType_simple_dummy,BsmtQual_simple_dummy,FireplaceQu_simple_dummy,GarageType_simple_dummy,PoolQC_simple_dummy,Fence_simple_dummy,MiscFeature_simple_dummy
0,0.0,2-STORY 1946 & NEWER,RL,0.207668,0.03342,Pave,Missing,Reg,Lvl,AllPub,...,0.347339,0.059396,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.000685,1-STORY 1946 & NEWER ALL STYLES,RL,0.255591,0.038795,Pave,Missing,Reg,Lvl,AllPub,...,0.340803,0.290166,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
2,0.001371,2-STORY 1946 & NEWER,RL,0.217252,0.046507,Pave,Missing,IR1,Lvl,AllPub,...,0.369125,0.040896,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
4,0.002742,2-STORY 1946 & NEWER,RL,0.268371,0.060576,Pave,Missing,IR1,Lvl,AllPub,...,0.468254,0.268744,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
5,0.003427,1-1/2 STORY FINISHED ALL AGES,RL,0.271565,0.059899,Pave,Missing,IR1,Lvl,AllPub,...,0.283847,0.379747,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


In [150]:
transfer_info["fullList"] = list(df)

In [151]:
# convert to JSON
transfer_info_json = json.dumps(transfer_info, indent=4)
print(transfer_info_json)

{
    "fullList": [
        "Id",
        "MSSubClass",
        "MSZoning",
        "LotFrontage",
        "LotArea",
        "Street",
        "Alley",
        "LotShape",
        "LandContour",
        "Utilities",
        "LotConfig",
        "LandSlope",
        "Neighborhood",
        "Condition1",
        "Condition2",
        "BldgType",
        "HouseStyle",
        "OverallQual",
        "OverallCond",
        "YearBuilt",
        "YearRemodAdd",
        "RoofStyle",
        "RoofMatl",
        "Exterior1st",
        "Exterior2nd",
        "MasVnrType",
        "MasVnrArea",
        "ExterQual",
        "ExterCond",
        "Foundation",
        "BsmtQual",
        "BsmtCond",
        "BsmtExposure",
        "BsmtFinType1",
        "BsmtFinSF1",
        "BsmtFinType2",
        "BsmtFinSF2",
        "BsmtUnfSF",
        "TotalBsmtSF",
        "Heating",
        "HeatingQC",
        "CentralAir",
        "Electrical",
        "1stFlrSF",
        "2ndFlrSF",
        "LowQualFinSF

In [152]:
# save json
json_file = open("feature_lists_updated.json", "w")
json_file.write(transfer_info_json)
json_file.close()

In [153]:
# export data for modeling
df.to_csv("Housing_Prices_Modeling.csv", index=False)