In [1]:
import numpy as np
import pandas as pd

from scipy import stats
import itertools
from sklearn import linear_model
from numpy import ones,vstack
from numpy.linalg import lstsq

In [2]:
# Load the raw Ames housing data; the first CSV column is read as the index.
df = pd.read_csv("./../data/Ames_Housing_Price_Data_raw.csv", index_col = 0)

In [3]:
# Replace the index with a fresh 0..n-1 range: the index read from the file is not
# unique (its values start over at 1 at certain points). drop=True discards the old index.
df = df.reset_index(drop = True)

# Type dictionaries

In [4]:
# Maps a column name to its variable-type label. Keys use the names the columns
# carry after the renames performed later in this notebook (maybe_/drop_ prefixes,
# _com/_ord/_lin/_log suffixes, LF_* indicator columns).
# NOTE(review): labels are not case-normalized -- the first section uses lowercase
# ('nominal', 'continuous', 'ordinal'), the later sections use capitalized labels
# ('Nominal', 'Continuous', 'Ordinal', 'Discrete'). Any consumer comparing labels
# must account for both spellings.
typedict = {'PID' : 'nominal',
            'SalePrice' : 'continuous',
            # Matt: lot, location, utilities
            'LotFrontage' : 'continuous', 
            'LotArea' : 'continuous',
            'maybe_LotShape' : 'nominal',
            'LandSlope' : 'nominal', 
            'LandContour' : 'nominal', 
            'maybe_MSZoning' : 'nominal', 
            'Street_paved' : 'nominal', 
            'Alley' : 'nominal',
            'Neighborhood' : 'nominal', 
            'drop_LotConfig' : 'nominal', 
            'drop_Condition1' : 'nominal', 
            'drop_Condition2' : 'nominal',
            'Foundation' : 'nominal',
            'Utilities' : 'nominal',
            'Heating' : 'nominal',
            # NOTE(review): labelled 'ordinal' although the _nom suffix suggests the
            # un-encoded (nominal) version of the rating -- confirm intended label.
            'HeatingQC_nom' : 'ordinal',
            'CentralAir' : 'nominal',
            'Electrical' : 'nominal',
            'HeatingQC_ord' : 'ordinal',
            'LotShape_com' : 'nominal',
            'MSZoning_com' : 'nominal',
            'LF_Normal' : 'nominal',
            'LF_Near_NS_RR' : 'nominal',
            'LF_Near_Positive_Feature' : 'nominal',
            'LF_Adjacent_Arterial_St' : 'nominal',
            'LF_Near_EW_RR' : 'nominal',
            'LF_Adjacent_Feeder_St' : 'nominal',
            # NOTE(review): 'Postive' looks like a misspelling of 'Positive' (see the
            # key three lines up); keep in sync with the values of Condition_dict,
            # which generate the LF_* column names.
            'LF_Near_Postive_Feature' : 'nominal',
            'Heating_com' : 'nominal',
            'Electrical_com' : 'nominal',
            'LotConfig_com' : 'nominal', 
            'LotFrontage_log' : 'continuous',
            'LotArea_log' : 'continuous',
            # Oren: amenities, quality ratings, exterior
            'MiscFeature': 'Nominal',
            'Fireplaces': 'Discrete',
            'FireplaceQu': 'Ordinal',
            'PoolQC': 'Ordinal',
            'PoolArea': 'Continuous',
            'PavedDrive': 'Nominal',
            'ExterQual': 'Ordinal',
            'OverallQual': 'Ordinal',
            'drop_OverallCond': 'Ordinal',
            'MiscVal': 'Continuous',
            'YearBuilt': 'Discrete',
            'YearRemodAdd': 'Discrete',
            'KitchenQual': 'Ordinal',
            'Fence': 'Ordinal',
            'RoofStyle': 'Nominal',
            'RoofMatl': 'Nominal',
            'maybe_Exterior1st': 'Nominal',
            'drop_Exterior2nd': 'Nominal',
            'drop_ExterCond': 'Ordinal',
            'maybe_MasVnrType': 'Nominal',
            'MasVnrArea': 'Continuous',
            # Mo: basement, porches/decks, garage
            # Basement
            'BsmtQual_ord': 'Ordinal',
            'BsmtCond_ord': 'Ordinal',
            'BsmtExposure_ord': 'Ordinal',
            'BsmtQual_ord_lin': 'Ordinal',
            'BsmtCond_ord_lin': 'Ordinal',
            'BsmtExposure_ord_lin': 'Ordinal',
            'TotalBsmtSF': 'Continuous',
            'BSMT_GLQ':'Continuous', 
            'BSMT_Rec':'Continuous',
            'maybe_BsmtUnfSF': 'Continuous',
            'maybe_BSMT_ALQ':'Continuous',
            'maybe_BSMT_BLQ':'Continuous', 
            'maybe_BSMT_LwQ':'Continuous', 
            'drop_BsmtQual': 'Nominal',
            'drop_BsmtCond': 'Nominal',
            'drop_BsmtExposure': 'Nominal',
            'drop_BsmtFinType1': 'Nominal',
            'drop_BsmtFinSF1': 'Continuous',
            'drop_BsmtFinType2': 'Nominal',
            'drop_BsmtFinSF2': 'Continuous',
            # Porches / decks
            'WoodDeckSF':'Continuous', 
            'OpenPorchSF':'Continuous', 
            'ScreenPorch':'Continuous',
            'maybe_EnclosedPorch':'Continuous',
            'maybe_3SsnPorch':'Continuous',
            # Garage
            'GarageFinish':'Nominal', 
            'GarageYrBlt':'Continuous',
            'GarageCars':'Ordinal',
            'GarageArea':'Continuous',
            'GarageType_con':'Nominal',
            'maybe_GarageQual':'Nominal', 
            'maybe_GarageCond':'Nominal',
            'drop_GarageType':'Nominal'
}

In [5]:
# Categorization of the original variables into thematic groups.
# The names listed here are the ORIGINAL column names from the raw file; several of
# them are renamed further down in this notebook (e.g. 'LotShape' -> 'maybe_LotShape',
# 'Street' -> 'Street_paved', 'HeatingQC' -> 'HeatingQC_nom'), so these lists only
# index df directly before those rename cells have run.
general=['PID','SalePrice']
lot_aspects=['LotFrontage','LotArea','LotShape','LandSlope','LandContour']
building_size=['MSSubClass','BldgType','HouseStyle','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd']
location=['MSZoning','Street','Alley','Neighborhood']
location_aspects=['LotConfig','Condition1','Condition2']
amenities=['MiscFeature','Fireplaces','FireplaceQu','PoolQC','PoolArea','PavedDrive']
garage=['GarageFinish','GarageType','GarageYrBlt','GarageCars','GarageArea','GarageQual','GarageCond']
decks=['WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch']
basement=['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinSF1','BsmtFinType2','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF']
utilities=['Foundation','Utilities','Heating','HeatingQC','CentralAir','Electrical']
quality_ratings=['ExterQual','OverallQual','OverallCond','MiscVal','YearBuilt','YearRemodAdd','KitchenQual','Fence','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','ExterCond','MasVnrType','MasVnrArea']
sales_aspect=['Functional','SaleCondition','SaleType','MoSold','YrSold']

# Matt

In [6]:
# Ordinalize heating quality ratings: Ex (5) down to Po (1), 0 for a missing rating.
HousingQC_dict={
       'Ex':5,
       'Gd':4,
       'TA':3,
       'Fa':2,
       'Po':1,
       # '0' is the placeholder written for missing ratings just below; without this
       # entry the lookup raises KeyError as soon as one rating is missing
       # (Cond_dict / master_dict carry the same entry).
       '0':0,
}

# Missing ratings become the '0' placeholder, then every rating is looked up strictly
# (an unexpected rating code still raises KeyError rather than passing silently).
df.loc[df['HeatingQC'].isna(),'HeatingQC']='0'
df['HeatingQC_ord']=df['HeatingQC'].map(lambda x: HousingQC_dict[x])
# Keep the original text rating next to the ordinal one under an explicit name.
df.rename(columns={'HeatingQC':'HeatingQC_nom'}, inplace=True)

In [7]:
# LotShape: IR2 (moderately irregular) and IR3 (irregular) are merged into a single
# 'Irregular' level because each has only a small sample.
lot_shape_dict = {
    'Reg': 'Regular',
    'IR1': 'Slightly irregular',
    'IR2': 'Irregular',
    'IR3': 'Irregular',
}
# Codes without an entry in the dictionary are passed through unchanged.
df['LotShape_com'] = df['LotShape'].map(lambda code: lot_shape_dict.get(code, code))
df.rename(columns={'LotShape': 'maybe_LotShape'}, inplace=True)

In [8]:
# LandSlope: Mod (moderate) and Sev (severe) are merged into 'Moderate-severe'
# because each has only a small sample.
land_slope_dict = {
    'Gtl': 'Gentle',
    'Mod': 'Moderate-severe',
    'Sev': 'Moderate-severe',
}
# Codes without an entry in the dictionary are passed through unchanged.
df['LandSlope'] = df['LandSlope'].map(lambda code: land_slope_dict.get(code, code))

In [9]:
# Alley: a missing value means the lot has no alley access, so label it explicitly,
# then spell out the remaining codes.
alley_dict = {
    'Pave': 'Paved',
    'Grvl': 'Gravel',
    'No alley access': 'No alley access',
}
df['Alley'] = (
    df['Alley']
    .fillna('No alley access')
    .map(lambda code: alley_dict.get(code, code))
)

In [10]:
# LandContour: replace the short codes with descriptive labels (no levels are merged).
LandContour_dict = {
    'Lvl': 'Level',
    'Bnk': 'Banked (rise from street level to building)',
    'HLS': 'Hillside (downward slope on both sides)',
    'Low': 'Depression (upward slope on both sides)',
}

# Codes without an entry in the dictionary are passed through unchanged.
df['LandContour'] = df['LandContour'].map(lambda code: LandContour_dict.get(code, code))

In [11]:
# MSZoning: C (all) (commercial), I (all) (industrial) and A (agr) (agricultural) are
# merged into 'Nonresidential' -- their samples are small and the focus is on
# residential sales.
MSZoning_dict = {
    'RL': 'Residential, low-density',
    'RM': 'Residential, medium-density',
    'FV': 'Residential, village',
    'RH': 'Residential, high-density',
    'C (all)': 'Nonresidential',
    'I (all)': 'Nonresidential',
    'A (agr)': 'Nonresidential',
}

# Codes without an entry in the dictionary are passed through unchanged.
df['MSZoning_com'] = df['MSZoning'].map(lambda code: MSZoning_dict.get(code, code))
df.rename(columns={'MSZoning': 'maybe_MSZoning'}, inplace=True)

In [12]:
# combine 'Near (within 200 ft)' and 'Adjacent to' into 'Near' for North-South RR, East-West RR, and positive features (parks, greenways, etc)
# renaming them LF_<factor> for Location Factor instead of condition to avoid confusion, as condition is also used to describe
# state of maintenance of various other features in the dataset
Condition_dict = {
    'Norm' : 'LF_Normal',
    'RRAn' : 'LF_Near_NS_RR',
    'PosN' : 'LF_Near_Positive_Feature',
    'Artery' : 'LF_Adjacent_Arterial_St',
    'RRAe' : 'LF_Near_EW_RR',
    'Feedr' : 'LF_Adjacent_Feeder_St',
    # Fixed spelling: this used to read 'LF_Near_Postive_Feature', which kept PosA in a
    # separate category from PosN instead of merging them as described above.
    # The 'LF_Near_Postive_Feature' key in typedict no longer matches a generated column.
    'PosA' : 'LF_Near_Positive_Feature',
    'RRNn' : 'LF_Near_NS_RR',
    'RRNe' : 'LF_Near_EW_RR'
}

# Codes without an entry in the dictionary are passed through unchanged.
df['Condition1'] = df['Condition1'].map(lambda x: Condition_dict.get(x, x))
df['Condition2'] = df['Condition2'].map(lambda x: Condition_dict.get(x, x))

In [13]:
def combine_condition_columns(df, factors):
    '''
    Add one indicator column per factor, built from "Condition1" and "Condition2".

    For every distinct value in `factors`, a column named after that value is
    written to `df`, holding the string '1' in rows where either "Condition1" or
    "Condition2" equals the factor and the string '0' otherwise.

    df: DataFrame with "Condition1" and "Condition2" columns; modified in place.
        Any index works (the computation is column-wise, not label-by-label).
    factors: iterable of factor labels; repeated labels are processed once.

    Returns the same DataFrame object that was passed in.
    '''
    first = df['Condition1']
    second = df['Condition2']
    # dict.fromkeys drops repeated labels while keeping first-seen order, so the
    # indicator columns are created in the same order as before.
    for factor in dict.fromkeys(factors):
        present = (first == factor) | (second == factor)
        # Indicators are stored as the strings '1' / '0', not as integers.
        df[f'{factor}'] = present.map({True: '1', False: '0'})
    return df

In [14]:
# Build one '0'/'1' indicator column per location-factor label.
# Condition_dict.values() repeats labels that several raw codes share.
loc_factors = Condition_dict.values()
df = combine_condition_columns(df, loc_factors)

In [15]:
# The indicator columns now carry this information, so flag both raw condition
# columns as candidates for dropping.
df.rename(
    columns={
        'Condition1': 'drop_Condition1',
        'Condition2': 'drop_Condition2',
    },
    inplace=True,
)

In [16]:
# Rename 'Street' to 'Street_paved' (values are left untouched).
df.rename(columns={'Street':'Street_paved'}, inplace=True)

In [17]:
# Utilities: relabel the two listed codes for clarity; any other value is kept as is.
Utilities_dict = {
    'AllPub': 'EGWS',
    'NoSewr': 'EGW with septic tank',
}

df['Utilities'] = df['Utilities'].map(lambda code: Utilities_dict.get(code, code))

In [18]:
# Heating: 'Gravity furnace', 'Other water/steam heating', 'Floor furnace' and
# 'Wall furnace' are merged into 'Other' because each has only a small sample.
Heating_dict = {
    'GasA': 'Gas-powered forced-air heating',
    'GasW': 'Gas-powered water/steam heating',
    'Grav': 'Other',
    'OthW': 'Other',
    'Floor': 'Other',
    'Wall': 'Other',
}

# Codes without an entry in the dictionary are passed through unchanged.
df['Heating_com'] = df['Heating'].map(lambda code: Heating_dict.get(code, code))

In [19]:
# Electrical: descriptive labels; FuseP and FuseF are merged because of their small
# samples -- they are also the 2 most undesirable electrical setups as reported by
# the data dictionary.
Electrical_dict = {
    'SBrkr': 'Standard circuit breakers, all Romex wiring',
    'FuseA': '>60 Amp fuse box, all Romex wiring',
    'FuseF': '60 Amp fuse box, Romex or older wiring',
    'FuseP': '60 Amp fuse box, Romex or older wiring',
}

# Codes without an entry in the dictionary are passed through unchanged.
df['Electrical_com'] = df['Electrical'].map(lambda code: Electrical_dict.get(code, code))

In [20]:
# LotConfig: FR2 (2 sides frontage) and FR3 (3 sides frontage) are merged into
# '2+ sides frontage' because each has only a small sample.
LotConfig_dict = {
    'Inside': 'Inside lot (1 side frontage)',
    'Corner': 'Corner lot',
    'CulDSac': 'Cul-de-sac lot',
    'FR2': '2+ sides frontage',
    'FR3': '2+ sides frontage',
}

# Codes without an entry in the dictionary are passed through unchanged.
df['LotConfig_com'] = df['LotConfig'].map(lambda code: LotConfig_dict.get(code, code))
df.rename(columns={'LotConfig': 'drop_LotConfig'}, inplace=True)

In [21]:
# Natural log of the lot frontage; rows with a missing frontage stay NaN.
df['LotFrontage_log'] = np.log(df['LotFrontage'])

In [22]:
# Natural log of the lot area.
df['LotArea_log'] = np.log(df['LotArea'])

# Oren

In [23]:
# Shared quality/condition scale: Ex (5) down to Po (1).
# Both 'NA' and '0' map to 0; '0' is the placeholder the following cell writes
# for missing ratings before looking them up.
Cond_dict={
       'Ex':5,
       'Gd':4,
       'TA':3,
       'Fa':2,
       'Po':1,
       'NA':0,
        '0':0
}

In [24]:
# Convert the text quality ratings to the numeric Cond_dict scale, one column at a
# time: missing ratings first become the '0' placeholder, then every value is
# looked up strictly (an unknown rating raises KeyError).
for quality_col in ('ExterQual', 'ExterCond', 'KitchenQual', 'FireplaceQu'):
    is_missing = df[quality_col].isna()
    df.loc[is_missing, quality_col] = '0'
    df[quality_col] = df[quality_col].map(lambda rating: Cond_dict[rating])

In [25]:
# PavedDrive: descriptive labels for the driveway surface codes.
Paved_Drive_Dict = {
    'Y': 'Paved',
    'P': 'Partial Pavement',
    'N': 'Dirt Gravel',
}
# The literal string 'NA' is kept as is; every other value must be a known code.
df['PavedDrive'] = df['PavedDrive'].map(
    lambda code: code if code == 'NA' else Paved_Drive_Dict[code]
)

# Fence: missing values are tagged 'NA' and end up labelled 'No Fence'.
Fence_Dict = {
    'GdPrv': 'Good Privacy',
    'MnPrv': 'Minimum Privacy',
    'GdWo': 'Good Wood',
    'MnWw': 'Minimum Wood/Wire',
    'NA': 'No Fence',
}
df['Fence'] = df['Fence'].fillna('NA')
df['Fence'] = df['Fence'].map(lambda code: Fence_Dict[code])

# MiscFeature: missing values are tagged 'NA' and end up labelled 'Nothing'.
Misc_Feature_Dict = {
    'Elev': 'Elevator',
    'Gar2': '2nd Garage',
    'Othr': 'Other',
    'Shed': 'Shed',
    'TenC': 'Tennis Court',
    'NA': 'Nothing',
}
df['MiscFeature'] = df['MiscFeature'].fillna('NA')
df['MiscFeature'] = df['MiscFeature'].map(lambda code: Misc_Feature_Dict[code])


# RoofStyle: descriptive labels for the roof style codes.
Roof_Style_Dict = {
    'Flat': 'Flat',
    'Gable': 'Gable',
    'Gambrel': 'Gabrel Barn',
    'Hip': 'Hip',
    'Mansard': 'Mansard',
    'Shed': 'Shed',
}
# The literal string 'NA' is kept as is; every other value must be a known code.
df['RoofStyle'] = df['RoofStyle'].map(
    lambda code: code if code == 'NA' else Roof_Style_Dict[code]
)

# RoofMatl: descriptive labels for the roof material codes.
Roof_Matl_Dict = {
    'ClyTile': 'Clay or Tile',
    'CompShg': 'Standard (Composite) Shingle',
    'Membran': 'Membrane',
    'Metal': 'Metal',
    'Roll': 'Roll',
    'Tar&Grv': 'Gravel & Tar',
    'WdShake': 'Wood Shakes',
    'WdShngl': 'Wood Shingles',
}
df['RoofMatl'] = df['RoofMatl'].map(
    lambda code: code if code == 'NA' else Roof_Matl_Dict[code]
)
    
# Exterior covering: one shared lookup for Exterior1st and Exterior2nd. Several
# spelling variants of the same code are listed and map to the same label
# (CemntBd/CmentBd, WdShing/Wd Shng, BrkComm/Brk Cmn).
Exterior_Dict = {
    'AsbShng': 'Asbestos Shingles',
    'AsphShn': 'Asphalt Shingles',
    'BrkComm': 'Brick Common',
    'BrkFace': 'Brick Face',
    'CBlock': 'Cinder Block',
    'CemntBd': 'Cement Board',
    'CmentBd': 'Cement Board',
    'HdBoard': 'Hard Board',
    'ImStucc': 'Imitation Stucco',
    'MetalSd': 'Metal Siding',
    'Other': 'Other',
    'Plywood': 'Plywood',
    'PreCast': 'PreCast',
    'Stone': 'Stone',
    'Stucco': 'Stucco',
    'VinylSd': 'Vinyl Siding',
    'Wd Sdng': 'Wood Siding',
    'WdShing': 'Wood Shingles',
    'Wd Shng': 'Wood Shingles',
    'Brk Cmn': 'Brick Common',
}
# The literal string 'NA' is kept as is; every other value must be a known code.
for exterior_col in ('Exterior1st', 'Exterior2nd'):
    df[exterior_col] = df[exterior_col].map(
        lambda code: code if code == 'NA' else Exterior_Dict[code]
    )

# MasVnrType: missing values are treated as 'None' (the string), then codes are
# replaced by descriptive labels.
Mas_Vnr_Type_Dict = {
    'BrkCmn': 'Brick Common',
    'BrkFace': 'Brick Face',
    'CBlock': 'Cinder Block',
    'None': 'None',
    'Stone': 'Stone',
}
df['MasVnrType'] = df['MasVnrType'].fillna('None')
# The literal string 'NA' is kept as is; every other value must be a known code.
df['MasVnrType'] = df['MasVnrType'].map(
    lambda code: code if code == 'NA' else Mas_Vnr_Type_Dict[code]
)

In [26]:
# Flag columns by prefix: 'drop_' = recommended to drop, 'maybe_' = undecided.
df.rename(
    columns=dict(
        OverallCond='drop_OverallCond',
        ExterCond='drop_ExterCond',
        Exterior2nd='drop_Exterior2nd',
        MasVnrType='maybe_MasVnrType',
        Exterior1st='maybe_Exterior1st',
    ),
    inplace=True,
)

# Hao-Wei

# Mo

In [27]:
def variable_selection(x, y=None):
    '''
    Exhaustively search column subsets of x with an ordinary linear regression.

    For every subset size from 1 up to and including ALL columns of x, a
    LinearRegression is fitted on each combination of that many columns and scored
    on the same data it was fitted on. For each size, two lines are printed: the
    best score, then the tuple of column names that achieved it. Ties go to the
    first combination in itertools.combinations order.

    x: DataFrame of candidate predictor columns.
    y: target values aligned with x; defaults to df['SalePrice'] from the
       notebook-level DataFrame.

    Returns None. The number of fits grows as 2**len(x.columns), so keep x narrow.
    '''
    if y is None:
        y = df['SalePrice']

    lm = linear_model.LinearRegression()
    candidates = list(x.columns)

    # +1 so the subset containing every column is evaluated as well
    # (and a single-column frame produces a result instead of nothing).
    for size in range(1, len(candidates) + 1):
        best_score = None
        best_subset = None

        for subset in itertools.combinations(candidates, size):
            features = x[list(subset)]
            lm.fit(features, y)
            score = lm.score(features, y)
            if best_score is None or score > best_score:
                best_score, best_subset = score, subset

        print(best_score)
        print(best_subset)

In [28]:
#helper: rescale an ordinal variable onto a 0-1 scale based on the mean sale price of each level

def linarization_func(var_name, data=None):
    '''
    Add a column "<var_name>_lin" that re-spaces an ordinal variable by sale price.

    The smallest level of the variable is mapped to 0 and the largest level to 1.
    Every other level is placed where its mean SalePrice falls on the straight line
    running from the lowest per-level mean price (position 0) to the highest
    per-level mean price (position 1).

    var_name: name of the ordinal column, as a string.
    data: DataFrame holding that column and 'SalePrice'; defaults to the
          notebook-level DataFrame named df. It is modified in place.

    Returns None.
    '''
    frame = df if data is None else data

    # mean sale price per level of the variable
    level_means = frame[['SalePrice', f'{var_name}']].groupby(f'{var_name}').agg('mean')['SalePrice']

    # straight line through (0, lowest mean) and (1, highest mean): price = m * pos + c
    points = [(0, min(level_means)), (1, max(level_means))]
    x_coords, y_coords = zip(*points)
    A = vstack([x_coords, ones(len(x_coords))]).T
    m, c = lstsq(A, y_coords, rcond=None)[0]

    levels = frame[f'{var_name}'].unique()

    # The end levels are pinned to 0 and 1 regardless of their own mean price;
    # with a single level, the second assignment wins and that level maps to 1.
    position = {}
    position[min(levels)] = 0
    position[max(levels)] = 1

    # every remaining level: invert the line to get the position of its mean price
    for level in levels:
        if level not in position:
            position[level] = (level_means.loc[level] - c) / m

    # strict lookup: every value in the column has an entry by construction
    frame[f'{var_name}_lin'] = frame[f'{var_name}'].map(lambda x: position[x])


Basement

In [29]:
# Quality/condition scale for the basement ratings: Ex (5) down to Po (1).
# Both 'NA' and '0' map to 0; '0' is the placeholder written for missing values
# before the lookup. Same content as Cond_dict defined earlier in the notebook.
master_dict={
       'Ex':5,
       'Gd':4,
       'TA':3,
       'Fa':2,
       'Po':1,
       'NA':0,
        '0':0
}

# Scale for basement exposure: Gd (4), Av (3), Mn (2), No (1).
# Both 'NA' and '0' map to 0, as above.
exp_dict={
       'Gd':4,
       'Av':3,
       'Mn':2,
       'No':1,
       'NA':0,
        '0':0
}

In [30]:
#turn the three basement ratings into ordinal columns on an evenly spaced scale;
#the original column is kept under a 'drop_' name, missing ratings become level 0
for source_col, scale in (
    ('BsmtCond', master_dict),
    ('BsmtQual', master_dict),
    ('BsmtExposure', exp_dict),
):
    ordinal_col = f'{source_col}_ord'
    df[ordinal_col] = df[source_col]
    df.rename(columns={source_col: f'drop_{source_col}'}, inplace=True)
    df.loc[df[ordinal_col].isna(), ordinal_col] = '0'
    df[ordinal_col] = df[ordinal_col].map(lambda rating: scale[rating])

#add a price-based "_lin" version of each of the three ordinal columns
for ordinal_col in ('BsmtCond_ord', 'BsmtQual_ord', 'BsmtExposure_ord'):
    linarization_func(ordinal_col)

#one area column per finished-basement type, merging the type-1 and type-2 slots
#('Unf' and missing types get no column: unfinished area has its own column already)
for finish_type in ('GLQ', 'ALQ', 'BLQ', 'LwQ', 'Rec'):
    # area from each slot where that slot has this finish type, 0 elsewhere
    area_slot1 = df['BsmtFinSF1'].where(df['BsmtFinType1'] == finish_type, 0)
    area_slot2 = df['BsmtFinSF2'].where(df['BsmtFinType2'] == finish_type, 0)
    # Add the two slots: when both slots carry the same finish type, the previous
    # assignment-based version kept only the slot-2 area and lost the slot-1 area.
    df[f'BSMT_{finish_type}'] = area_slot1 + area_slot2

#the raw type/area columns are now redundant: flag them for dropping
df.rename(columns = {'BsmtFinType1': 'drop_BsmtFinType1','BsmtFinSF1': 'drop_BsmtFinSF1','BsmtFinType2': 'drop_BsmtFinType2','BsmtFinSF2': 'drop_BsmtFinSF2'}, inplace=True)

#missing basement areas are treated as 0
df.loc[df['TotalBsmtSF'].isna(),'TotalBsmtSF']=0
df.loc[df['BsmtUnfSF'].isna(),'BsmtUnfSF']=0

#further columns flagged 'maybe_' (candidates to drop), based on them not having any effect by themselves on predicting sales prices
#('BsmtExposure' is not listed: it was already renamed to 'drop_BsmtExposure' above, so an entry for it would never match)
df.rename(columns = {'BsmtUnfSF': 'maybe_BsmtUnfSF','BSMT_ALQ': 'maybe_BSMT_ALQ','BSMT_BLQ': 'maybe_BSMT_BLQ','BSMT_LwQ': 'maybe_BSMT_LwQ'}, inplace=True)


Porches / Decks

In [31]:
#flag the enclosed and three-season porch areas with the 'maybe_' prefix (values unchanged)
df.rename(columns = {'EnclosedPorch': 'maybe_EnclosedPorch','3SsnPorch': 'maybe_3SsnPorch'}, inplace=True)


Garage

In [32]:
#consolidate Garage Types based on better predicitve power and low impact of other types
garagetype={
   'Detchd':'Detchd', 
    'Attchd':'Attchd', 
    'BuiltIn':'BuiltIn', 
    'Basment':'Detchd',  
    '2Types':'Detchd', 
    'CarPort':'Detchd',
    '0':'0'
}

In [33]:
#consolidated garage type: missing values become the '0' placeholder, then every
#value is looked up strictly in garagetype (an unknown type raises KeyError)
df['GarageType_con'] = df['GarageType'].fillna('0').map(lambda kind: garagetype[kind])

#the raw type is superseded by GarageType_con; GarageCond and GarageQual carry
#basically no value (almost all values are consolidated in one status)
df.rename(
    columns={
        'GarageType': 'drop_GarageType',
        'GarageCond': 'maybe_GarageCond',
        'GarageQual': 'maybe_GarageQual',
    },
    inplace=True,
)

#keep year, area, Finish, cars as is, all have strong predictive power and do not seem to allow for easy consolidation


In [34]:
# Show the transformed frame (the notebook renders a truncated preview).
df

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,maybe_MSZoning,LotFrontage,LotArea,Street_paved,Alley,maybe_LotShape,...,BsmtExposure_ord,BsmtCond_ord_lin,BsmtQual_ord_lin,BsmtExposure_ord_lin,BSMT_GLQ,maybe_BSMT_ALQ,maybe_BSMT_BLQ,maybe_BSMT_LwQ,BSMT_Rec,GarageType_con
0,909176150,856,126000,30,RL,,7890,Pave,No alley access,Reg,...,1,0.770108,0.232877,0.338552,0.0,0.0,0.0,0.0,238.0,Detchd
1,905476230,1049,139500,120,RL,42.0,4235,Pave,No alley access,Reg,...,2,0.770108,0.478677,0.486626,552.0,393.0,0.0,0.0,0.0,Attchd
2,911128020,1001,124900,30,C (all),60.0,6060,Pave,No alley access,Reg,...,1,0.770108,0.232877,0.338552,0.0,737.0,0.0,0.0,0.0,Detchd
3,535377150,1039,114000,70,RL,80.0,8146,Pave,No alley access,Reg,...,1,0.770108,0.117665,0.338552,0.0,0.0,0.0,0.0,0.0,Detchd
4,534177230,1665,227000,60,RL,70.0,8400,Pave,No alley access,Reg,...,1,0.770108,0.478677,0.338552,643.0,0.0,0.0,0.0,0.0,Attchd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575,903205040,952,121000,30,RL,,8854,Pave,No alley access,Reg,...,1,0.770108,0.232877,0.338552,0.0,0.0,0.0,0.0,0.0,Detchd
2576,905402060,1733,139600,20,RL,,13680,Pave,No alley access,IR1,...,0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,Attchd
2577,909275030,2002,145000,90,RH,82.0,6270,Pave,No alley access,Reg,...,1,0.770108,0.232877,0.338552,0.0,0.0,284.0,0.0,0.0,Detchd
2578,907192040,1842,217500,60,RL,,8826,Pave,No alley access,Reg,...,1,0.770108,0.478677,0.338552,841.0,0.0,0.0,0.0,0.0,Attchd
