# NOTEBOOK 2: PREPROCESSING

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import csv
import pickle

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

%matplotlib inline

In [2]:
df = pd.read_csv('../data/train_eda.csv', index_col='Id')

## Data Cleaning / Converting Null Values

In [3]:
df.drop(['PID'], axis=1, inplace=True)

Dropping the PID feature from our df.

Dropping Mo Sold and Yr Sold.

In [4]:
df.drop(['Mo Sold','Yr Sold'], axis=1, inplace=True)

Dropping Pool QC feature

In [5]:
df.drop(['Pool QC'], axis=1, inplace=True)

Cross-checking NaNs for 'Misc Feature' with 'Misc Val' to ensure NaNs correspond to 0 values. Since these features are colinear we will fill NaNs according to the existing feature distributions and encode misc feature existance in a separate column,

In [6]:
df[['Misc Feature','Misc Val']][(df['Misc Feature'].isna() == True) & (df['Misc Val'] != 0)]

Unnamed: 0_level_0,Misc Feature,Misc Val
Id,Unnamed: 1_level_1,Unnamed: 2_level_1


Since all NaNs are unrepresented in Misc Val we know these are homes without a misc feature.

In [7]:
df['Misc Feature'].value_counts()

Shed    55
Gar2     4
Othr     3
TenC     1
Elev     1
Name: Misc Feature, dtype: int64

Rename Misc Feature to Misc Feature Type

In [8]:
df['Misc Feature'].fillna('none',inplace=True)

In [9]:
df['Misc Feature'].value_counts()

none    1985
Shed      55
Gar2       4
Othr       3
TenC       1
Elev       1
Name: Misc Feature, dtype: int64

Since Alley is a unique feature converting NaNs to 'None' would be the most appropriate to prepare for one-hot encoding.

In [10]:
df['Alley'].fillna(value='none', inplace=True)

Converting NaNs to 'None' in Fence to prepare for one-hot encoding.

In [11]:
df['Fence'].fillna(value='none', inplace=True)

Since Fireplace Qu is colinear with Fireplaces converting NaNs to 'none' would be most appropriate. 

In [12]:
df['Fireplace Qu'].value_counts()

Gd    523
TA    405
Fa     59
Ex     31
Po     31
Name: Fireplace Qu, dtype: int64

In [13]:
df['Fireplace Qu'].fillna('none', inplace=True)

Reviewing NaNs for Lot Frontage before converting to 0.0.

In [14]:
df[df['Lot Frontage'].isna()==True].head()

Unnamed: 0_level_0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,...,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Fence,Misc Feature,Misc Val,Sale Type,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,60,RL,,13517,Pave,none,IR1,Lvl,AllPub,CulDSac,...,44,0,0,0,0,none,none,0,WD,130500
145,20,RL,,12160,Pave,none,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,MnPrv,none,0,COD,142000
1942,20,RL,,15783,Pave,none,Reg,Lvl,AllPub,Inside,...,324,0,0,0,0,MnPrv,Shed,400,WD,112500
12,20,RL,,7980,Pave,none,IR1,Lvl,AllPub,Inside,...,21,0,0,0,0,GdPrv,Shed,500,WD,185000
1534,50,RL,,11700,Pave,Grvl,IR1,HLS,AllPub,Inside,...,40,0,0,0,0,none,none,0,WD,198000


In [15]:
df['Lot Frontage'].fillna(value=0.0, inplace=True)

Converting NaNs to 'None' or 0 for Garage Cond / Garage Qual / Garage Yr Blt / Garage Finish / Garage Type after confirming correct values for Garage Type.

In [16]:
df[['Garage Cond','Garage Type']][(df['Garage Cond'].isna()==True) | (df['Garage Type'].isna()==True)].tail(20)

Unnamed: 0_level_0,Garage Cond,Garage Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1132,,
2290,,
853,,
2237,,Detchd
2671,,
898,,
2496,,
2920,,
894,,
2837,,


Values differ for observation 2237. Will pull up by index to crosscheck all features.

In [17]:
df.loc[2237,['Garage Cond','Garage Qual','Garage Yr Blt','Garage Finish','Garage Type']]

Garage Cond         NaN
Garage Qual         NaN
Garage Yr Blt       NaN
Garage Finish       NaN
Garage Type      Detchd
Name: 2237, dtype: object

Since no information is given for any of the other features, we will assume that Detchd was a data entry error and correct all features to 'None' for categorical features and 0 for Garage Yr Blt.

In [18]:
df.loc[2237,'Garage Type'] = 'none'

In [19]:
df['Garage Type'].isna().sum()

113

Determining how to handle missing values for 'Garage Yr Blt'. Since homes with no garage are already encoded in 'Garage Type' 0 would be uninformative and non-sensical. The most reasonable fill value would be the year built. Checking for overlap.

In [20]:
(df['Year Built'] != df['Garage Yr Blt']).sum()/2051

0.23744514870794733

About ~24% of observations do not align, however this is still our best option.

In [21]:
df['Garage Cond'].fillna(value='none', inplace=True)
df['Garage Qual'].fillna(value='none', inplace=True)
df['Garage Yr Blt'].fillna(value=df['Year Built'], inplace=True)
df['Garage Finish'].fillna(value='none', inplace=True)
df['Garage Type'].fillna(value='none', inplace=True)

Confirming correct output for imputed Garage Yr Blt. Will convert from float to int.

In [22]:
df[['Garage Yr Blt','Year Built']].head()

Unnamed: 0_level_0,Garage Yr Blt,Year Built
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
109,1976.0,1976
544,1997.0,1996
153,1953.0,1953
318,2007.0,2006
255,1957.0,1900


In [23]:
df['Garage Yr Blt'] = df['Garage Yr Blt'].astype(int)

Converting NaNs to 'None' for Bsmt Exposure / BsmtFin Type 2 / Bsmt Qual / BsmtFin Type 1 / Bsmt Cond after crosschecking values.

In [24]:
df[['Bsmt Exposure',
    'BsmtFin Type 2',
    'Bsmt Qual',
    'BsmtFin Type 1',
    'Bsmt Cond']
  ][(df['Bsmt Exposure'].isna()==True) | (df['Bsmt Qual'].isna()==True)].tail(10)

Unnamed: 0_level_0,Bsmt Exposure,BsmtFin Type 2,Bsmt Qual,BsmtFin Type 1,Bsmt Cond
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
815,,,,,
1899,,,,,
2004,,,,,
2627,,,,,
2625,,,,,
2279,,,,,
810,,,,,
2338,,,,,
2780,,Unf,Gd,Unf,TA
2880,,,,,


Since 4/5 features represented for observations 1797, 67, 2780 we will infer that NaN for 'Bsmt Exposure' is a data entry error and replace with 'No'.

In [25]:
df.loc[1797,'Bsmt Exposure'] = 'No'
df.loc[67,'Bsmt Exposure'] = 'No'
df.loc[2780,'Bsmt Exposure'] = 'No'

In [26]:
df[['Bsmt Exposure',
    'BsmtFin Type 2',
    'Bsmt Qual',
    'BsmtFin Type 1',
    'Bsmt Cond']
  ][(df['BsmtFin Type 2'].isna()==True) | (df['Bsmt Qual'].isna()==True)].head()

Unnamed: 0_level_0,Bsmt Exposure,BsmtFin Type 2,Bsmt Qual,BsmtFin Type 1,Bsmt Cond
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
807,,,,,
811,,,,,
781,,,,,
888,,,,,
1554,,,,,


Values differ for observation 445. Since 4/5 features represented, we will consider NaN a data entry error. Will pull up entry to determine how to replace NaN.

In [27]:
df.loc[445,['Bsmt Exposure',
            'BsmtFin Type 2',
            'Bsmt Qual',
            'BsmtFin Type 1',
            'Bsmt Cond',
            'BsmtFin SF 1',
            'BsmtFin SF 2',
            'Bsmt Unf SF', 
            'Total Bsmt SF']
      ]

Bsmt Exposure       No
BsmtFin Type 2     NaN
Bsmt Qual           Gd
BsmtFin Type 1     GLQ
Bsmt Cond           TA
BsmtFin SF 1      1124
BsmtFin SF 2       479
Bsmt Unf SF       1603
Total Bsmt SF     3206
Name: 445, dtype: object

Since this observation shows Bsmt Unf SF = 1603 we know that BsmtFin Type 2 should be Unf.

In [28]:
df.loc[445,'BsmtFin Type 2'] = 'Unf'

Confirming that all categories are now aligned before converting NaNs to 'None'.

In [29]:
df[['Bsmt Exposure','BsmtFin Type 2','Bsmt Qual','BsmtFin Type 1','Bsmt Cond']].isna().sum()

Bsmt Exposure     55
BsmtFin Type 2    55
Bsmt Qual         55
BsmtFin Type 1    55
Bsmt Cond         55
dtype: int64

Since we want to preserve information on whether or not a basement exists we will create a binary category Basement where nulls encode homes with no basement.

In [30]:
df['Basement'] = df['Bsmt Qual'].map(lambda x: 1 if type(x)==str else 0)

In [31]:
df[['Basement','Bsmt Exposure','BsmtFin Type 2','Bsmt Qual','BsmtFin Type 1','Bsmt Cond']].head()

Unnamed: 0_level_0,Basement,Bsmt Exposure,BsmtFin Type 2,Bsmt Qual,BsmtFin Type 1,Bsmt Cond
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
109,1,No,Unf,TA,GLQ,TA
544,1,No,Unf,Gd,GLQ,TA
153,1,No,Unf,TA,GLQ,TA
318,1,No,Unf,Gd,Unf,TA
255,1,No,Unf,Fa,Unf,Gd


Filling nulls

In [32]:
df['Bsmt Exposure'].fillna(value='none', inplace=True)
df['BsmtFin Type 2'].fillna(value='none', inplace=True)
df['Bsmt Qual'].fillna(value='none', inplace=True)
df['BsmtFin Type 1'].fillna(value='none', inplace=True)
df['Bsmt Cond'].fillna(value='none', inplace=True)

Confirming NaNs for 'Mas Vnr Type' and 'Mas Vnr Area' overlap before converting to 'None' or 0. 

In [33]:
df[['Mas Vnr Type','Mas Vnr Area']][(df['Mas Vnr Type'].isna()==True) | (df['Mas Vnr Area'].isna()==True)]

Unnamed: 0_level_0,Mas Vnr Type,Mas Vnr Area
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
2393,,
2383,,
539,,
518,,
2824,,
1800,,
1455,,
1120,,
1841,,
1840,,


Since these completely overlap we can safely conclude these NaNs represent homes with no vaneer. Converting to 'none' or 0.

In [34]:
df['Mas Vnr Type'].fillna(value='none', inplace=True)
df['Mas Vnr Area'].fillna(value=0, inplace=True)

Confirming NaNs for Bsmt Half Bath and Bsmt Full Bath correspond to homes with no basement by comparing to Total Bsmt SF, Bsmt Unf SF, BsmtFin SF 1 and BsmtFin SF 2.

In [35]:
df[['Bsmt Half Bath',
    'Bsmt Full Bath',
    'Total Bsmt SF',
    'Bsmt Unf SF', 
    'BsmtFin SF 1', 
    'BsmtFin SF 2']
  ][(df['Bsmt Half Bath'].isna()==True) | (df['Bsmt Full Bath'].isna()==True)]

Unnamed: 0_level_0,Bsmt Half Bath,Bsmt Full Bath,Total Bsmt SF,Bsmt Unf SF,BsmtFin SF 1,BsmtFin SF 2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1498,,,0.0,0.0,0.0,0.0
1342,,,,,,


Since Total Bsmt SF, Bsmt Unf SF, BsmtFin SF 1, and BsmtFin SF 2 are 0 or NaN for both observations we can assume these homes have no basement and therefore no bathrooms. Filling all NaNs with 0 or 0.0 since discrete or continuous numeric categories, resp.

In [36]:
df['Bsmt Half Bath'].fillna(value=0, inplace=True)
df['Bsmt Full Bath'].fillna(value=0, inplace=True)
df['Total Bsmt SF'].fillna(value=0.0, inplace=True)
df['Bsmt Unf SF'].fillna(value=0.0, inplace=True)
df['BsmtFin SF 1'].fillna(value=0.0, inplace=True)
df['BsmtFin SF 2'].fillna(value=0.0, inplace=True)

Combining Bsmt Half Bath and Bsmt Full Bath to a single category Bsmt Bath before dropping both features.

In [37]:
df['Bsmt Baths']=(df['Bsmt Half Bath']*0.5)+df['Bsmt Full Bath']

In [38]:
df[['Bsmt Baths','Bsmt Half Bath','Bsmt Full Bath']].head()

Unnamed: 0_level_0,Bsmt Baths,Bsmt Half Bath,Bsmt Full Bath
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
109,0.0,0.0,0.0
544,1.0,0.0,1.0
153,1.0,0.0,1.0
318,0.0,0.0,0.0
255,0.0,0.0,0.0


In [39]:
df.drop(['Bsmt Half Bath','Bsmt Full Bath'], axis=1, inplace=True)

Reviewing Functional feature.

In [40]:
df['Functional'].value_counts()

Typ     1913
Min1      42
Min2      42
Mod       29
Maj1      12
Maj2       7
Sev        2
Sal        2
Name: Functional, dtype: int64

Maj1, Maj2, Sal, and Sev classifications likely have too few values for this model to base accurate predictions off of. These observations will likely be excluded. Likely the Sev values capture the observed SalePrice min value. Can determine impact from plotting categories against sale price.

In [41]:
df[(df['Functional']=='Sev') | (df['Functional']=='Sal')]

Unnamed: 0_level_0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,...,3Ssn Porch,Screen Porch,Pool Area,Fence,Misc Feature,Misc Val,Sale Type,SalePrice,Basement,Bsmt Baths
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
754,90,RM,50.0,3000,Pave,Grvl,Reg,Bnk,AllPub,Inside,...,0,0,0,none,none,0,WD,62500,1,0.0
1554,20,A (agr),80.0,14584,Pave,none,Reg,Low,AllPub,Inside,...,0,0,0,none,none,0,WD,13100,0,0.0
2044,50,RM,60.0,10320,Pave,Grvl,Reg,Lvl,AllPub,Corner,...,0,84,0,none,none,0,COD,50000,1,0.0
1916,60,RL,0.0,18450,Pave,none,IR1,Lvl,AllPub,Inside,...,0,0,0,none,none,0,WD,129000,1,0.0


In [42]:
df[df['SalePrice']==12789.0]

Unnamed: 0_level_0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,...,3Ssn Porch,Screen Porch,Pool Area,Fence,Misc Feature,Misc Val,Sale Type,SalePrice,Basement,Bsmt Baths
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
182,30,RM,68.0,9656,Pave,none,Reg,Lvl,AllPub,Inside,...,0,0,0,none,none,0,WD,12789,1,0.0


Dropping this observation since this is an extreme outlier. 

In [43]:
df.drop(182,axis=0, inplace=True)

Also dropping observation 1554 since this is an extreme outlier.

In [44]:
df.drop(1554,axis=0,inplace=True)

Confirming NaNs for Garage Cars and Garage Area align with other Garage Features before converting to 0 or 0.0.

In [45]:
df[['Garage Cars', 
    'Garage Area', 
    'Garage Qual', 
    'Garage Cond',
    'Garage Yr Blt']
  ][(df['Garage Cars'].isna()==True) | (df['Garage Area'].isna()==True)]

Unnamed: 0_level_0,Garage Cars,Garage Area,Garage Qual,Garage Cond,Garage Yr Blt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2237,,,none,none,1923


In [46]:
df['Garage Cars'].fillna(value=0, inplace=True)
df['Garage Area'].fillna(value=0.0, inplace=True)

Addressing the possible data entry error identified by the max Garage Yr Blt = 2207.

In [47]:
df[df['Garage Yr Blt']==2207]

Unnamed: 0_level_0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,...,3Ssn Porch,Screen Porch,Pool Area,Fence,Misc Feature,Misc Val,Sale Type,SalePrice,Basement,Bsmt Baths
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2261,20,RL,68.0,8298,Pave,none,IR1,HLS,AllPub,Inside,...,0,0,0,none,none,0,New,267300,1,0.0


In [48]:
df.loc[2261,['Garage Yr Blt', 'Year Built', 'Year Remod/Add']]

Garage Yr Blt     2207
Year Built        2006
Year Remod/Add    2007
Name: 2261, dtype: object

Identifying the home with Garage Yr Blt = 2207. Considering the Year Built is 2006 and the Year Remod/Add is 2007, it's likely this was also the year the garage was built and 2207 was a data entry error and should be 2007.

In [49]:
df.loc[2261, 'Garage Yr Blt'] = 2007
df.loc[2261, 'Garage Yr Blt']

2007

Confirming that all NaNs have now been corrected.

In [50]:
df.isna().sum().sum()

0

# Feature Engineering

Converting Y/N features to binary based on Data Dictionary descriptions.

In [51]:
df['Central Air'] = df['Central Air'].map({'N':0, 'Y':1})

Since street only has two categories converting to Paved and encoding as binary.

In [52]:
df['Paved Street'] = df['Street'].map({'Grvl':0, 'Pave':1})

In [53]:
df.drop('Street', axis=1, inplace=True)

Reviewing number of entries for Paved Drive for possible consolidation into a binary feature.

In [54]:
df['Paved Drive'].value_counts()

Y    1859
N     149
P      39
Name: Paved Drive, dtype: int64

Since there are only 39 Partially paved observations we will combine with No and make binary.

In [55]:
df['Paved Drive'] = df['Paved Drive'].map({'N':0,'P':0,'Y':1})

Determining if Land Contour and Land Slope encode the same information.

In [56]:
df['Land Contour'].value_counts()

Lvl    1842
HLS      85
Bnk      80
Low      40
Name: Land Contour, dtype: int64

In [57]:
df['Land Slope'].value_counts()

Gtl    1952
Mod      87
Sev       8
Name: Land Slope, dtype: int64

In [58]:
df[['Land Contour','Land Slope']].head()

Unnamed: 0_level_0,Land Contour,Land Slope
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
109,Lvl,Gtl
544,Lvl,Gtl
153,Lvl,Gtl
318,Lvl,Gtl
255,Lvl,Gtl


Land Contour and Land Slope are very similar, one feature can likely be excluded. Can understand better after plotting relationship with SalePrice.

Determining if Garage Qual and Garage Cond can be combined.

In [59]:
df['Garage Qual'].value_counts()

TA      1830
none     114
Fa        80
Gd        18
Ex         3
Po         2
Name: Garage Qual, dtype: int64

In [60]:
df['Garage Cond'].value_counts()

TA      1866
none     114
Fa        46
Gd        12
Po         7
Ex         2
Name: Garage Cond, dtype: int64

In [61]:
(df['Garage Qual']!=df['Garage Cond']).sum()/2051

0.046318868844466114

Garage Qual and Garage Cond are very similar, only ~4.6% of the observations do not align. Will review by plotting against SalePrice.

In [62]:
df[['Garage Qual','Garage Cond']][(df['Garage Qual']!=df['Garage Cond'])].head()

Unnamed: 0_level_0,Garage Qual,Garage Cond
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1325,Fa,TA
1537,Fa,TA
1350,Fa,TA
1323,TA,Fa
2600,Fa,TA


Reviewing Porch features for overlap.

In [63]:
print(df[(df['Enclosed Porch']!= 0)]['Enclosed Porch'].count())
print(df[(df['3Ssn Porch']!= 0)]['3Ssn Porch'].count())
print(df[(df['Screen Porch']!= 0)]['Screen Porch'].count())

mask = (df['Enclosed Porch']!= 0) | (df['3Ssn Porch'] != 0) | (df['Screen Porch'] != 0)
df[['Enclosed Porch','3Ssn Porch','Screen Porch']][mask].head()

327
26
181


Unnamed: 0_level_0,Enclosed Porch,3Ssn Porch,Screen Porch
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1311,96,0,0
624,0,0,288
2243,133,0,0
2517,0,0,216
675,64,0,0


It does not appear that these features overlap. About 25% of observations represented by at least one porch feature. These will be combined into a binary feature called Porch representing whether or not a porch exists.

In [64]:
df['Porch'] = df['Enclosed Porch']+df['3Ssn Porch']+df['Screen Porch']

In [65]:
df['Porch']=df['Porch'].map(lambda x: 0 if x==0 else 1)

Confirming desired feature output.

In [66]:
df[['Porch','Enclosed Porch','3Ssn Porch','Screen Porch']].head()

Unnamed: 0_level_0,Porch,Enclosed Porch,3Ssn Porch,Screen Porch
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
109,0,0,0,0
544,0,0,0,0
153,0,0,0,0
318,0,0,0,0
255,0,0,0,0


In [67]:
df.drop(['Enclosed Porch','3Ssn Porch','Screen Porch'], axis=1, inplace=True)

Checking for redundancy in Bsmt SF features

In [68]:
((df['BsmtFin SF 1'] + df['BsmtFin SF 2'] + df['Bsmt Unf SF']) != df['Total Bsmt SF']).sum()

0

The above shows that (BsmtFin SF 1 + BsmtFin SF 2 + Bsmt Unf SF) = Total Bsmt SF for all observations. We can therefore combine BsmtFin SF 1 and Bsmt Fin SF 2 to align with BsmtFin Type 1 & BsmtFin Type 2 if combined later.

Checking for redundancy in above ground sf features.

In [69]:
mask = ((df['1st Flr SF'] + df['2nd Flr SF'])  != df['Gr Liv Area'])
df[['1st Flr SF','2nd Flr SF','Gr Liv Area']][mask].head()

Unnamed: 0_level_0,1st Flr SF,2nd Flr SF,Gr Liv Area
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2242,640,0,845
2510,520,600,1200
2041,854,0,1382
943,1013,0,1526
753,929,929,2229


These features do not represent the same information. Checking to see how many observations differ.

In [70]:
((df['1st Flr SF'] + df['2nd Flr SF'])  != df['Gr Liv Area']).sum()

33

33 observations do not align, representing ~1.5% of our dataset. Checking to see how much the difference is.

In [71]:
(df['Gr Liv Area'] - (df['1st Flr SF']+df['2nd Flr SF']))[mask]

Id
2242     205
2510      80
2041     528
943      513
753      371
1530     473
1292     108
661     1064
2195     515
724      120
1319     205
941      312
2667     572
2508      80
2697     234
178      390
1362     697
2044     114
1945     512
656      144
2853     514
910      397
2507      80
2731     140
2046     479
778      259
725      436
1330     156
1525     384
2676     360
2338      53
239      362
2842     450
dtype: int64

Since the min of 1st Flr SF and Gr Liv Area is 334.0 we can know that all observations are represented by one of the three features of interest. 33 misaligned values only represents 1.6% of our dataset so we can either choose to discard these values or accept the total given by Gr Liv Area. Since it seems that there can be a significant difference between (1st Flr SF + 2nd Flr SF) and Gr Liv Area the best option is to accept the total given by Gr Liv Area.

Checking to see if Low Qual Fin SF is already captured by other features.

In [72]:
((df['BsmtFin SF 1'] + df['BsmtFin SF 2'] + df['Gr Liv Area']) == df['Low Qual Fin SF']).sum()

0

The above shows that Low Qual Fin SF is not equivalent to the sum of all other finishes spaces captured by BsmtFin SF 1, BsmtFin SF 2, and Gr Liv Area. Determining how many observations differ.

In [73]:
(df['Low Qual Fin SF'] != 0).sum()

33

Since this feature represents such a small proportion of our dataset, ~ 1.5%, we should either exclude these values or roll them into another feature.

In [74]:
df[['Low Qual Fin SF', 'Gr Liv Area']][df['Low Qual Fin SF'] != 0].head()

Unnamed: 0_level_0,Low Qual Fin SF,Gr Liv Area
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
2242,205,845
2510,80,1200
2041,528,1382
943,513,1526
753,371,2229


Since Low Qual Fin SF can represent a significant amount of square footage in a property we should not exclude these values. Since we already know this is not encoded by any of the basement sf features we will add it to Gr Liv Area before dropping the feature.

In [75]:
df['Gr Liv Area'] = df['Low Qual Fin SF']+df['Gr Liv Area']

In [76]:
df.drop('Low Qual Fin SF',axis=1,inplace=True)

Combining Half Bath and Full Bath to a single feature Baths before dropping both columns.

In [77]:
df['Baths']= (df['Half Bath']*0.5)+df['Full Bath']

In [78]:
df.drop(['Half Bath', 'Full Bath'], axis=1,inplace=True)

In [79]:
df.columns

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd',
       'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area',
       'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF',
     

In [80]:
df.drop('Electrical',axis=1, inplace=True)

# Preprocessing
One-hot encode categorical variables.
Train/test split your data.
Scale your data.
Consider using automated feature selection.

In [81]:
num_col = df._get_numeric_data().columns

In [82]:
num_col

Index(['MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area',
       'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Central Air', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces',
       'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Paved Drive',
       'Wood Deck SF', 'Open Porch SF', 'Pool Area', 'Misc Val', 'SalePrice',
       'Basement', 'Bsmt Baths', 'Paved Street', 'Porch', 'Baths'],
      dtype='object')

In [83]:
df.drop(num_col,axis=1).columns

Index(['MS Zoning', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
       'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Type',
       'Garage Finish', 'Garage Qual', 'Garage Cond', 'Fence', 'Misc Feature',
       'Sale Type'],
      dtype='object')

Creating dummy variables for categorical features.

Recoding MS SubClass since numeric values imply ordinal feature instead of categorical.

In [84]:
df['MS SubClass']=df['MS SubClass'].map({20:'1_1946+',
                                         30:'1_1945-',
                                         40:'1_fin_attic',
                                         45:'1.5_unfin',
                                         50:'1.5_fin',
                                         60:'2_1946+',
                                         70:'2_1945-',
                                         75:'2.5_all',
                                         80:'split_multi',
                                         85:'split_foyer',
                                         90:'duplex_all',
                                         120:'1_pud',
                                         150:'1.5_pud',
                                         160:'2_pud_1946+',
                                         180:'multi_pud',
                                         190:'2_fam_conv'
                                         })

Dropping House Style since already encoded in MS SubCLass.

In [85]:
df.drop('House Style', axis=1, inplace=True)

Combining 2 feature columns (ex. Condition 1 / Condition 2) and consolidating redundant nominal categories.

In [86]:
df[['Condition 1','Condition 2']][df['Condition 1'] != df['Condition 2']].head()

Unnamed: 0_level_0,Condition 1,Condition 2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
109,RRAe,Norm
138,PosA,Norm
1942,Artery,Norm
807,Feedr,Norm
770,Feedr,Norm


In [87]:
df.columns

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add',
       'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd',
       'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond',
       'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       '1st Flr SF', '2nd Flr SF', 'Gr Liv Area', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Wood Deck SF', 'Open Porch SF',
       'Pool Area

In [88]:
def two_to_one(col1, col2, res_col):
    feat_list = set((list(df[col1].unique())+(list(df[col2].unique()))))
    foo = pd.get_dummies(data=df, columns=[col1,col2])
    
    for feat in feat_list:
        cols = foo.columns.str.contains(feat)
        foo[f'{res_col}_{feat}'] = foo[foo.columns[cols]].max(1)
       
    return foo

In [89]:
df = two_to_one('Condition 1', 'Condition 2', 'Condition')

In [90]:
df.columns

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Bldg Type', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2',
       'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC',
       'Central Air', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd',
       'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area',
       'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF',
       'Open Porch SF', 'Pool Area', 'Fence', 'Misc Feature', 'Misc Val

In [91]:
def comb_dummy(str, res_col):
    cols = df.columns.str.contains(str)
    df[res_col] = df[df.columns[cols]].max(1)
    
    return df.columns

In [92]:
comb_dummy('RR', 'Condition_RR')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Bldg Type', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2',
       'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC',
       'Central Air', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd',
       'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area',
       'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF',
       'Open Porch SF', 'Pool Area', 'Fence', 'Misc Feature', 'Misc Val

In [93]:
comb_dummy('Artery|Feedr','Condition_Street')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Bldg Type', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2',
       'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC',
       'Central Air', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd',
       'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area',
       'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF',
       'Open Porch SF', 'Pool Area', 'Fence', 'Misc Feature', 'Misc Val

In [94]:
comb_dummy('Pos','Condition_Park_Sch')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Bldg Type', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2',
       'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC',
       'Central Air', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd',
       'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area',
       'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF',
       'Open Porch SF', 'Pool Area', 'Fence', 'Misc Feature', 'Misc Val

In [95]:
drop_list = df.columns.str.contains('Artery|Feedr|RR.|Pos|Condition\s\d')
df.columns[drop_list]

Index(['Condition 1_Artery', 'Condition 1_Feedr', 'Condition 1_Norm',
       'Condition 1_PosA', 'Condition 1_PosN', 'Condition 1_RRAe',
       'Condition 1_RRAn', 'Condition 1_RRNe', 'Condition 1_RRNn',
       'Condition 2_Artery', 'Condition 2_Feedr', 'Condition 2_Norm',
       'Condition 2_PosA', 'Condition 2_PosN', 'Condition 2_RRAe',
       'Condition 2_RRAn', 'Condition 2_RRNn', 'Condition_RRNe',
       'Condition_PosA', 'Condition_RRAn', 'Condition_Feedr',
       'Condition_Artery', 'Condition_RRAe', 'Condition_PosN',
       'Condition_RRNn'],
      dtype='object')

In [96]:
df.drop(df.columns[drop_list], axis=1, inplace=True)

In [97]:
df.columns

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Bldg Type', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2',
       'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC',
       'Central Air', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd',
       'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area',
       'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF',
       'Open Porch SF', 'Pool Area', 'Fence', 'Misc Feature', 'Misc Val

In [98]:
set((list(df['Exterior 1st'].unique())+(list(df['Exterior 2nd'].unique()))))

{'AsbShng',
 'AsphShn',
 'Brk Cmn',
 'BrkComm',
 'BrkFace',
 'CBlock',
 'CemntBd',
 'CmentBd',
 'HdBoard',
 'ImStucc',
 'MetalSd',
 'Plywood',
 'Stone',
 'Stucco',
 'VinylSd',
 'Wd Sdng',
 'Wd Shng',
 'WdShing'}

In [99]:
df = two_to_one('Exterior 1st','Exterior 2nd', 'Exterior')

In [100]:
comb_dummy('_As|\sAs', 'Exterior_Asph')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       ...
       'Exterior_AsphShn', 'Exterior_Stucco', 'Exterior_VinylSd',
       'Exterior_BrkComm', 'Exterior_ImStucc', 'Exterior_Plywood',
       'Exterior_BrkFace', 'Exterior_HdBoard', 'Exterior_AsbShng',
       'Exterior_Asph'],
      dtype='object', length=119)

In [101]:
comb_dummy('Wd', 'Exterior_Wd')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       ...
       'Exterior_Stucco', 'Exterior_VinylSd', 'Exterior_BrkComm',
       'Exterior_ImStucc', 'Exterior_Plywood', 'Exterior_BrkFace',
       'Exterior_HdBoard', 'Exterior_AsbShng', 'Exterior_Asph', 'Exterior_Wd'],
      dtype='object', length=120)

In [102]:
comb_dummy('Brk','Exterior_Brk')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       ...
       'Exterior_VinylSd', 'Exterior_BrkComm', 'Exterior_ImStucc',
       'Exterior_Plywood', 'Exterior_BrkFace', 'Exterior_HdBoard',
       'Exterior_AsbShng', 'Exterior_Asph', 'Exterior_Wd', 'Exterior_Brk'],
      dtype='object', length=121)

In [103]:
comb_dummy('ntBd', 'Exterior_Cmnt')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       ...
       'Exterior_BrkComm', 'Exterior_ImStucc', 'Exterior_Plywood',
       'Exterior_BrkFace', 'Exterior_HdBoard', 'Exterior_AsbShng',
       'Exterior_Asph', 'Exterior_Wd', 'Exterior_Brk', 'Exterior_Cmnt'],
      dtype='object', length=122)

In [104]:
drop_list = df.columns.str.contains('Exterior\s|ntBd|Wd\s|Brk.|Asb|Asphs')
df.columns[drop_list]

Index(['Exterior 1st_AsbShng', 'Exterior 1st_AsphShn', 'Exterior 1st_BrkComm',
       'Exterior 1st_BrkFace', 'Exterior 1st_CBlock', 'Exterior 1st_CemntBd',
       'Exterior 1st_HdBoard', 'Exterior 1st_ImStucc', 'Exterior 1st_MetalSd',
       'Exterior 1st_Plywood', 'Exterior 1st_Stone', 'Exterior 1st_Stucco',
       'Exterior 1st_VinylSd', 'Exterior 1st_Wd Sdng', 'Exterior 1st_WdShing',
       'Exterior 2nd_AsbShng', 'Exterior 2nd_AsphShn', 'Exterior 2nd_Brk Cmn',
       'Exterior 2nd_BrkFace', 'Exterior 2nd_CBlock', 'Exterior 2nd_CmentBd',
       'Exterior 2nd_HdBoard', 'Exterior 2nd_ImStucc', 'Exterior 2nd_MetalSd',
       'Exterior 2nd_Plywood', 'Exterior 2nd_Stone', 'Exterior 2nd_Stucco',
       'Exterior 2nd_VinylSd', 'Exterior 2nd_Wd Sdng', 'Exterior 2nd_Wd Shng',
       'Exterior_Wd Shng', 'Exterior_CemntBd', 'Exterior_Wd Sdng',
       'Exterior_CmentBd', 'Exterior_Brk Cmn', 'Exterior_BrkComm',
       'Exterior_BrkFace', 'Exterior_AsbShng'],
      dtype='object')

In [105]:
df.drop(df.columns[drop_list], axis=1,inplace=True)

In [106]:
set((list(df['BsmtFin Type 1'].unique())+(list(df['BsmtFin Type 2'].unique()))))

{'ALQ', 'BLQ', 'GLQ', 'LwQ', 'Rec', 'Unf', 'none'}

In [107]:
df = two_to_one('BsmtFin Type 1','BsmtFin Type 2', 'Bsmt Fin')

In [108]:
drop_list = df.columns.str.contains('BsmtFin Type')
df.columns[drop_list]

Index(['BsmtFin Type 1_ALQ', 'BsmtFin Type 1_BLQ', 'BsmtFin Type 1_GLQ',
       'BsmtFin Type 1_LwQ', 'BsmtFin Type 1_Rec', 'BsmtFin Type 1_Unf',
       'BsmtFin Type 1_none', 'BsmtFin Type 2_ALQ', 'BsmtFin Type 2_BLQ',
       'BsmtFin Type 2_GLQ', 'BsmtFin Type 2_LwQ', 'BsmtFin Type 2_Rec',
       'BsmtFin Type 2_Unf', 'BsmtFin Type 2_none'],
      dtype='object')

In [109]:
df.drop(df.columns[drop_list],axis=1,inplace=True)

Combining BsmtFin SF1 and BsmtFin SF2 into BsmtFin SF before dropping features.

In [110]:
df['BsmtFin SF'] = df['BsmtFin SF 1'] + df['BsmtFin SF 2']

In [111]:
df.drop(['BsmtFin SF 1', 'BsmtFin SF 2'], axis=1, inplace=True)

In [112]:
df.dtypes.value_counts()

object     31
uint8      24
int64      23
float64    10
dtype: int64

In [113]:
df = pd.get_dummies(data=df, columns=['MS SubClass',
                                      'MS Zoning',
                                      'Alley',
                                      'Lot Shape',
                                      'Land Contour',
                                      'Utilities',
                                      'Lot Config',
                                      'Land Slope', 
                                      'Neighborhood', 
                                      'Bldg Type',  
                                      'Roof Style', 
                                      'Roof Matl',
                                      'Mas Vnr Type', 
                                      'Exter Qual',
                                      'Exter Cond', 
                                      'Foundation', 
                                      'Bsmt Qual', 
                                      'Bsmt Cond', 
                                      'Bsmt Exposure',
                                      'Heating', 
                                      'Heating QC',
                                      'Kitchen Qual', 
                                      'Functional',
                                      'Fireplace Qu', 
                                      'Garage Type', 
                                      'Garage Finish', 
                                      'Garage Qual',
                                      'Garage Cond', 
                                      'Fence', 
                                      'Misc Feature', 
                                      'Sale Type',
                                      ])

Confirming correct merge and feature consolidation.

In [114]:
df.dtypes.value_counts()

uint8      222
int64       23
float64     10
dtype: int64

In [115]:
df_cols = df.columns.tolist()

In [116]:
df.to_csv('../data/train_clean.csv', index_label='Id')

# CONTINUE TO NOTEBOOK 3