# NOTEBOOK 04a: KAGGLE - Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import pickle
import re
import time

from scipy import stats
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, SelectFromModel, f_regression, RFECV
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

np.random.seed(42)

%matplotlib inline

In [2]:
now ='1544174228'

In [3]:
def extract_element_name(file_path, now):
    return re.findall(f'/([^/]*)_{now}', file_path)[0]

In [4]:
def make_file_dict(now):
    file_dict = {}
    file_list = !ls ../*/*'{now}'*
    for file in file_list:
        file_dict[extract_element_name(file, now)] = file
    return file_dict

In [5]:
file_dict = make_file_dict(f'{now}')

In [6]:
file_dict

{'columns': '../assets/columns_1544174228.pkl',
 'gs': '../assets/gs_1544174228.pkl'}

In [7]:
# use pickle.load to load in these assets
with open(f'../assets/columns_{now}.pkl', 'rb') as f:
    columns = pickle.load(f)
    
with open(f'../assets/gs_{now}.pkl', 'rb') as f:
    gs = pickle.load(f)

In [8]:
gs

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('var_thresh', VarianceThreshold(threshold=0)), ('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kbest', SelectKBest(k=37, score_func=<function f_regression at 0x1a2103cf28>)), ('ridge', RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'var_thresh__threshold': [0, 0.001, 0.01, 0.02, 0.05], 'kbest__k': [5, 7, 13, 17, 23, 37, 53, 79, 'all'], 'ridge__alphas': [array([1.00000e-05, 1.88965e-05, ..., 5.29198e+49, 1.00000e+50])]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=1)

In [9]:
columns

Index(['Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'Bsmt Unf SF',
       'Total Bsmt SF', 'Central Air',
       ...
       'Misc Feature_none', 'Sale Type_COD', 'Sale Type_CWD', 'Sale Type_Con',
       'Sale Type_ConLD', 'Sale Type_ConLI', 'Sale Type_ConLw',
       'Sale Type_New', 'Sale Type_Oth', 'Sale Type_WD '],
      dtype='object', length=254)

In [10]:
kaggle = pd.read_csv('../data/test.csv', index_col='Id')

In [11]:
kaggle.head()

Unnamed: 0_level_0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,0,,,,0,4,2006,WD
2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,AllPub,...,0,0,0,,,,0,8,2006,WD
2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,...,0,0,0,,,,0,9,2006,New
1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,...,0,0,0,,,,0,7,2007,WD
625,535105100,20,RL,,9500,Pave,,IR1,Lvl,AllPub,...,0,185,0,,,,0,7,2009,WD


In [12]:
kaggle.isna().sum()

PID                 0
MS SubClass         0
MS Zoning           0
Lot Frontage      160
Lot Area            0
Street              0
Alley             821
Lot Shape           0
Land Contour        0
Utilities           0
Lot Config          0
Land Slope          0
Neighborhood        0
Condition 1         0
Condition 2         0
Bldg Type           0
House Style         0
Overall Qual        0
Overall Cond        0
Year Built          0
Year Remod/Add      0
Roof Style          0
Roof Matl           0
Exterior 1st        0
Exterior 2nd        0
Mas Vnr Type        1
Mas Vnr Area        1
Exter Qual          0
Exter Cond          0
Foundation          0
                 ... 
Full Bath           0
Half Bath           0
Bedroom AbvGr       0
Kitchen AbvGr       0
Kitchen Qual        0
TotRms AbvGrd       0
Functional          0
Fireplaces          0
Fireplace Qu      422
Garage Type        44
Garage Yr Blt      45
Garage Finish      45
Garage Cars         0
Garage Area         0
Garage Qua

## Data Cleaning / Converting Null Values

In [13]:
df = kaggle

In [14]:
df.drop(['PID'], axis=1, inplace=True)

In [15]:
df.drop(['Mo Sold','Yr Sold'], axis=1, inplace=True)

In [16]:
df.drop(['Pool QC'], axis=1, inplace=True)

In [17]:
df['Misc Feature'].fillna('none',inplace=True)

In [18]:
df['Alley'].fillna(value='none', inplace=True)

In [19]:
df['Fence'].fillna(value='none', inplace=True)

In [20]:
df['Fireplace Qu'].fillna('none', inplace=True)

In [21]:
df['Lot Frontage'].fillna(value=0.0, inplace=True)

In [22]:
df['Garage Cond'].fillna(value='none', inplace=True)
df['Garage Qual'].fillna(value='none', inplace=True)
df['Garage Yr Blt'].fillna(value=df['Year Built'], inplace=True)
df['Garage Finish'].fillna(value='none', inplace=True)
df['Garage Type'].fillna(value='none', inplace=True)

In [23]:
df['Garage Yr Blt'] = df['Garage Yr Blt'].astype(int)

In [24]:
df['Basement'] = df['Bsmt Qual'].map(lambda x: 1 if type(x)==str else 0)

In [25]:
df['Bsmt Exposure'].fillna(value='none', inplace=True)
df['BsmtFin Type 2'].fillna(value='none', inplace=True)
df['Bsmt Qual'].fillna(value='none', inplace=True)
df['BsmtFin Type 1'].fillna(value='none', inplace=True)
df['Bsmt Cond'].fillna(value='none', inplace=True)

In [26]:
df['Mas Vnr Type'].fillna(value='none', inplace=True)
df['Mas Vnr Area'].fillna(value=0, inplace=True)

In [27]:
df['Bsmt Half Bath'].fillna(value=0, inplace=True)
df['Bsmt Full Bath'].fillna(value=0, inplace=True)
df['Total Bsmt SF'].fillna(value=0.0, inplace=True)
df['Bsmt Unf SF'].fillna(value=0.0, inplace=True)
df['BsmtFin SF 1'].fillna(value=0.0, inplace=True)
df['BsmtFin SF 2'].fillna(value=0.0, inplace=True)

In [28]:
df['Bsmt Baths']=(df['Bsmt Half Bath']*0.5)+df['Bsmt Full Bath']

In [29]:
df.drop(['Bsmt Half Bath','Bsmt Full Bath'], axis=1, inplace=True)

In [30]:
df['Garage Cars'].fillna(value=0, inplace=True)
df['Garage Area'].fillna(value=0.0, inplace=True)

In [31]:
df.drop('Electrical',axis=1, inplace=True)

Confirming that all NaNs have now been corrected.

In [32]:
df.isna().sum().sum()

0

In [33]:
df.isna().sum().sort_values(ascending=False)

Bsmt Baths         0
Year Built         0
Roof Style         0
Roof Matl          0
Exterior 1st       0
Exterior 2nd       0
Mas Vnr Type       0
Mas Vnr Area       0
Exter Qual         0
Exter Cond         0
Foundation         0
Bsmt Qual          0
Bsmt Cond          0
Bsmt Exposure      0
BsmtFin Type 1     0
BsmtFin SF 1       0
BsmtFin Type 2     0
Year Remod/Add     0
Overall Cond       0
Basement           0
Overall Qual       0
MS Zoning          0
Lot Frontage       0
Lot Area           0
Street             0
Alley              0
Lot Shape          0
Land Contour       0
Utilities          0
Lot Config         0
                  ..
Paved Drive        0
Wood Deck SF       0
Open Porch SF      0
Enclosed Porch     0
3Ssn Porch         0
Screen Porch       0
Pool Area          0
Fence              0
Misc Feature       0
Misc Val           0
Sale Type          0
Garage Finish      0
Garage Yr Blt      0
Garage Type        0
Full Bath          0
Heating QC         0
Central Air  

# Feature Engineering

In [34]:
df['Central Air'] = df['Central Air'].map({'N':0, 'Y':1})

In [35]:
df['Paved Street'] = df['Street'].map({'Grvl':0, 'Pave':1})

In [36]:
df.drop('Street', axis=1, inplace=True)

In [37]:
df['Paved Drive'] = df['Paved Drive'].map({'N':0,'P':0,'Y':1})

In [38]:
df['Porch'] = df['Enclosed Porch']+df['3Ssn Porch']+df['Screen Porch']

In [39]:
df['Porch']=df['Porch'].map(lambda x: 0 if x==0 else 1)

In [40]:
df.drop(['Enclosed Porch','3Ssn Porch','Screen Porch'], axis=1, inplace=True)

In [41]:
df['Gr Liv Area'] = df['Low Qual Fin SF']+df['Gr Liv Area']

In [42]:
df.drop('Low Qual Fin SF',axis=1,inplace=True)

In [43]:
df['Baths']= (df['Half Bath']*0.5)+df['Full Bath']

In [44]:
df.drop(['Half Bath', 'Full Bath'], axis=1,inplace=True)

In [45]:
df.columns

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       '1st Flr SF', '2nd Flr SF', 'Gr Liv Area', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Wood Deck SF', 'Open Porch SF',
  

# Pre-processing
One-hot encode categorical variables.
Train/test split your data.
Scale your data.
Consider using automated feature selection.

In [46]:
df['MS SubClass']=df['MS SubClass'].map({20:'1_1946+',
                                         30:'1_1945-',
                                         40:'1_fin_attic',
                                         45:'1.5_unfin',
                                         50:'1.5_fin',
                                         60:'2_1946+',
                                         70:'2_1945-',
                                         75:'2.5_all',
                                         80:'split_multi',
                                         85:'split_foyer',
                                         90:'duplex_all',
                                         120:'1_pud',
                                         150:'1.5_pud',
                                         160:'2_pud_1946+',
                                         180:'multi_pud',
                                         190:'2_fam_conv'
                                         })

In [47]:
df.drop('House Style', axis=1, inplace=True)

In [48]:
def two_to_one(col1, col2, res_col):
    feat_list = set((list(df[col1].unique())+(list(df[col2].unique()))))
    foo = pd.get_dummies(data=df, columns=[col1,col2])
    
    for feat in feat_list:
        cols = foo.columns.str.contains(feat)
        foo[f'{res_col}_{feat}'] = foo[foo.columns[cols]].max(1)
       
    return foo

In [49]:
df = two_to_one('Condition 1', 'Condition 2', 'Condition')

In [50]:
def comb_dummy(str, res_col):
    cols = df.columns.str.contains(str)
    df[res_col] = df[df.columns[cols]].max(1)
    
    return df.columns

In [51]:
comb_dummy('RR', 'Condition_RR')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Bldg Type', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2',
       'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC',
       'Central Air', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd',
       'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area',
       'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF',
       'Open Porch SF', 'Pool Area', 'Fence', 'Misc Feature', 'Misc Val

In [52]:
comb_dummy('Artery|Feedr','Condition_Street')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Bldg Type', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2',
       'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC',
       'Central Air', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd',
       'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area',
       'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF',
       'Open Porch SF', 'Pool Area', 'Fence', 'Misc Feature', 'Misc Val

In [53]:
comb_dummy('Pos','Condition_Park_Sch')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Bldg Type', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2',
       'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC',
       'Central Air', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd',
       'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area',
       'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck SF',
       'Open Porch SF', 'Pool Area', 'Fence', 'Misc Feature', 'Misc Val

In [54]:
drop_list = df.columns.str.contains('Artery|Feedr|RR.|Pos|Condition\s\d')
df.columns[drop_list]

Index(['Condition 1_Artery', 'Condition 1_Feedr', 'Condition 1_Norm',
       'Condition 1_PosA', 'Condition 1_PosN', 'Condition 1_RRAe',
       'Condition 1_RRAn', 'Condition 1_RRNe', 'Condition 1_RRNn',
       'Condition 2_Feedr', 'Condition 2_Norm', 'Condition 2_PosA',
       'Condition 2_PosN', 'Condition_Artery', 'Condition_RRNe',
       'Condition_PosN', 'Condition_RRAn', 'Condition_PosA', 'Condition_Feedr',
       'Condition_RRNn', 'Condition_RRAe'],
      dtype='object')

In [55]:
df.drop(df.columns[drop_list], axis=1, inplace=True)

In [56]:
df = two_to_one('Exterior 1st','Exterior 2nd', 'Exterior')

In [57]:
comb_dummy('_As|\sAs', 'Exterior_Asph')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       ...
       'Exterior_CemntBd', 'Exterior_HdBoard', 'Exterior_AsbShng',
       'Exterior_AsphShn', 'Exterior_Wd Sdng', 'Exterior_BrkFace',
       'Exterior_VinylSd', 'Exterior_CmentBd', 'Exterior_CBlock',
       'Exterior_Asph'],
      dtype='object', length=118)

In [58]:
comb_dummy('Wd', 'Exterior_Wd')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       ...
       'Exterior_HdBoard', 'Exterior_AsbShng', 'Exterior_AsphShn',
       'Exterior_Wd Sdng', 'Exterior_BrkFace', 'Exterior_VinylSd',
       'Exterior_CmentBd', 'Exterior_CBlock', 'Exterior_Asph', 'Exterior_Wd'],
      dtype='object', length=119)

In [59]:
comb_dummy('Brk','Exterior_Brk')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       ...
       'Exterior_AsbShng', 'Exterior_AsphShn', 'Exterior_Wd Sdng',
       'Exterior_BrkFace', 'Exterior_VinylSd', 'Exterior_CmentBd',
       'Exterior_CBlock', 'Exterior_Asph', 'Exterior_Wd', 'Exterior_Brk'],
      dtype='object', length=120)

In [60]:
comb_dummy('ntBd', 'Exterior_Cmnt')

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Alley',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       ...
       'Exterior_AsphShn', 'Exterior_Wd Sdng', 'Exterior_BrkFace',
       'Exterior_VinylSd', 'Exterior_CmentBd', 'Exterior_CBlock',
       'Exterior_Asph', 'Exterior_Wd', 'Exterior_Brk', 'Exterior_Cmnt'],
      dtype='object', length=121)

In [61]:
drop_list = df.columns.str.contains('Exterior\s|ntBd|Wd\s|Brk.|Asb|Asphs')
df.columns[drop_list]

Index(['Exterior 1st_AsbShng', 'Exterior 1st_AsphShn', 'Exterior 1st_BrkComm',
       'Exterior 1st_BrkFace', 'Exterior 1st_CemntBd', 'Exterior 1st_HdBoard',
       'Exterior 1st_MetalSd', 'Exterior 1st_Plywood', 'Exterior 1st_PreCast',
       'Exterior 1st_Stucco', 'Exterior 1st_VinylSd', 'Exterior 1st_Wd Sdng',
       'Exterior 1st_WdShing', 'Exterior 2nd_AsbShng', 'Exterior 2nd_AsphShn',
       'Exterior 2nd_Brk Cmn', 'Exterior 2nd_BrkFace', 'Exterior 2nd_CBlock',
       'Exterior 2nd_CmentBd', 'Exterior 2nd_HdBoard', 'Exterior 2nd_ImStucc',
       'Exterior 2nd_MetalSd', 'Exterior 2nd_Other', 'Exterior 2nd_Plywood',
       'Exterior 2nd_PreCast', 'Exterior 2nd_Stucco', 'Exterior 2nd_VinylSd',
       'Exterior 2nd_Wd Sdng', 'Exterior 2nd_Wd Shng', 'Exterior_Brk Cmn',
       'Exterior_Wd Shng', 'Exterior_BrkComm', 'Exterior_CemntBd',
       'Exterior_AsbShng', 'Exterior_Wd Sdng', 'Exterior_BrkFace',
       'Exterior_CmentBd'],
      dtype='object')

In [62]:
df.drop(df.columns[drop_list], axis=1,inplace=True)

In [63]:
df = two_to_one('BsmtFin Type 1','BsmtFin Type 2', 'Bsmt Fin')

In [64]:
drop_list = df.columns.str.contains('BsmtFin Type')
df.columns[drop_list]

Index(['BsmtFin Type 1_ALQ', 'BsmtFin Type 1_BLQ', 'BsmtFin Type 1_GLQ',
       'BsmtFin Type 1_LwQ', 'BsmtFin Type 1_Rec', 'BsmtFin Type 1_Unf',
       'BsmtFin Type 1_none', 'BsmtFin Type 2_ALQ', 'BsmtFin Type 2_BLQ',
       'BsmtFin Type 2_GLQ', 'BsmtFin Type 2_LwQ', 'BsmtFin Type 2_Rec',
       'BsmtFin Type 2_Unf', 'BsmtFin Type 2_none'],
      dtype='object')

In [65]:
df.drop(df.columns[drop_list],axis=1,inplace=True)

In [66]:
df['BsmtFin SF'] = df['BsmtFin SF 1'] + df['BsmtFin SF 2']

In [67]:
df.drop(['BsmtFin SF 1', 'BsmtFin SF 2'], axis=1, inplace=True)

In [68]:
df = pd.get_dummies(data=df, columns=['MS SubClass',
                                      'MS Zoning',
                                      'Alley',
                                      'Lot Shape',
                                      'Land Contour',
                                      'Utilities',
                                      'Lot Config',
                                      'Land Slope', 
                                      'Neighborhood', 
                                      'Bldg Type',  
                                      'Roof Style', 
                                      'Roof Matl',
                                      'Mas Vnr Type', 
                                      'Exter Qual',
                                      'Exter Cond', 
                                      'Foundation', 
                                      'Bsmt Qual', 
                                      'Bsmt Cond', 
                                      'Bsmt Exposure',
                                      'Heating', 
                                      'Heating QC',
                                      'Kitchen Qual', 
                                      'Functional',
                                      'Fireplace Qu', 
                                      'Garage Type', 
                                      'Garage Finish', 
                                      'Garage Qual',
                                      'Garage Cond', 
                                      'Fence', 
                                      'Misc Feature', 
                                      'Sale Type',
                                      ])

In [69]:
df.dtypes.value_counts()

uint8      212
int64       28
float64      4
dtype: int64

In [70]:
df.columns

Index(['Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'Bsmt Unf SF',
       'Total Bsmt SF', 'Central Air',
       ...
       'Sale Type_COD', 'Sale Type_CWD', 'Sale Type_Con', 'Sale Type_ConLD',
       'Sale Type_ConLI', 'Sale Type_ConLw', 'Sale Type_New', 'Sale Type_Oth',
       'Sale Type_VWD', 'Sale Type_WD '],
      dtype='object', length=244)

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 879 entries, 2658 to 1939
Columns: 244 entries, Lot Frontage to Sale Type_WD 
dtypes: float64(4), int64(28), uint8(212)
memory usage: 408.6 KB


In [72]:
kag_col = set(df.columns)
train_col = set(columns)

not_seen = kag_col - train_col
not_here = train_col - kag_col

for col in not_seen:
    df.drop(col,axis=1,inplace=True)
for col in not_here:
    df[col] = 0

In [73]:
df.columns.shape

(254,)

Confirming that shape matches the training data that the model was fit on.

In [74]:
df.to_csv('../data/kaggle_clean.csv', index_label='Id')