In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


import warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the raw data set; the path is relative to the notebook's working directory.
train_df  = pd.read_csv('datasets/train.csv')

In [6]:
train_df

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,20,RL,79.0,11449,Pave,,IR1,HLS,...,0,0,,,,0,1,2008,WD,298751
2047,785,905377130,30,RL,,12342,Pave,,IR1,Lvl,...,0,0,,,,0,3,2009,WD,82500
2048,916,909253010,50,RL,57.0,7558,Pave,,Reg,Bnk,...,0,0,,,,0,3,2009,WD,177000
2049,639,535179160,20,RL,80.0,10400,Pave,,Reg,Lvl,...,0,0,,,,0,11,2009,WD,144000


In [7]:
# Normalise the column labels to snake_case: lower-case each label and swap
# spaces for underscores in a single pass.
train_df.columns = [label.lower().replace(' ', '_') for label in train_df.columns]
train_df

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,20,RL,79.0,11449,Pave,,IR1,HLS,...,0,0,,,,0,1,2008,WD,298751
2047,785,905377130,30,RL,,12342,Pave,,IR1,Lvl,...,0,0,,,,0,3,2009,WD,82500
2048,916,909253010,50,RL,57.0,7558,Pave,,Reg,Bnk,...,0,0,,,,0,3,2009,WD,177000
2049,639,535179160,20,RL,80.0,10400,Pave,,Reg,Lvl,...,0,0,,,,0,11,2009,WD,144000


### Test Train Split

In [8]:
# Features: every column except the target.
X = train_df.drop('saleprice', axis=1)
# Target, with 'id' kept next to it so the prices can be joined back onto the
# processed features by 'id' at the end of the notebook.
y = train_df.loc[:, ['saleprice', 'id']]

In [9]:
# Split features and target into train and hold-out parts. No test_size is passed,
# so scikit-learn's default proportion applies; random_state pins the shuffle so
# the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

### Split DataFrame Based on Type of Feature

In [10]:
# Separate the columns by dtype so each group can be explored, imputed and encoded
# on its own. The target was already removed when X was built.
# Note: despite the *_train_df names, these frames hold the rows of X_test
# (the hold-out split).

# numeric features, rows ordered by index
numeric_train_df = X_test.sort_index().select_dtypes(include=['int', 'float'])

# categorical features, rows ordered by index
categorical_train_df = X_test.sort_index().select_dtypes(include=['object'])

# Numeric Features

In [11]:
# Move the identifier columns into their own frame, then take them out of the
# numeric features so they are not imputed or scaled with the rest.
identifier_columns = ['id', 'pid']
id_train_df = numeric_train_df[identifier_columns]

numeric_train_df = numeric_train_df.drop(identifier_columns, axis=1)

In [12]:
# Count missing values per numeric column, largest first, and show only the
# columns that actually have gaps.
missing_per_column = numeric_train_df.isna().sum()
numeric_null = missing_per_column.sort_values(ascending=False)
has_missing = numeric_null > 0
numeric_null[has_missing]

lot_frontage     77
garage_yr_blt    26
mas_vnr_area      4
garage_cars       1
garage_area       1
dtype: int64

#### Numeric categories that could mean 0 for NA


In [13]:
def na_0_filler(df,columns):
    """Replace missing values with 0 in the given columns of ``df``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is written to directly; no copy is made.
    columns : str or iterable
        A single column label, or an iterable of keys. Each key is used as
        ``df[key]``, so an element may itself be a list of labels.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, so the call can be chained.
    """
    # A bare string would otherwise be iterated character by character and
    # fail with a KeyError on the first letter.
    if isinstance(columns, str):
        columns = [columns]
    for column in columns:
        df[column] = df[column].fillna(0)
    return df

In [14]:
# Numeric columns where a missing value is taken to mean "nothing there", so NA -> 0:
#   lot_frontage  - linear feet of street connected to the property;
#                   NA is assumed to mean zero feet of street connected
#   garage_yr_blt - year the garage was built; NA is assumed to mean no garage
#   mas_vnr_area  - masonry veneer area in square feet;
#                   NA is assumed to mean no masonry veneer
# NOTE(review): garage_yr_blt is a calendar year, so a 0 fill sits far away from the
# real values once the column is standardised later on - confirm this is intended.
numeric_na_0_columns = ['garage_yr_blt','mas_vnr_area','lot_frontage']

In [15]:
# Fill the listed numeric columns with 0 (na_0_filler writes into numeric_train_df
# itself). The list is passed as-is: wrapping it in another list made the helper
# loop run once with the whole list as a single key.
na_0_filler(numeric_train_df,numeric_na_0_columns).head()

Unnamed: 0,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,...,garage_area,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold
2,20,68.0,7922,5,7,1953,2007,0.0,731.0,0.0,...,246.0,0,52,0,0,0,0,0,1,2010
23,20,0.0,7980,6,7,1992,2007,0.0,935.0,0.0,...,420.0,483,21,0,0,0,0,500,3,2010
25,20,92.0,10573,6,6,1961,1961,3.0,1312.0,0.0,...,530.0,0,49,0,0,288,0,0,4,2009
29,50,60.0,8064,5,7,1949,2006,0.0,0.0,0.0,...,576.0,0,0,0,0,0,0,2000,7,2007
30,20,80.0,9600,8,5,1981,1981,0.0,1104.0,0.0,...,542.0,474,120,0,0,0,0,0,7,2009


#### Numeric categories that could need imputation

In [16]:
# Imputer that fills missing entries with the median of each column it is fitted on.
imputer = SimpleImputer(strategy='median')

In [17]:
# Columns whose gaps are filled with the column median rather than with 0.
# In this split garage_cars and garage_area are each missing a single value
# (see the null counts above), so the median is used as a neutral fill.
imputer_columns = ['garage_cars','garage_area']

In [18]:
# Fit the median imputer on the selected columns and write the filled values back
# into the numeric frame.
filled_values = imputer.fit_transform(numeric_train_df[imputer_columns])
numeric_train_df[imputer_columns] = filled_values
numeric_train_df

Unnamed: 0,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,...,garage_area,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold
2,20,68.0,7922,5,7,1953,2007,0.0,731.0,0.0,...,246.0,0,52,0,0,0,0,0,1,2010
23,20,0.0,7980,6,7,1992,2007,0.0,935.0,0.0,...,420.0,483,21,0,0,0,0,500,3,2010
25,20,92.0,10573,6,6,1961,1961,3.0,1312.0,0.0,...,530.0,0,49,0,0,288,0,0,4,2009
29,50,60.0,8064,5,7,1949,2006,0.0,0.0,0.0,...,576.0,0,0,0,0,0,0,2000,7,2007
30,20,80.0,9600,8,5,1981,1981,0.0,1104.0,0.0,...,542.0,474,120,0,0,0,0,0,7,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043,70,50.0,6000,6,6,1939,1950,0.0,276.0,0.0,...,217.0,0,0,0,0,0,0,0,2,2007
2044,60,65.0,8453,6,5,1995,1995,38.0,362.0,0.0,...,525.0,0,70,0,0,0,0,0,4,2008
2045,20,96.0,12444,8,5,2008,2008,426.0,1336.0,0.0,...,774.0,0,66,0,304,0,0,0,11,2008
2046,20,79.0,11449,8,5,2007,2007,0.0,1011.0,0.0,...,520.0,0,276,0,0,0,0,0,1,2008


## Standard Scaler

In [19]:
# Scaler with default settings; it is fitted on the numeric frame in the next cell.
ss = StandardScaler()

In [20]:
# Standardise every numeric column, then wrap the resulting array in a DataFrame
# that carries over the original row index and column labels.
scaled_values = ss.fit_transform(numeric_train_df)
scaled_numeric_train_df = pd.DataFrame(
    data=scaled_values,
    index=numeric_train_df.index,
    columns=numeric_train_df.columns,
)

In [21]:
scaled_numeric_train_df.head()

Unnamed: 0,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,...,garage_area,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold
2,-0.890963,0.328038,-0.255926,-0.788218,1.281202,-0.604709,1.064725,-0.588264,0.672512,-0.318796,...,-1.107566,-0.671564,0.065779,-0.403994,-0.112734,-0.293628,-0.055301,-0.080778,-1.859386,1.672598
23,-0.890963,-1.876406,-0.249921,-0.083765,1.281202,0.687305,1.064725,-0.588264,1.140011,-0.318796,...,-0.273197,2.805087,-0.402478,-0.403994,-0.112734,-0.293628,-0.055301,1.209759,-1.163896,1.672598
25,-0.890963,1.106077,0.018577,-0.083765,0.385757,-0.339681,-1.088633,-0.570607,2.003968,-0.318796,...,0.254278,-0.671564,0.020464,-0.403994,-0.112734,4.637239,-0.055301,-0.080778,-0.816151,0.921904
29,-0.170231,0.068691,-0.241223,-0.788218,1.281202,-0.737224,1.017913,-0.588264,-1.002694,-0.318796,...,0.474858,-0.671564,-0.719684,-0.403994,-0.112734,-0.293628,-0.055301,5.08137,0.227085,-0.579483
30,-0.890963,0.717057,-0.082174,1.325139,-0.509688,0.322891,-0.15239,-0.588264,1.527302,-0.318796,...,0.311821,2.740305,1.092924,-0.403994,-0.112734,-0.293628,-0.055301,-0.080778,0.227085,0.921904


In [22]:
#add id and PID back to the numeric columns
# Any id/pid columns already present are dropped first, so running this cell a
# second time does not append duplicate identifier columns.
scaled_numeric_train_df = pd.concat(
    [
        id_train_df,
        scaled_numeric_train_df.drop(columns=id_train_df.columns, errors='ignore'),
    ],
    axis=1,
)

In [23]:
scaled_numeric_train_df

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,...,garage_area,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold
2,153,535304180,-0.890963,0.328038,-0.255926,-0.788218,1.281202,-0.604709,1.064725,-0.588264,...,-1.107566,-0.671564,0.065779,-0.403994,-0.112734,-0.293628,-0.055301,-0.080778,-1.859386,1.672598
23,12,527165230,-0.890963,-1.876406,-0.249921,-0.083765,1.281202,0.687305,1.064725,-0.588264,...,-0.273197,2.805087,-0.402478,-0.403994,-0.112734,-0.293628,-0.055301,1.209759,-1.163896,1.672598
25,624,535104120,-0.890963,1.106077,0.018577,-0.083765,0.385757,-0.339681,-1.088633,-0.570607,...,0.254278,-0.671564,0.020464,-0.403994,-0.112734,4.637239,-0.055301,-0.080778,-0.816151,0.921904
29,1957,535450190,-0.170231,0.068691,-0.241223,-0.788218,1.281202,-0.737224,1.017913,-0.588264,...,0.474858,-0.671564,-0.719684,-0.403994,-0.112734,-0.293628,-0.055301,5.081370,0.227085,-0.579483
30,380,527355150,-0.890963,0.717057,-0.082174,1.325139,-0.509688,0.322891,-0.152390,-0.588264,...,0.311821,2.740305,1.092924,-0.403994,-0.112734,-0.293628,-0.055301,-0.080778,0.227085,0.921904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043,2011,903227140,0.310256,-0.255492,-0.454944,-0.083765,0.385757,-1.068509,-1.603566,-0.588264,...,-1.246628,-0.671564,-0.719684,-0.403994,-0.112734,-0.293628,-0.055301,-0.080778,-1.511641,-0.579483
2044,1140,531382090,0.070013,0.230783,-0.200943,-0.083765,-0.509688,0.786691,0.502979,-0.364610,...,0.230302,-0.671564,0.337670,-0.403994,-0.112734,-0.293628,-0.055301,-0.080778,-0.816151,0.171211
2045,1051,528102030,-0.890963,1.235750,0.212314,1.325139,-0.509688,1.217362,1.111537,1.919010,...,1.424313,-0.671564,0.277250,-0.403994,13.547790,-0.293628,-0.055301,-0.080778,1.618066,0.171211
2046,1587,921126030,-0.890963,0.684639,0.109285,1.325139,-0.509688,1.184233,1.064725,-0.588264,...,0.206326,-0.671564,3.449315,-0.403994,-0.112734,-0.293628,-0.055301,-0.080778,-1.859386,0.171211


# Categorical Features

#### Ordinal Features

In [24]:
# Categorical columns that are treated as ordered scales and encoded as integers
# further down.
ordinal_columns = ['alley','utilities','land_slope','exter_qual',
       'bsmt_qual', 'bsmt_cond', 'bsmt_exposure','bsmtfin_type_1',
        'bsmtfin_type_2', 'heating_qc','electrical', 'kitchen_qual', 
        'functional','fireplace_qu', 'garage_finish', 'garage_qual',
        'garage_cond', 'paved_drive', 'pool_qc', 'fence'] 

# Take an explicit copy: this frame is written to in place later (NA filling),
# and assigning into a bare selection of categorical_train_df raises pandas'
# SettingWithCopyWarning.
ordinal_train_df = categorical_train_df[ordinal_columns].copy()
ordinal_train_df.sort_index()

Unnamed: 0,alley,utilities,land_slope,exter_qual,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_type_2,heating_qc,electrical,kitchen_qual,functional,fireplace_qu,garage_finish,garage_qual,garage_cond,paved_drive,pool_qc,fence
2,,AllPub,Gtl,TA,TA,TA,No,GLQ,Unf,TA,SBrkr,Gd,Typ,,Unf,TA,TA,Y,,
23,,AllPub,Gtl,TA,Gd,TA,No,ALQ,Unf,Ex,SBrkr,TA,Typ,,Fin,TA,TA,Y,,GdPrv
25,,AllPub,Gtl,TA,TA,TA,No,Rec,Unf,Ex,SBrkr,TA,Typ,TA,RFn,TA,TA,Y,,GdPrv
29,,AllPub,Gtl,TA,TA,TA,Mn,Unf,Unf,Ex,SBrkr,TA,Typ,Po,Unf,TA,TA,Y,,MnPrv
30,,AllPub,Gtl,Gd,Gd,TA,No,ALQ,Unf,TA,SBrkr,Gd,Typ,Gd,Fin,TA,TA,Y,,MnPrv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043,,AllPub,Gtl,TA,TA,TA,Mn,Rec,Unf,Ex,SBrkr,TA,Typ,TA,Unf,TA,TA,Y,,GdWo
2044,,AllPub,Gtl,Gd,Gd,TA,No,GLQ,Unf,Ex,SBrkr,Gd,Typ,,RFn,TA,TA,Y,,
2045,,AllPub,Gtl,Ex,Ex,TA,Av,GLQ,Unf,Ex,SBrkr,Ex,Typ,Gd,Fin,TA,TA,Y,,
2046,,AllPub,Gtl,Gd,Gd,TA,Av,GLQ,Unf,Ex,SBrkr,Gd,Typ,Gd,Fin,TA,TA,Y,,


In [25]:
# Count missing values per ordinal column, largest first, and show only the
# columns that actually have gaps.
missing_per_ordinal = ordinal_train_df.isna().sum()
ordinal_null = missing_per_ordinal.sort_values(ascending=False)
ordinal_has_missing = ordinal_null > 0
ordinal_null[ordinal_has_missing]

pool_qc           511
alley             484
fence             426
fireplace_qu      235
garage_qual        26
garage_cond        26
garage_finish      26
bsmt_cond          15
bsmt_exposure      15
bsmtfin_type_1     15
bsmtfin_type_2     15
bsmt_qual          15
dtype: int64

#### Ordinal categories that could mean 0 for NA

In [26]:
# Every ordinal column with missing values describes something the house may not
# have, so NA is read as "absent" and will be filled with 0:
#   pool_qc        - pool quality
#   alley          - type of alley access to the property (NA = no access)
#   fence          - fence quality
#   fireplace_qu   - fireplace quality
#   garage_qual    - garage quality
#   garage_cond    - garage condition
#   garage_finish  - interior finish of the garage
#   bsmt_exposure  - refers to walkout or garden level walls
#   bsmtfin_type_2 - rating of basement finished area (if multiple types)
#   bsmt_cond      - general condition of the basement
#   bsmtfin_type_1 - rating of basement finished area
#   bsmt_qual      - height of the basement
# The list is derived from the null counts, so it covers whichever ordinal
# columns have at least one missing value.
ordinal_na_0_columns = ordinal_null[ordinal_null > 0].index

In [27]:
# Fill the missing entries of the listed ordinal columns with 0; na_0_filler writes
# into ordinal_train_df itself and returns it, which is what gets displayed here.
na_0_filler(ordinal_train_df,ordinal_na_0_columns)

Unnamed: 0,alley,utilities,land_slope,exter_qual,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_type_2,heating_qc,electrical,kitchen_qual,functional,fireplace_qu,garage_finish,garage_qual,garage_cond,paved_drive,pool_qc,fence
2,0,AllPub,Gtl,TA,TA,TA,No,GLQ,Unf,TA,SBrkr,Gd,Typ,0,Unf,TA,TA,Y,0,0
23,0,AllPub,Gtl,TA,Gd,TA,No,ALQ,Unf,Ex,SBrkr,TA,Typ,0,Fin,TA,TA,Y,0,GdPrv
25,0,AllPub,Gtl,TA,TA,TA,No,Rec,Unf,Ex,SBrkr,TA,Typ,TA,RFn,TA,TA,Y,0,GdPrv
29,0,AllPub,Gtl,TA,TA,TA,Mn,Unf,Unf,Ex,SBrkr,TA,Typ,Po,Unf,TA,TA,Y,0,MnPrv
30,0,AllPub,Gtl,Gd,Gd,TA,No,ALQ,Unf,TA,SBrkr,Gd,Typ,Gd,Fin,TA,TA,Y,0,MnPrv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043,0,AllPub,Gtl,TA,TA,TA,Mn,Rec,Unf,Ex,SBrkr,TA,Typ,TA,Unf,TA,TA,Y,0,GdWo
2044,0,AllPub,Gtl,Gd,Gd,TA,No,GLQ,Unf,Ex,SBrkr,Gd,Typ,0,RFn,TA,TA,Y,0,0
2045,0,AllPub,Gtl,Ex,Ex,TA,Av,GLQ,Unf,Ex,SBrkr,Ex,Typ,Gd,Fin,TA,TA,Y,0,0
2046,0,AllPub,Gtl,Gd,Gd,TA,Av,GLQ,Unf,Ex,SBrkr,Gd,Typ,Gd,Fin,TA,TA,Y,0,0


## Nominal Features

In [28]:
# Every categorical column that is not in the ordinal list is treated as nominal.
nominal_train_df = categorical_train_df.drop(ordinal_columns, axis=1)
nominal_train_df.head()

Unnamed: 0,ms_zoning,street,lot_shape,land_contour,lot_config,neighborhood,condition_1,condition_2,bldg_type,house_style,...,exterior_1st,exterior_2nd,mas_vnr_type,exter_cond,foundation,heating,central_air,garage_type,misc_feature,sale_type
2,RL,Pave,Reg,Lvl,Inside,NAmes,Norm,Norm,1Fam,1Story,...,VinylSd,VinylSd,,Gd,CBlock,GasA,Y,Detchd,,WD
23,RL,Pave,IR1,Lvl,Inside,Gilbert,Norm,Norm,1Fam,1Story,...,HdBoard,HdBoard,,Gd,PConc,GasA,Y,Attchd,Shed,WD
25,RL,Pave,IR1,Lvl,Corner,NAmes,Norm,Norm,1Fam,1Story,...,MetalSd,MetalSd,BrkFace,TA,CBlock,GasA,Y,Attchd,,WD
29,RL,Pave,Reg,Lvl,Corner,NAmes,Artery,Norm,1Fam,1.5Fin,...,MetalSd,MetalSd,,Gd,CBlock,GasA,Y,Detchd,Shed,WD
30,RL,Pave,Reg,Lvl,Inside,NWAmes,PosN,Norm,1Fam,1Story,...,BrkFace,BrkFace,,TA,PConc,GasA,Y,Attchd,,WD


In [29]:
# Count missing values per nominal column, largest first, and show only the
# columns that actually have gaps.
missing_per_nominal = nominal_train_df.isna().sum()
nominal_null = missing_per_nominal.sort_values(ascending=False)
nominal_has_missing = nominal_null > 0
nominal_null[nominal_has_missing]

misc_feature    502
garage_type      25
mas_vnr_type      4
dtype: int64

#### Nominal categories that could mean 0 for NA

In [30]:
# Nominal columns with missing values; NA is read as "feature absent" and will be
# filled with 0:
#   misc_feature - miscellaneous feature not covered in other categories
#   garage_type  - garage location
#   mas_vnr_type - masonry veneer type (could be "none" but was entered as NA)
# The list is derived from the null counts, so it covers whichever nominal
# columns have at least one missing value.
nominal_na_0_columns = nominal_null[nominal_null > 0].index

In [31]:
# Fill the missing entries of the listed nominal columns with 0; na_0_filler writes
# into nominal_train_df itself and returns it, which is what gets displayed here.
na_0_filler(nominal_train_df,nominal_na_0_columns)

Unnamed: 0,ms_zoning,street,lot_shape,land_contour,lot_config,neighborhood,condition_1,condition_2,bldg_type,house_style,...,exterior_1st,exterior_2nd,mas_vnr_type,exter_cond,foundation,heating,central_air,garage_type,misc_feature,sale_type
2,RL,Pave,Reg,Lvl,Inside,NAmes,Norm,Norm,1Fam,1Story,...,VinylSd,VinylSd,,Gd,CBlock,GasA,Y,Detchd,0,WD
23,RL,Pave,IR1,Lvl,Inside,Gilbert,Norm,Norm,1Fam,1Story,...,HdBoard,HdBoard,,Gd,PConc,GasA,Y,Attchd,Shed,WD
25,RL,Pave,IR1,Lvl,Corner,NAmes,Norm,Norm,1Fam,1Story,...,MetalSd,MetalSd,BrkFace,TA,CBlock,GasA,Y,Attchd,0,WD
29,RL,Pave,Reg,Lvl,Corner,NAmes,Artery,Norm,1Fam,1.5Fin,...,MetalSd,MetalSd,,Gd,CBlock,GasA,Y,Detchd,Shed,WD
30,RL,Pave,Reg,Lvl,Inside,NWAmes,PosN,Norm,1Fam,1Story,...,BrkFace,BrkFace,,TA,PConc,GasA,Y,Attchd,0,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043,RM,Pave,Reg,Lvl,Inside,BrkSide,Norm,Norm,1Fam,2Story,...,MetalSd,VinylSd,,TA,CBlock,GasA,Y,Attchd,0,WD
2044,RL,Pave,IR1,Lvl,Inside,SawyerW,Norm,Norm,1Fam,2Story,...,VinylSd,VinylSd,BrkFace,TA,PConc,GasA,Y,Attchd,0,WD
2045,RL,Pave,Reg,Lvl,FR2,NridgHt,Norm,Norm,1Fam,1Story,...,VinylSd,VinylSd,Stone,TA,PConc,GasA,Y,Attchd,0,New
2046,RL,Pave,IR1,HLS,Inside,Timber,Norm,Norm,1Fam,1Story,...,VinylSd,VinylSd,,TA,PConc,GasA,Y,Attchd,0,WD


## Encoding Ordinal Features

### Ordinal Dictionary

In [32]:
# Print the ordinal column labels in frame order.
# NOTE(review): presumably a reference for ordering the tables in ordinal_dict
# below, which are matched to these columns by position.
ordinal_col_list = ordinal_train_df.columns
print(ordinal_col_list)

Index(['alley', 'utilities', 'land_slope', 'exter_qual', 'bsmt_qual',
       'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_type_2',
       'heating_qc', 'electrical', 'kitchen_qual', 'functional',
       'fireplace_qu', 'garage_finish', 'garage_qual', 'garage_cond',
       'paved_drive', 'pool_qc', 'fence'],
      dtype='object')


In [33]:
# Look-up tables for the ordinal columns, one per column and in the same order as
# the columns of ordinal_train_df (they are paired up by position afterwards).
# The integer key 0 is the placeholder that was written in place of missing values.

# Five-step quality scale shared by several columns.
_quality = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1}
# The same scale for columns that can also hold the 0 placeholder.
_quality_or_absent = {**_quality, 0:0}
# Rating scale used by both basement finish columns.
_bsmt_finish = {'GLQ':6,'ALQ':5,'BLQ':4,'Rec':3,'LwQ':2,'Unf':1,0:0}

ordinal_dict = [
    {'Grvl':2,'Pave':1,0:0},                                                  # alley
    {'AllPub':4,'NoSewr':3,'NoSeWa':2,'ELO':1},                               # utilities
    {'Gtl':3,'Mod':2,'Sev':1},                                                # land_slope
    dict(_quality),                                                           # exter_qual
    dict(_quality_or_absent),                                                 # bsmt_qual
    dict(_quality_or_absent),                                                 # bsmt_cond
    {'Gd':4,'Av':3,'Mn':2,'No':1,0:0},                                        # bsmt_exposure
    dict(_bsmt_finish),                                                       # bsmtfin_type_1
    dict(_bsmt_finish),                                                       # bsmtfin_type_2
    dict(_quality),                                                           # heating_qc
    {'SBrkr':5,'FuseA':4,'FuseF':3,'FuseP':2,'Mix':1},                        # electrical
    dict(_quality),                                                           # kitchen_qual
    {'Typ':8,'Min1':7,'Min2':6,'Mod':5,'Maj1':4,'Maj2':3,'Sev':2,'Sal':1},    # functional
    dict(_quality_or_absent),                                                 # fireplace_qu
    {'Fin':3,'RFn':2,'Unf':1,0:0},                                            # garage_finish
    dict(_quality_or_absent),                                                 # garage_qual
    dict(_quality_or_absent),                                                 # garage_cond
    {'Y':3,'P':2,'N':1},                                                      # paved_drive
    dict(_quality_or_absent),                                                 # pool_qc
    {'GdPrv':4,'MnPrv':3,'GdWo':2,'MnWw':1,0:0},                              # fence
]

In [34]:
#zip column names to dictionary values
# The tables in ordinal_dict are matched to the columns purely by position, and
# zip() silently stops at the shorter input, so make sure the counts agree first.
if len(ordinal_dict) != len(ordinal_train_df.columns):
    raise ValueError(
        'ordinal_dict has %d tables but ordinal_train_df has %d columns'
        % (len(ordinal_dict), len(ordinal_train_df.columns))
    )
named_ordinal_dict = dict(zip(ordinal_train_df.columns,ordinal_dict))

In [35]:
def ordinal_replace(df,columns,mapping=None):
    """Return a copy of ``df`` with the given columns encoded through look-up tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns. It is left unmodified.
    columns : iterable
        Labels of the columns to encode.
    mapping : dict, optional
        ``{column label: {category value: code}}``. When omitted, the
        notebook-level ``named_ordinal_dict`` is used.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every listed column has been passed through
        ``Series.map`` with its table. Values that have no entry in the table
        come back as NaN.
    """
    if mapping is None:
        mapping = named_ordinal_dict
    # Work on a copy: writing into the caller's frame made the encoding step
    # non-repeatable, because a second run would look up the already-encoded
    # integers (absent from the tables) and turn them into NaN.
    new_df = df.copy()
    for column in columns:
        new_df[column] = new_df[column].map(mapping[column])
    return new_df

In [36]:
#replace each category value with its integer code from named_ordinal_dict
encoded_ordinal_train_df = ordinal_replace(ordinal_train_df, ordinal_train_df.columns)

In [37]:
encoded_ordinal_train_df.head()

Unnamed: 0,alley,utilities,land_slope,exter_qual,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_type_2,heating_qc,electrical,kitchen_qual,functional,fireplace_qu,garage_finish,garage_qual,garage_cond,paved_drive,pool_qc,fence
2,0,4,3,3,3,3,1,6,1,3,5,4,8,0,1,3,3,3,0,0
23,0,4,3,3,4,3,1,5,1,5,5,3,8,0,3,3,3,3,0,4
25,0,4,3,3,3,3,1,3,1,5,5,3,8,3,2,3,3,3,0,4
29,0,4,3,3,3,3,2,1,1,5,5,3,8,1,1,3,3,3,0,3
30,0,4,3,4,4,3,1,5,1,3,5,4,8,4,3,3,3,3,0,3


### Encoding Nominal Features

In [38]:
# One-hot encode the nominal columns; drop_first=True omits the first level of
# every feature.
# NOTE(review): the intent is that the 0 placeholder (former NA) is the level that
# gets dropped - this relies on 0 being ordered first among a column's levels, and
# columns without a 0 lose a real category instead; confirm.
dummies_nominal_train_df = pd.get_dummies(nominal_train_df, drop_first=True)

## Combine all dataframes back to one and save

In [39]:
#numerical, ordinal, and nominal data frames created earlier,
#listed in the left-to-right order they take in the combined frame
dataframes = [scaled_numeric_train_df,encoded_ordinal_train_df,dummies_nominal_train_df]

In [40]:
scaled_numeric_train_df
encoded_ordinal_train_df
dummies_nominal_train_df

Unnamed: 0,ms_zoning_FV,ms_zoning_I (all),ms_zoning_RH,ms_zoning_RL,ms_zoning_RM,street_Pave,lot_shape_IR2,lot_shape_IR3,lot_shape_Reg,land_contour_HLS,...,misc_feature_Gar2,misc_feature_Shed,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
2,0,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
23,0,0,0,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
25,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
29,0,0,0,1,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
30,0,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043,0,0,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2044,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2045,0,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2046,0,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [41]:
# Place the three processed frames side by side; rows are lined up on the index.
cleaned_test_data = pd.concat(dataframes, axis='columns')
cleaned_test_data

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,...,misc_feature_Gar2,misc_feature_Shed,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
2,153,535304180,-0.890963,0.328038,-0.255926,-0.788218,1.281202,-0.604709,1.064725,-0.588264,...,0,0,0,0,0,0,0,0,0,1
23,12,527165230,-0.890963,-1.876406,-0.249921,-0.083765,1.281202,0.687305,1.064725,-0.588264,...,0,1,0,0,0,0,0,0,0,1
25,624,535104120,-0.890963,1.106077,0.018577,-0.083765,0.385757,-0.339681,-1.088633,-0.570607,...,0,0,0,0,0,0,0,0,0,1
29,1957,535450190,-0.170231,0.068691,-0.241223,-0.788218,1.281202,-0.737224,1.017913,-0.588264,...,0,1,0,0,0,0,0,0,0,1
30,380,527355150,-0.890963,0.717057,-0.082174,1.325139,-0.509688,0.322891,-0.152390,-0.588264,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043,2011,903227140,0.310256,-0.255492,-0.454944,-0.083765,0.385757,-1.068509,-1.603566,-0.588264,...,0,0,0,0,0,0,0,0,0,1
2044,1140,531382090,0.070013,0.230783,-0.200943,-0.083765,-0.509688,0.786691,0.502979,-0.364610,...,0,0,0,0,0,0,0,0,0,1
2045,1051,528102030,-0.890963,1.235750,0.212314,1.325139,-0.509688,1.217362,1.111537,1.919010,...,0,0,0,0,0,0,0,1,0,0
2046,1587,921126030,-0.890963,0.684639,0.109285,1.325139,-0.509688,1.184233,1.064725,-0.588264,...,0,0,0,0,0,0,0,0,0,1


In [42]:
# Re-attach the target: join the sale prices onto the cleaned features through the
# shared 'id' column.
final_test_df = pd.merge(left=cleaned_test_data, right=y_test, on='id')
final_test_df

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,...,misc_feature_Shed,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD,saleprice
0,153,535304180,-0.890963,0.328038,-0.255926,-0.788218,1.281202,-0.604709,1.064725,-0.588264,...,0,0,0,0,0,0,0,0,1,109000
1,12,527165230,-0.890963,-1.876406,-0.249921,-0.083765,1.281202,0.687305,1.064725,-0.588264,...,1,0,0,0,0,0,0,0,1,185000
2,624,535104120,-0.890963,1.106077,0.018577,-0.083765,0.385757,-0.339681,-1.088633,-0.570607,...,0,0,0,0,0,0,0,0,1,187500
3,1957,535450190,-0.170231,0.068691,-0.241223,-0.788218,1.281202,-0.737224,1.017913,-0.588264,...,1,0,0,0,0,0,0,0,1,122900
4,380,527355150,-0.890963,0.717057,-0.082174,1.325139,-0.509688,0.322891,-0.152390,-0.588264,...,0,0,0,0,0,0,0,0,1,278000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,2011,903227140,0.310256,-0.255492,-0.454944,-0.083765,0.385757,-1.068509,-1.603566,-0.588264,...,0,0,0,0,0,0,0,0,1,128000
509,1140,531382090,0.070013,0.230783,-0.200943,-0.083765,-0.509688,0.786691,0.502979,-0.364610,...,0,0,0,0,0,0,0,0,1,182000
510,1051,528102030,-0.890963,1.235750,0.212314,1.325139,-0.509688,1.217362,1.111537,1.919010,...,0,0,0,0,0,0,1,0,0,394617
511,1587,921126030,-0.890963,0.684639,0.109285,1.325139,-0.509688,1.184233,1.064725,-0.588264,...,0,0,0,0,0,0,0,0,1,298751


In [43]:
# Save the processed hold-out split; index=False keeps the row index out of the file.
final_test_df.to_csv('datasets/final_test_df.csv',index=False)