In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/test.csv")
df.head(15)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
5,1466,60,RL,75.0,10000,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal
6,1467,20,RL,,7980,Pave,,IR1,Lvl,AllPub,...,0,0,,GdPrv,Shed,500,3,2010,WD,Normal
7,1468,60,RL,63.0,8402,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2010,WD,Normal
8,1469,20,RL,85.0,10176,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2010,WD,Normal
9,1470,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,4,2010,WD,Normal


In [2]:
print(df.shape)
print(pd.DataFrame({
    'Missing': df.isnull().sum(),
    'Percent': (df.isnull().sum() / len(df) * 100).round(2)
}).sort_values('Missing', ascending=False))

(1459, 80)
               Missing  Percent
PoolQC            1456    99.79
MiscFeature       1408    96.50
Alley             1352    92.67
Fence             1169    80.12
MasVnrType         894    61.27
...                ...      ...
EnclosedPorch        0     0.00
MiscVal              0     0.00
MoSold               0     0.00
YrSold               0     0.00
SaleCondition        0     0.00

[80 rows x 2 columns]


In [3]:
df['PoolQC'].unique()

array([nan, 'Ex', 'Gd'], dtype=object)

In [4]:
df['PoolQC'] = df['PoolQC'].fillna('None')
pool_mapping = {'None': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
df['PoolQC'] = df['PoolQC'].map(pool_mapping)
print(pd.DataFrame({
    'Missing': df.isnull().sum(),
    'Percent': (df.isnull().sum() / len(df) * 100).round(2)
}).sort_values('Missing', ascending=False))

               Missing  Percent
MiscFeature       1408    96.50
Alley             1352    92.67
Fence             1169    80.12
MasVnrType         894    61.27
FireplaceQu        730    50.03
...                ...      ...
PoolQC               0     0.00
MiscVal              0     0.00
MoSold               0     0.00
YrSold               0     0.00
SaleCondition        0     0.00

[80 rows x 2 columns]


In [5]:
df['MiscFeature'].unique()

array([nan, 'Gar2', 'Shed', 'Othr'], dtype=object)

In [6]:
df['Alley'].unique()

array([nan, 'Pave', 'Grvl'], dtype=object)

In [7]:
df['Fence'].unique()

array(['MnPrv', nan, 'GdPrv', 'GdWo', 'MnWw'], dtype=object)

In [8]:
df['MasVnrType'].unique()

array([nan, 'BrkFace', 'Stone', 'BrkCmn'], dtype=object)

In [9]:
df['FireplaceQu'] = df['FireplaceQu'].fillna('None')
df['FireplaceQu'].unique()

array(['None', 'TA', 'Gd', 'Po', 'Fa', 'Ex'], dtype=object)

In [10]:
fp_mapping = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
df['FireplaceQu'] = df['FireplaceQu'].map(fp_mapping)

In [11]:
# one-hot encoding for nominal categorical variables
df = pd.get_dummies(df, columns=['MiscFeature'], prefix='MiscFeature', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['Alley'], prefix='Alley', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['Fence'], prefix='Fence', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['MasVnrType'], prefix='MasVnrType', dummy_na=False, dtype=int)

df.head(50)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,MiscFeature_Shed,Alley_Grvl,Alley_Pave,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_Stone
0,1461,20,RH,80.0,11622,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,1,0,0,0,0
1,1462,20,RL,81.0,14267,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,0,0,0,0,1,0
2,1463,60,RL,74.0,13830,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,1,0,0,0,0
3,1464,60,RL,78.0,9978,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,0,0,0,1,0
4,1465,120,RL,43.0,5005,Pave,IR1,HLS,AllPub,Inside,...,0,0,0,0,0,0,0,0,0,0
5,1466,60,RL,75.0,10000,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,0,0,0,0,0,0
6,1467,20,RL,,7980,Pave,IR1,Lvl,AllPub,Inside,...,1,0,0,1,0,0,0,0,0,0
7,1468,60,RL,63.0,8402,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,0,0,0,0,0
8,1469,20,RL,85.0,10176,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,0,0,0,0
9,1470,20,RL,70.0,8400,Pave,Reg,Lvl,AllPub,Corner,...,0,0,0,0,0,1,0,0,0,0


In [12]:
print(pd.DataFrame({
    'Missing': df.isnull().sum(),
    'Percent': (df.isnull().sum() / len(df) * 100).round(2)
}).sort_values('Missing', ascending=False))

                    Missing  Percent
LotFrontage             227    15.56
GarageFinish             78     5.35
GarageCond               78     5.35
GarageQual               78     5.35
GarageYrBlt              78     5.35
...                     ...      ...
Fence_MnPrv               0     0.00
Fence_MnWw                0     0.00
MasVnrType_BrkCmn         0     0.00
MasVnrType_BrkFace        0     0.00
MasVnrType_Stone          0     0.00

[88 rows x 2 columns]


In [13]:
df['LotFrontage'] = df['LotFrontage'].fillna(0)
df['GarageType'].unique()

array(['Attchd', 'Detchd', 'BuiltIn', nan, 'Basment', '2Types', 'CarPort'],
      dtype=object)

In [14]:
df = pd.get_dummies(df, columns=['GarageType'], prefix='GarageType', dummy_na=False, dtype=int)

df['GarageFinish'] = df['GarageFinish'].fillna('None')
gar_fin_mapping = {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
df['GarageFinish'] = df['GarageFinish'].map(gar_fin_mapping)

df['GarageQual'] = df['GarageQual'].fillna('None')
df['GarageCond'] = df['GarageCond'].fillna('None')
df['GarageQual'] = df['GarageQual'].map(fp_mapping)
df['GarageCond'] = df['GarageCond'].map(fp_mapping)

df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])

In [15]:
print(pd.DataFrame({
    'Missing': df.isnull().sum(),
    'Percent': (df.isnull().sum() / len(df) * 100).round(2)
}).sort_values('Missing', ascending=False))

                    Missing  Percent
BsmtCond                 45     3.08
BsmtExposure             44     3.02
BsmtQual                 44     3.02
BsmtFinType2             42     2.88
BsmtFinType1             42     2.88
...                     ...      ...
GarageType_Attchd         0     0.00
GarageType_Basment        0     0.00
GarageType_BuiltIn        0     0.00
GarageType_CarPort        0     0.00
GarageType_Detchd         0     0.00

[93 rows x 2 columns]


In [16]:
df['BsmtExposure'] = df['BsmtExposure'].fillna('None')
bexp_mapping = {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
df['BsmtExposure'] = df['BsmtExposure'].map(bexp_mapping)

df['BsmtFinType1'] = df['BsmtFinType1'].fillna('None')
df['BsmtFinType2'] = df['BsmtFinType2'].fillna('None')
bfin_mapping = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ':6}
df['BsmtFinType1'] = df['BsmtFinType1'].map(bfin_mapping)
df['BsmtFinType2'] = df['BsmtFinType2'].map(bfin_mapping)

df['BsmtQual'] = df['BsmtQual'].fillna('None')
df['BsmtCond'] = df['BsmtCond'].fillna('None')
df['BsmtQual'] = df['BsmtQual'].map(fp_mapping)
df['BsmtCond'] = df['BsmtCond'].map(fp_mapping)

In [17]:
print(pd.DataFrame({
    'Missing': df.isnull().sum(),
    'Percent': (df.isnull().sum() / len(df) * 100).round(2)
}).sort_values('Missing', ascending=False))

                    Missing  Percent
MasVnrArea               15     1.03
MSZoning                  4     0.27
Utilities                 2     0.14
Functional                2     0.14
BsmtHalfBath              2     0.14
...                     ...      ...
GarageType_Attchd         0     0.00
GarageType_Basment        0     0.00
GarageType_BuiltIn        0     0.00
GarageType_CarPort        0     0.00
GarageType_Detchd         0     0.00

[93 rows x 2 columns]


In [18]:
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

In [19]:
print(df.dtypes.to_string())

Id                      int64
MSSubClass              int64
MSZoning               object
LotFrontage           float64
LotArea                 int64
Street                 object
LotShape               object
LandContour            object
Utilities              object
LotConfig              object
LandSlope              object
Neighborhood           object
Condition1             object
Condition2             object
BldgType               object
HouseStyle             object
OverallQual             int64
OverallCond             int64
YearBuilt               int64
YearRemodAdd            int64
RoofStyle              object
RoofMatl               object
Exterior1st            object
Exterior2nd            object
MasVnrArea            float64
ExterQual              object
ExterCond              object
Foundation             object
BsmtQual                int64
BsmtCond                int64
BsmtExposure            int64
BsmtFinType1            int64
BsmtFinSF1            float64
BsmtFinTyp

In [20]:
print(df['Electrical'].unique())

['SBrkr' 'FuseA' 'FuseF' 'FuseP']


In [21]:
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])
electrical_mapping = {
    'FuseP': 1,
    'FuseF': 2,  
    'Mix': 3,    
    'FuseA': 4, 
    'SBrkr': 5  
}
df['Electrical'] = df['Electrical'].map(electrical_mapping)

In [22]:
df = pd.get_dummies(df, columns=['MSZoning'], prefix='MSZoning', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['Street'], prefix='Street', dummy_na=False, dtype=int)

lot_shape_mapping = {
    'IR3': 1, 
    'IR2': 2, 
    'IR1': 3,  
    'Reg': 4  
}
df['LotShape'] = df['LotShape'].map(lot_shape_mapping)

df = pd.get_dummies(df, columns=['LandContour'], prefix='LandContour', dummy_na=False, dtype=int)

utilities_mapping = {
    'ELO': 1,     
    'NoSeWa': 2,
    'NoSewr': 3,  
    'AllPub': 4  
}
df['Utilities'] = df['Utilities'].map(utilities_mapping)

df = pd.get_dummies(df, columns=['LotConfig'], prefix='LotConfig', dummy_na=False, dtype=int)

slope_mapping = {
    'Sev': 1,     
    'Mod': 2,
    'Gtl': 3
}
df['LandSlope'] = df['LandSlope'].map(slope_mapping)

df = pd.get_dummies(df, columns=['Neighborhood'], prefix='Neighborhood', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['Condition1'], prefix='Condition1', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['Condition2'], prefix='Condition2', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['BldgType'], prefix='BldgType', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['HouseStyle'], prefix='HouseStyle', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['RoofStyle'], prefix='RoofStyle', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['RoofMatl'], prefix='RoofMatl', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['Exterior1st'], prefix='Exterior1st', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['Exterior2nd'], prefix='Exterior2nd', dummy_na=False, dtype=int)

df['ExterQual'] = df['ExterQual'].map(fp_mapping)
df['ExterCond'] = df['ExterCond'].map(fp_mapping)

df = pd.get_dummies(df, columns=['Foundation'], prefix='Foundation', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['Heating'], prefix='Heating', dummy_na=False, dtype=int)

df['HeatingQC'] = df['HeatingQC'].map(fp_mapping)

air_mapping = {
    'N': 0,     
    'Y': 1
}
df['CentralAir'] = df['CentralAir'].map(air_mapping)

df['KitchenQual'] = df['KitchenQual'].map(fp_mapping)

functional_mapping = {
    'Sal': 1, 
    'Sev': 2,  
    'Maj2': 3,
    'Maj1': 4,  
    'Mod': 5,   
    'Min2': 6, 
    'Min1': 7, 
    'Typ': 8  
}
df['Functional'] = df['Functional'].map(functional_mapping)

pave_mapping = {
    'N': 0,  
    'P': 1,
    'Y': 2
}
df['PavedDrive'] = df['PavedDrive'].map(pave_mapping)

df = pd.get_dummies(df, columns=['SaleType'], prefix='SaleType', dummy_na=False, dtype=int)
df = pd.get_dummies(df, columns=['SaleCondition'], prefix='SaleCondition', dummy_na=False, dtype=int)

In [23]:
print(df.dtypes.to_string())

Id                         int64
MSSubClass                 int64
LotFrontage              float64
LotArea                    int64
LotShape                   int64
Utilities                float64
LandSlope                  int64
OverallQual                int64
OverallCond                int64
YearBuilt                  int64
YearRemodAdd               int64
MasVnrArea               float64
ExterQual                  int64
ExterCond                  int64
BsmtQual                   int64
BsmtCond                   int64
BsmtExposure               int64
BsmtFinType1               int64
BsmtFinSF1               float64
BsmtFinType2               int64
BsmtFinSF2               float64
BsmtUnfSF                float64
TotalBsmtSF              float64
HeatingQC                  int64
CentralAir                 int64
Electrical                 int64
1stFlrSF                   int64
2ndFlrSF                   int64
LowQualFinSF               int64
GrLivArea                  int64
BsmtFullBa

In [24]:
print(pd.DataFrame({
    'Missing': df.isnull().sum(),
    'Percent': (df.isnull().sum() / len(df) * 100).round(2)
}).sort_values('Missing', ascending=False))

                       Missing  Percent
Utilities                    2     0.14
BsmtHalfBath                 2     0.14
Functional                   2     0.14
BsmtFullBath                 2     0.14
BsmtFinSF2                   1     0.07
...                        ...      ...
SaleCondition_AdjLand        0     0.00
SaleCondition_Alloca         0     0.00
SaleCondition_Family         0     0.00
SaleCondition_Normal         0     0.00
SaleCondition_Partial        0     0.00

[206 rows x 2 columns]


In [25]:
df.to_csv('test_imputed.csv', index=False) 