In [1]:
import pandas as pd
import numpy as np

# Load the raw training table and preview its first 15 rows.
train_csv_path = "../data/train.csv"
df = pd.read_csv(train_csv_path)
df.head(15)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [2]:
# Frame dimensions, followed by a per-column missing-value summary
# (count and percentage of rows), most incomplete columns first.
print(df.shape)
null_counts = df.isnull().sum()
missing_report = pd.DataFrame({
    'Missing': null_counts,
    'Percent': (null_counts / len(df) * 100).round(2),
})
print(missing_report.sort_values('Missing', ascending=False))

(1460, 81)
               Missing  Percent
PoolQC            1453    99.52
MiscFeature       1406    96.30
Alley             1369    93.77
Fence             1179    80.75
MasVnrType         872    59.73
...                ...      ...
MoSold               0     0.00
YrSold               0     0.00
SaleType             0     0.00
SaleCondition        0     0.00
SalePrice            0     0.00

[81 rows x 2 columns]


In [3]:
# Distinct PoolQC labels (including NaN) before encoding.
pd.unique(df['PoolQC'])

array([nan, 'Ex', 'Fa', 'Gd'], dtype=object)

In [4]:
# Encode PoolQC as an integer code: missing values become the label 'None'
# (code 0); the remaining labels use the codes listed in pool_mapping.
pool_mapping = {
    'None': 0,
    'Fa': 1,
    'TA': 2,
    'Gd': 3,
    'Ex': 4,
}
df['PoolQC'] = df['PoolQC'].fillna('None').map(pool_mapping)

# Re-check which columns still contain missing values.
null_counts = df.isnull().sum()
missing_report = pd.DataFrame({
    'Missing': null_counts,
    'Percent': (null_counts / len(df) * 100).round(2),
})
print(missing_report.sort_values('Missing', ascending=False))

               Missing  Percent
MiscFeature       1406    96.30
Alley             1369    93.77
Fence             1179    80.75
MasVnrType         872    59.73
FireplaceQu        690    47.26
...                ...      ...
MoSold               0     0.00
YrSold               0     0.00
SaleType             0     0.00
SaleCondition        0     0.00
SalePrice            0     0.00

[81 rows x 2 columns]


In [5]:
# Distinct MiscFeature labels (including NaN).
pd.unique(df['MiscFeature'])

array([nan, 'Shed', 'Gar2', 'Othr', 'TenC'], dtype=object)

In [6]:
# Distinct Alley labels (including NaN).
pd.unique(df['Alley'])

array([nan, 'Grvl', 'Pave'], dtype=object)

In [7]:
# Distinct Fence labels (including NaN).
pd.unique(df['Fence'])

array([nan, 'MnPrv', 'GdWo', 'GdPrv', 'MnWw'], dtype=object)

In [8]:
# Distinct MasVnrType labels (including NaN).
pd.unique(df['MasVnrType'])

array(['BrkFace', nan, 'Stone', 'BrkCmn'], dtype=object)

In [9]:
# Replace missing FireplaceQu entries with the explicit label 'None',
# then list the labels that are present.
fireplace_filled = df['FireplaceQu'].fillna('None')
df['FireplaceQu'] = fireplace_filled
pd.unique(df['FireplaceQu'])

array(['None', 'TA', 'Gd', 'Fa', 'Ex', 'Po'], dtype=object)

In [10]:
# Label -> integer code table for the None/Po/Fa/TA/Gd/Ex scale.
# Later cells reuse fp_mapping for other columns with the same labels.
fp_mapping = {
    'None': 0,
    'Po': 1,
    'Fa': 2,
    'TA': 3,
    'Gd': 4,
    'Ex': 5,
}
df = df.assign(FireplaceQu=df['FireplaceQu'].map(fp_mapping))

In [11]:
# One-hot encode the nominal categorical columns in a single pass.
# dummy_na=False adds no NaN indicator, so a row whose value is missing
# gets 0 in every indicator column generated for that feature.
sparse_nominal_columns = ['MiscFeature', 'Alley', 'Fence', 'MasVnrType']
df = pd.get_dummies(
    df,
    columns=sparse_nominal_columns,
    prefix=sparse_nominal_columns,
    dummy_na=False,
    dtype=int,
)

df.head(50)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,MiscFeature_TenC,Alley_Grvl,Alley_Pave,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_Stone
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,0,0,1,0
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,0,0,0,0,0
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,0,0,0,1,0
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,0,0,0,0,0,0
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,0,0,0,1,0
5,6,50,RL,85.0,14115,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,1,0,0,0,0
6,7,20,RL,75.0,10084,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,0,0,0,1
7,8,60,RL,,10382,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,0,0,0,0,0,1
8,9,50,RM,51.0,6120,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,0,0,0,0
9,10,190,RL,50.0,7420,Pave,Reg,Lvl,AllPub,Corner,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Per-column missing-value summary (count and percentage of rows),
# most incomplete columns first.
null_counts = df.isnull().sum()
missing_report = pd.DataFrame({
    'Missing': null_counts,
    'Percent': (null_counts / len(df) * 100).round(2),
})
print(missing_report.sort_values('Missing', ascending=False))

                    Missing  Percent
LotFrontage             259    17.74
GarageType               81     5.55
GarageFinish             81     5.55
GarageQual               81     5.55
GarageYrBlt              81     5.55
...                     ...      ...
Fence_MnPrv               0     0.00
Fence_MnWw                0     0.00
MasVnrType_BrkCmn         0     0.00
MasVnrType_BrkFace        0     0.00
MasVnrType_Stone          0     0.00

[90 rows x 2 columns]


In [13]:
# Missing LotFrontage values are replaced with 0.
# NOTE(review): this treats an unrecorded frontage as zero; confirm that
# 0 (rather than e.g. a median) is the intended imputation for this column.
df = df.assign(LotFrontage=df['LotFrontage'].fillna(0))

# Distinct GarageType labels (including NaN) before encoding.
pd.unique(df['GarageType'])

array(['Attchd', 'Detchd', 'BuiltIn', 'CarPort', nan, 'Basment', '2Types'],
      dtype=object)

In [14]:
# GarageType -> indicator columns; rows with a missing GarageType get 0
# in every indicator because dummy_na=False.
df = pd.get_dummies(
    df,
    columns=['GarageType'],
    prefix='GarageType',
    dummy_na=False,
    dtype=int,
)

# GarageFinish -> integer code, with missing values encoded as 0.
gar_fin_mapping = {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
df['GarageFinish'] = df['GarageFinish'].fillna('None').map(gar_fin_mapping)

# GarageQual / GarageCond reuse fp_mapping (defined in an earlier cell),
# again with missing values encoded as 0.
for garage_col in ('GarageQual', 'GarageCond'):
    df[garage_col] = df[garage_col].fillna('None').map(fp_mapping)

# Where the garage year is missing, fall back to the house's YearBuilt.
garage_year = df['GarageYrBlt']
df['GarageYrBlt'] = garage_year.fillna(df['YearBuilt'])

In [15]:
# Per-column missing-value summary (count and percentage of rows),
# most incomplete columns first.
null_counts = df.isnull().sum()
missing_report = pd.DataFrame({
    'Missing': null_counts,
    'Percent': (null_counts / len(df) * 100).round(2),
})
print(missing_report.sort_values('Missing', ascending=False))

                    Missing  Percent
BsmtExposure             38     2.60
BsmtFinType2             38     2.60
BsmtQual                 37     2.53
BsmtFinType1             37     2.53
BsmtCond                 37     2.53
...                     ...      ...
GarageType_Attchd         0     0.00
GarageType_Basment        0     0.00
GarageType_BuiltIn        0     0.00
GarageType_CarPort        0     0.00
GarageType_Detchd         0     0.00

[95 rows x 2 columns]


In [16]:
# Label -> integer code tables for the basement columns.
bexp_mapping = {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
bfin_mapping = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}

# Column -> code table. BsmtQual / BsmtCond reuse fp_mapping, which is
# defined in an earlier cell.
basement_encodings = {
    'BsmtExposure': bexp_mapping,
    'BsmtFinType1': bfin_mapping,
    'BsmtFinType2': bfin_mapping,
    'BsmtQual': fp_mapping,
    'BsmtCond': fp_mapping,
}

# Missing values become the label 'None' (code 0) before mapping.
for bsmt_col, codes in basement_encodings.items():
    df[bsmt_col] = df[bsmt_col].fillna('None').map(codes)

In [17]:
# Per-column missing-value summary (count and percentage of rows),
# most incomplete columns first.
null_counts = df.isnull().sum()
missing_report = pd.DataFrame({
    'Missing': null_counts,
    'Percent': (null_counts / len(df) * 100).round(2),
})
print(missing_report.sort_values('Missing', ascending=False))

                    Missing  Percent
MasVnrArea                8     0.55
Electrical                1     0.07
MSSubClass                0     0.00
LotFrontage               0     0.00
LotArea                   0     0.00
...                     ...      ...
GarageType_Attchd         0     0.00
GarageType_Basment        0     0.00
GarageType_BuiltIn        0     0.00
GarageType_CarPort        0     0.00
GarageType_Detchd         0     0.00

[95 rows x 2 columns]


In [18]:
# Fill the missing MasVnrArea values with 0. One fillna is sufficient:
# after it runs the column has no NaN left, so repeating the statement
# would change nothing.
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

In [19]:
# Render every column's dtype as one string and print it.
dtype_listing = df.dtypes.to_string()
print(dtype_listing)

Id                      int64
MSSubClass              int64
MSZoning               object
LotFrontage           float64
LotArea                 int64
Street                 object
LotShape               object
LandContour            object
Utilities              object
LotConfig              object
LandSlope              object
Neighborhood           object
Condition1             object
Condition2             object
BldgType               object
HouseStyle             object
OverallQual             int64
OverallCond             int64
YearBuilt               int64
YearRemodAdd            int64
RoofStyle              object
RoofMatl               object
Exterior1st            object
Exterior2nd            object
MasVnrArea            float64
ExterQual              object
ExterCond              object
Foundation             object
BsmtQual                int64
BsmtCond                int64
BsmtExposure            int64
BsmtFinType1            int64
BsmtFinSF1              int64
BsmtFinTyp

In [20]:
# Distinct Electrical labels (including NaN) before encoding.
electrical_labels = pd.unique(df['Electrical'])
print(electrical_labels)

['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' nan]


In [21]:
# Label -> integer code table for Electrical.
electrical_mapping = {'FuseP': 1, 'FuseF': 2, 'Mix': 3, 'FuseA': 4, 'SBrkr': 5}

# Impute missing values with the most frequent label, then apply the codes.
most_common_electrical = df['Electrical'].mode()[0]
df['Electrical'] = df['Electrical'].fillna(most_common_electrical).map(electrical_mapping)

In [22]:
# --- Label -> integer code tables -------------------------------------
lot_shape_mapping = {'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg': 4}
utilities_mapping = {'ELO': 1, 'NoSeWa': 2, 'NoSewr': 3, 'AllPub': 4}
slope_mapping = {'Sev': 1, 'Mod': 2, 'Gtl': 3}
air_mapping = {'N': 0, 'Y': 1}
functional_mapping = {
    'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4,
    'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8,
}
pave_mapping = {'N': 0, 'P': 1, 'Y': 2}

# Column -> code table. The quality/condition columns reuse fp_mapping,
# which is defined in an earlier cell.
ordinal_encodings = {
    'LotShape': lot_shape_mapping,
    'Utilities': utilities_mapping,
    'LandSlope': slope_mapping,
    'ExterQual': fp_mapping,
    'ExterCond': fp_mapping,
    'HeatingQC': fp_mapping,
    'CentralAir': air_mapping,
    'KitchenQual': fp_mapping,
    'Functional': functional_mapping,
    'PavedDrive': pave_mapping,
}
# Each column is overwritten in place, so column positions do not change.
for ordinal_col, codes in ordinal_encodings.items():
    df[ordinal_col] = df[ordinal_col].map(codes)

# --- Nominal columns -> indicator columns -----------------------------
# A single get_dummies call drops the listed columns and appends their
# indicators at the end of the frame in list order, so this list's order
# determines the order of the generated columns.
nominal_columns = [
    'MSZoning', 'Street', 'LandContour', 'LotConfig', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'Foundation', 'Heating',
    'SaleType', 'SaleCondition',
]
df = pd.get_dummies(
    df,
    columns=nominal_columns,
    prefix=nominal_columns,
    dummy_na=False,
    dtype=int,
)

In [23]:
# Render every column's dtype as one string and print it.
dtype_listing = df.dtypes.to_string()
print(dtype_listing)

Id                         int64
MSSubClass                 int64
LotFrontage              float64
LotArea                    int64
LotShape                   int64
Utilities                  int64
LandSlope                  int64
OverallQual                int64
OverallCond                int64
YearBuilt                  int64
YearRemodAdd               int64
MasVnrArea               float64
ExterQual                  int64
ExterCond                  int64
BsmtQual                   int64
BsmtCond                   int64
BsmtExposure               int64
BsmtFinType1               int64
BsmtFinSF1                 int64
BsmtFinType2               int64
BsmtFinSF2                 int64
BsmtUnfSF                  int64
TotalBsmtSF                int64
HeatingQC                  int64
CentralAir                 int64
Electrical                 int64
1stFlrSF                   int64
2ndFlrSF                   int64
LowQualFinSF               int64
GrLivArea                  int64
BsmtFullBa

In [24]:
# Final per-column missing-value summary (count and percentage of rows),
# most incomplete columns first.
null_counts = df.isnull().sum()
missing_report = pd.DataFrame({
    'Missing': null_counts,
    'Percent': (null_counts / len(df) * 100).round(2),
})
print(missing_report.sort_values('Missing', ascending=False))

                       Missing  Percent
Id                           0      0.0
MSSubClass                   0      0.0
LotFrontage                  0      0.0
LotArea                      0      0.0
LotShape                     0      0.0
...                        ...      ...
SaleCondition_AdjLand        0      0.0
SaleCondition_Alloca         0      0.0
SaleCondition_Family         0      0.0
SaleCondition_Normal         0      0.0
SaleCondition_Partial        0      0.0

[221 rows x 2 columns]


In [25]:
# Write the encoded frame to a CSV in the working directory, without the
# row index.
df.to_csv(path_or_buf='train_cleaned.csv', index=False)