In [2]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# importing data
df=pd.read_csv("data/train.csv")
df.shape

(1451, 183)

In [3]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

## Handling Missing Data

In [None]:
import pandas as pd

def clean_data(df):
    # Replace missing values with the median of each column in: 'LotFrontage'
    df = df.fillna({'LotFrontage': df['LotFrontage'].median()})
    # Change column type to int32 for column: 'LotFrontage'
    df = df.astype({'LotFrontage': 'int32'})
    # One-hot encode column: 'Alley'
    insert_loc = df.columns.get_loc('Alley')
    df = pd.concat([df.iloc[:,:insert_loc], pd.get_dummies(df.loc[:, ['Alley']], dummy_na=True), df.iloc[:,insert_loc+1:]], axis=1)
    df['Alley_Grvl'] = df['Alley_Grvl'].astype(int)
    df['Alley_Pave'] = df['Alley_Grvl'].astype(int)
    df['Alley_nan'] = df['Alley_Grvl'].astype(int)
    # Replace missing values with "None" in column: 'MasVnrType'
    df = df.fillna({'MasVnrType': "None"})
    # Drop rows with missing data in column: 'MasVnrArea'
    df = df.dropna(subset=['MasVnrArea'])
    df['BsmtQual'] = df['BsmtQual'].fillna('NA')
    df['BsmtCond'] = df['BsmtCond'].fillna('NA')
    df['BsmtExposure'] = df['BsmtExposure'].fillna('NA')
    df['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')
    df['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')
    # Drop rows with missing data in column: 'Electrical'
    df = df.dropna(subset=['Electrical'])
    df['FireplaceQu'] = df['FireplaceQu'].fillna('NA')
    # Replace missing values with "NA" in column: 'GarageType'
    df = df.fillna({'GarageType': "NA"})
    # Replace missing values with 0 in column: 'GarageYrBlt'
    df = df.fillna({'GarageYrBlt': 0})
    # Replace missing values with "NA" in column: 'GarageFinish'
    df = df.fillna({'GarageFinish': "NA"})
    # Replace missing values with "NA" in column: 'GarageQual'
    df = df.fillna({'GarageQual': "NA"})
    # Replace missing values with "NA" in column: 'GarageCond'
    df = df.fillna({'GarageCond': "NA"})
    # Replace missing values with "NA" in column: 'PoolQC'
    df = df.fillna({'PoolQC': "NA"})
    # Replace missing values with "NA" in column: 'Fence'
    df = df.fillna({'Fence': "NA"})
    # Replace missing values with "NA" in column: 'MiscFeature'
    df = df.fillna({'MiscFeature': "NA"})
    return df

df = clean_data(df.copy())

In [5]:
df.to_csv("data/train.csv")

## Encoding Data

In [None]:
import pandas as pd

def clean_data(df):
    # One-hot encode column: 'MSZoning'
    insert_loc = df.columns.get_loc('MSZoning')
    df = pd.concat([df.iloc[:,:insert_loc], pd.get_dummies(df.loc[:, ['MSZoning']], dummy_na=True), df.iloc[:,insert_loc+1:]], axis=1)
    # Drop column: 'MSZoning_C (all)'
    df = df.drop(columns=['MSZoning_C (all)'])
    # Change column type to int32 for columns: 'MSZoning_FV', 'MSZoning_RH' and 3 other columns
    df = df.astype({'MSZoning_FV': 'int32', 'MSZoning_RH': 'int32', 'MSZoning_RL': 'int32', 'MSZoning_RM': 'int32', 'MSZoning_nan': 'int32'})
    # One-hot encode column: 'Street'
    insert_loc = df.columns.get_loc('Street')
    df = pd.concat([df.iloc[:,:insert_loc], pd.get_dummies(df.loc[:, ['Street']], dummy_na=True), df.iloc[:,insert_loc+1:]], axis=1)
    # Change column type to int32 for column: 'Street_Pave'
    df = df.astype({'Street_Pave': 'int32'})
    # Change column type to int32 for column: 'Street_Grvl'
    df = df.astype({'Street_Grvl': 'int32'})
    # Drop column: 'Street_nan'
    df = df.drop(columns=['Street_nan'])
    df['LotShape'] = df['LotShape'].map({'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3})
    # One-hot encode columns: 'Utilities', 'LotConfig'
    for column in ['Utilities', 'LotConfig']:
        insert_loc = df.columns.get_loc(column)
        df = pd.concat([df.iloc[:,:insert_loc], pd.get_dummies(df.loc[:, [column]], dummy_na=True), df.iloc[:,insert_loc+1:]], axis=1)
    # Change column type to int32 for column: 'Utilities_AllPub'
    df = df.astype({'Utilities_AllPub': 'int32'})
    # Change column type to int32 for column: 'Utilities_NoSeWa'
    df = df.astype({'Utilities_NoSeWa': 'int32'})
    # Change column type to int32 for column: 'Utilities_nan'
    df = df.astype({'Utilities_nan': 'int32'})
    # Change column type to int32 for column: 'LotConfig_Corner'
    df = df.astype({'LotConfig_Corner': 'int32'})
    # Change column type to object for column: 'LotConfig_CulDSac'
    df = df.astype({'LotConfig_CulDSac': 'int32','LotConfig_FR3': 'int32','LotConfig_Inside': 'int32','LotConfig_nan': 'int32'})
    # Change column type to object for column: 'LotConfig_FR2'
    df = df.astype({'LotConfig_FR2': 'int32'})
    df['LandSlope'] = df['LandSlope'].map({'Sev': 0, 'Mod': 1, 'Gtl': 2})
    df['Neighborhood_grouped'] = df['Neighborhood'].apply(lambda x: x if x in df['Neighborhood'].value_counts().head(10).index else 'Others')
    df = pd.get_dummies(df, columns=['Neighborhood_grouped'], prefix='Neighborhood', dtype=int)
    df = df.drop('Neighborhood', axis=1)
    df['LandContour'] = df['LandContour'].map({'Low': 0, 'HLS': 1, 'Bnk': 2, 'Lvl': 3})
    df = pd.get_dummies(df, columns=['Condition1'], prefix='Condition1', dtype=int)
    df = pd.get_dummies(df, columns=['Condition2'], prefix='Condition2', dtype=int)
    for col in ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe']: df[f'Condition_{col}'] = df.get(f'Condition1_{col}', 0) + df.get(f'Condition2_{col}', 0)
    df = df.drop([col for col in df.columns if col.startswith('Condition1_') or col.startswith('Condition2_')], axis=1)
    # One-hot encode BldgType and HouseStyle, then drop them
    df = pd.get_dummies(df, columns=['BldgType', 'HouseStyle'], drop_first=True)
    df = pd.get_dummies(df, columns=['Exterior1st'], prefix='Exterior1st', dtype=int)
    df = pd.get_dummies(df, columns=['Exterior2nd'], prefix='Exterior2nd', dtype=int)
    for col in ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing']: df[f'Exterior_{col}'] = df.get(f'Exterior1st_{col}', 0) + df.get(f'Exterior2nd_{col}', 0)
    df = df.drop([col for col in df.columns if col.startswith('Exterior1st_') or col.startswith('Exterior2nd_')], axis=1)
    df = pd.get_dummies(df, columns=['RoofStyle', 'RoofMatl'], dtype=int)
    df = pd.get_dummies(df, columns=['MasVnrType'], dtype=int)
    df['ExterQual'] = df['ExterQual'].map({'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
    df['ExterCond'] = df['ExterCond'].map({'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
    df['ExteriorRating'] = df['ExterQual'] + df['ExterCond']
    df = df.drop(['ExterQual', 'ExterCond'], axis=1)
    df['BsmtQual'] = df['BsmtQual'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
    df['BsmtCond'] = df['BsmtCond'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
    df['BsmtQual'] = df['BsmtQual'].fillna(0).astype(int)
    df['BsmtCond'] = df['BsmtCond'].fillna(0).astype(int)
    df['BsmtQual'] = df.apply(lambda row: 0 if row['BsmtCond'] == 0 else row['BsmtQual'], axis=1)
    df['BsmtCond'] = df.apply(lambda row: 0 if row['BsmtQual'] == 0 else row['BsmtCond'], axis=1)
    df['BasementRating'] = df['BsmtQual'] + df['BsmtCond']
    df = df.drop(['BsmtQual', 'BsmtCond'], axis=1)
    df = pd.get_dummies(df, columns=['Foundation'], dtype=int)
    # Drop columns: 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'
    df = df.drop(columns=['BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'])
    # Change column type to int32 for columns: 'BldgType_2fmCon', 'BldgType_Duplex' and 9 other columns
    df = df.astype({'BldgType_2fmCon': 'int32', 'BldgType_Duplex': 'int32', 'BldgType_Twnhs': 'int32', 'BldgType_TwnhsE': 'int32', 'HouseStyle_1.5Unf': 'int32', 'HouseStyle_1Story': 'int32', 'HouseStyle_2.5Fin': 'int32', 'HouseStyle_2.5Unf': 'int32', 'HouseStyle_2Story': 'int32', 'HouseStyle_SFoyer': 'int32', 'HouseStyle_SLvl': 'int32'})
    df = pd.get_dummies(df, columns=['Heating'], dtype=int)
    df['HeatingQC'] = df['HeatingQC'].map({'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4})
    df['CentralAir'] = df['CentralAir'].map({'N': 0, 'Y': 1})
    # One-hot encode column: 'Electrical'
    insert_loc = df.columns.get_loc('Electrical')
    df = pd.concat([df.iloc[:,:insert_loc], pd.get_dummies(df.loc[:, ['Electrical']], dtype=int), df.iloc[:,insert_loc+1:]], axis=1)
    df['KitchenQual'] = df['KitchenQual'].map({'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4})
    df['Functional'] = df['Functional'].map({'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7})
    df['FireplaceQu'] = df['FireplaceQu'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
    df = pd.get_dummies(df, columns=['GarageType'], dtype=int)
    # Change column type to int32 for column: 'GarageYrBlt'
    df = df.astype({'GarageYrBlt': 'int32'})
    df['GarageFinish'] = df['GarageFinish'].map({'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3})
    df['GarageQual'] = df['GarageQual'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
    df['GarageCond'] = df['GarageCond'].map({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
    df['GarageRating'] = df['GarageQual'] + df['GarageCond']
    df = df.drop(['GarageQual', 'GarageCond'], axis=1)
    # Map 'PavedDrive' values and handle NaN
    df['PavedDrive'] = df['PavedDrive'].map({'N': 0, 'Y': 1}).fillna(0).astype(int)
    df['Fence'] = df['Fence'].map({'NA': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4})
    df = pd.get_dummies(df, columns=['MiscFeature'], dtype=int)
    df['PoolQC'] = df['PoolQC'].map({'NA': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4})
    df = pd.get_dummies(df, columns=['SaleType'], dtype=int)
    df = pd.get_dummies(df, columns=['SaleCondition'], dtype=int)
    # Invert values in 'Alley_nan' column
    df['Alley_nan'] = df['Alley_nan'].apply(lambda x: 1 if x == 0 else 0)
    # Drop column: 'Utilities_nan'
    df = df.drop(columns=['Utilities_nan'])
    # Drop column: 'LotConfig_nan'
    df = df.drop(columns=['LotConfig_nan'])
    # Change column type to int64 for column: 'MasVnrArea'
    df = df.astype({'MasVnrArea': 'int64'})
    return df

df = clean_data(df.copy())

In [None]:
int_cols = df.select_dtypes(include=['int32']).columns
df[int_cols] = df[int_cols].astype('int64')

In [5]:
df.to_csv("data/train.csv")