In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# modelling
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


In [2]:

full_df = pd.read_csv('./data/train.csv', index_col="Id")
X_test_full = pd.read_csv('./data/test.csv', index_col="Id")

# drop rows with a missing target; rebinding instead of inplace=True keeps
# every step an explicit assignment
full_df = full_df.dropna(axis=0, subset=['SalePrice'])
# separate target from features
y = full_df['SalePrice']
full_df = full_df.drop(columns='SalePrice')

# using only numerical columns by removing columns with object type
X = full_df.select_dtypes(exclude='object')
# take exactly the training feature columns (same set, same order) so the test
# features always line up with what the model is fitted on; raises KeyError if
# the test file lacks one of them
X_test = X_test_full[X.columns]

# data split: 80% training, 20% validation
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=1, test_size=0.2)


In [3]:
# explore train data
print(X_train.columns)
# describe() must be the last expression of the cell: a notebook only renders
# the value of the final expression, so placed earlier its result is discarded
X_train.describe()

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')


In [4]:
# (rows, columns) of the training, validation and test feature sets, one per line
print(X_train.shape, X_val.shape, X_test.shape, sep="\n")

(1168, 36)
(292, 36)
(1459, 36)


In [5]:
# number of missing entries per training column
num_missing_val_column = X_train.isna().sum()
# show only the columns that actually have gaps
print(num_missing_val_column.loc[lambda counts: counts > 0])





LotFrontage    209
MasVnrArea       8
GarageYrBlt     61
dtype: int64


In [15]:
def model_eval(X_t=X_train, X_v=X_val, y_t=y_train, y_v=y_val,
               n_estimators=100, random_state=1):
    """Fit a random forest on the training data and return its validation MAE.

    Note: the data defaults are evaluated once, when this cell runs, so they
    refer to the X_train / X_val / y_train / y_val objects that existed at
    that moment. Re-run this cell after rebuilding the split, or pass the
    data explicitly.

    Args:
        X_t: training features. Defaults to X_train.
        X_v: validation features. Defaults to X_val.
        y_t: training target. Defaults to y_train.
        y_v: validation target. Defaults to y_val.
        n_estimators (int, optional): number of trees in the forest.
            Defaults to 100.
        random_state (int, optional): seed passed to the forest so repeated
            runs give the same score. Defaults to 1.

    Returns:
        The mean absolute error between y_v and the predictions for X_v.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_t, y_t)
    predictions = model.predict(X_v)
    return mean_absolute_error(y_v, predictions)

    
    

# Approach 1: Dropping Columns with Missing Values

In [21]:

# names of the training columns that contain at least one missing value
cols_with_na = X_train.columns[X_train.isna().any()].tolist()
print(cols_with_na)

# drop those columns from both splits so they keep the same feature set
reduced_X_train = X_train.drop(columns=cols_with_na)
reduced_X_val = X_val.drop(columns=cols_with_na)

reduced_X_train.shape

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']


(1168, 33)

In [30]:
# validation MAE of the forest trained without the columns that have missing values
na_dropped_mae = model_eval(
    X_t=reduced_X_train,
    X_v=reduced_X_val,
    y_t=y_train,
    y_v=y_val,
)
print(f"When the columns with null values are dropped, MAE is: {na_dropped_mae:.2f}")


When the columns with null values are dropped, MAE is: 16680.64


# Approach 2: Imputation

In [31]:
imputer = SimpleImputer()
# fit on the training split only, then apply the same fitted imputer to the
# validation split so no validation statistics leak into the fill values.
# fit_transform/transform return bare arrays, so both columns AND index are
# passed back in: without index= the frames get a fresh RangeIndex and stop
# aligning by label with y_train / y_val (which are indexed by Id).
imputed_X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
imputed_X_val = pd.DataFrame(
    imputer.transform(X_val), columns=X_val.columns, index=X_val.index
)

imputed_X_train.shape

(1168, 36)

In [32]:
# validation MAE of the forest trained on the imputed feature set
imputed_mae = model_eval(
    X_t=imputed_X_train,
    X_v=imputed_X_val,
    y_t=y_train,
    y_v=y_val,
)
imputed_mae

16879.41023972603