![Ames Housing dataset image](https://i.imgur.com/lTJVG4e.png)

# Imports

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Read Data and Set Dependent & Independent Variables

In [3]:
# Location of the training CSV, relative to the notebook's working directory.
iowa_file_path = "01_input/train.csv"
home_data = pd.read_csv(filepath_or_buffer=iowa_file_path)

In [4]:
# Discard rows that have no sale price, since the target must be present.
# The name is rebound instead of using `inplace=True`, so the cell does not
# mutate the frame object produced by `read_csv` in place.
home_data = home_data.dropna(axis=0, subset=["SalePrice"])

# Dependent variable: the sale price column.
y = home_data["SalePrice"]

In [5]:
# Independent variables: everything except the target, restricted to the
# columns whose dtype is not `object`.
X = (
    home_data
    .drop("SalePrice", axis="columns")
    .select_dtypes(exclude="object")
)

# Split Data into Training and Test Sets

In [7]:
# 80/20 hold-out split; the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

# Preliminary Investigation

In [10]:
# (rows, columns) of the training features.
print(X_train.shape)

# Missing-value count per training column; only columns with at least one
# missing value are printed.
na_count_by_columns = X_train.isnull().sum()
print(na_count_by_columns.loc[na_count_by_columns.gt(0)])

(1168, 37)
LotFrontage    209
MasVnrArea       8
GarageYrBlt     61
dtype: int64


# Random Forest Model

In [23]:
# Random forest configuration used throughout this notebook. Note that
# `score_dataset` builds its own model with the same settings rather than
# using this instance.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

In [25]:
def score_dataset(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    """Return the hold-out mean absolute error of a random forest.

    A new ``RandomForestRegressor(n_estimators=100, random_state=0)`` is
    fitted on ``(X_train, y_train)`` and its predictions for ``X_test`` are
    compared with ``y_test``.

    The default arguments are the notebook-level split objects as they exist
    when this cell is executed, so callers usually pass only the feature
    frames they want to compare.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_test, forest.predict(X_test))

# Drop Columns with Missing Values

In [24]:
# Names of the training columns that contain at least one missing value.
cols_with_na = [
    column
    for column, n_missing in na_count_by_columns.items()
    if n_missing > 0
]
print(cols_with_na)

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']


In [20]:
# Remove the same columns from both splits so they keep identical features.
X_train_reduced = X_train.drop(columns=cols_with_na)
X_test_reduced = X_test.drop(columns=cols_with_na)

In [26]:
# Score the "drop incomplete columns" approach on the hold-out split.
mae_drop_columns = score_dataset(X_train_reduced, X_test_reduced)
print("MAE (Drop columns with missing values:)", mae_drop_columns)

MAE (Drop columns with missing values:) 16600.972842465755


# Imputation
## Using mean

In [28]:
# Mean imputation: missing entries are replaced by the per-column mean learned
# when the imputer is fitted. "mean" is SimpleImputer's default strategy; it is
# spelled out here to mirror the median variant further down.
# (SimpleImputer is imported with the other dependencies at the top of the notebook.)
my_imputer = SimpleImputer(strategy="mean")

In [30]:
# By default the imputer returns plain NumPy arrays, so the column names and the
# row index both have to be restored. Without `index=...` the new frames would
# get a fresh 0..n-1 RangeIndex that no longer matches the labels of
# y_train / y_test, which silently breaks any label-aligned operation.
X_train_imputed = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
# The test split is only transformed: it reuses the statistics fitted on the
# training split.
X_test_imputed = pd.DataFrame(
    my_imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [33]:
# Score the mean-imputed features on the hold-out split.
mae_mean_imputation = score_dataset(X_train_imputed, X_test_imputed)
print("MAE (Imputation):", mae_mean_imputation)

MAE (Imputation): 16638.533150684932


## Using median

In [36]:
# Median imputation. Note: this rebinds `my_imputer`, `X_train_imputed` and
# `X_test_imputed`, replacing the mean-imputed versions created earlier.
my_imputer = SimpleImputer(strategy="median")

# By default the imputer returns plain NumPy arrays; restore both the column
# names and the original row index so the frames stay aligned with
# y_train / y_test (omitting `index=...` would yield a fresh 0..n-1 RangeIndex).
X_train_imputed = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
# The test split reuses the medians fitted on the training split.
X_test_imputed = pd.DataFrame(
    my_imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [37]:
# Score the median-imputed features on the hold-out split.
mae_median_imputation = score_dataset(X_train_imputed, X_test_imputed)
print("MAE (Imputation):", mae_median_imputation)

MAE (Imputation): 16649.090719178083


# Generate Test Predictions
We will use the 'drop columns' approach, since it yields a slightly lower MAE (about 16,601) than mean imputation (about 16,639) or median imputation (about 16,649).