In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and echo the full path of every file it contains.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/housing-prices-competition-for-kaggle-learn-users/train.csv
/kaggle/input/housing-prices-competition-for-kaggle-learn-users/test.csv


In [2]:
from sklearn.model_selection import train_test_split

# Read the data, using the 'Id' column as the row index of both frames
X_full = pd.read_csv('../input/housing-prices-competition-for-kaggle-learn-users/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/housing-prices-competition-for-kaggle-learn-users/test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors.
# Reassign rather than mutate in place so every step has an explicit result.
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full.SalePrice
X_full = X_full.drop(columns=['SalePrice'])

# To keep things simple, we'll use only numerical predictors.
# include='number' states that intent directly; merely excluding 'object' would
# let through non-numeric columns stored under other dtypes (e.g. a dedicated
# string dtype), which the median imputer and the model cannot handle.
X = X_full.select_dtypes(include='number')
X_test = X_test_full.select_dtypes(include='number')

# Break off validation set from training data (80/20 split, fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)



In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Helper used to compare the different missing-value strategies
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Train a random forest on the training split and return its validation MAE.

    A fresh RandomForestRegressor (100 trees, random_state=0) is fitted on
    (X_train, y_train); the mean absolute error between y_valid and the
    model's predictions for X_valid is returned.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    validation_preds = forest.predict(X_valid)
    validation_mae = mean_absolute_error(y_valid, validation_preds)
    return validation_mae

In [4]:
#Question 1
# Names of the training columns that contain at least one missing value.
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both splits before scoring.
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

print("MAE (Drop columns with missing values):")
drop_columns_mae = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
print(drop_columns_mae)

MAE (Drop columns with missing values):
17837.82570776256


MAE (Drop columns with missing values):
17837.82570776256

In [5]:
#Question 2
from sklearn.impute import SimpleImputer

# Median imputation: the medians are learned from the training split only and
# then reused unchanged for the validation split.
imputer = SimpleImputer(strategy='median')

# By default the imputer hands back plain arrays, so restore the column labels
# AND the original row index. Without index=..., the frames would get a fresh
# 0..n-1 index that no longer lines up with y_train / y_valid.
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train),
                               columns=X_train.columns, index=X_train.index)
imputed_X_valid = pd.DataFrame(imputer.transform(X_valid),
                               columns=X_valid.columns, index=X_valid.index)

print("MAE (Impute missing values using median):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE (Impute missing values using median):
17791.59899543379


MAE (Impute missing values using median):
17791.59899543379

In [6]:
#Question 3
# Work on copies so the original splits are left untouched.
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Derive the list of incomplete columns from the training split right here.
# The shared name `cols_with_missing` is rebound to a full-data list by a later
# cell, so reading it would make this cell's result depend on execution order.
train_cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

# Add one boolean indicator per incomplete column, recording where values were absent.
for col in train_cols_with_missing:
    X_train_plus[col + '_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_missing'] = X_valid_plus[col].isnull()

# Median imputation fitted on the training split only.
imputer = SimpleImputer(strategy='median')

# Restore column labels and the original row index on the imputed output so the
# frames stay aligned with y_train / y_valid.
imputed_X_train_plus = pd.DataFrame(imputer.fit_transform(X_train_plus),
                                    columns=X_train_plus.columns, index=X_train_plus.index)
imputed_X_valid_plus = pd.DataFrame(imputer.transform(X_valid_plus),
                                    columns=X_valid_plus.columns, index=X_valid_plus.index)

print("MAE (Add missing value indicators and impute using median):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

MAE (Add missing value indicators and impute using median):
18063.910194063923


MAE (Add missing value indicators and impute using median):
18063.910194063923

In [7]:
#Question 4

# Columns of the full training data that contain at least one missing value.
cols_with_missing = X.columns[X.isnull().any()].tolist()

# Drop those columns from the training and test predictors alike.
reduced_X = X.drop(columns=cols_with_missing)
reduced_X_test = X_test.drop(columns=cols_with_missing)

# Median-impute what remains: fitted on the reduced training data and then
# applied to the reduced test data.
imputer = SimpleImputer(strategy='median')
reduced_imputed_X = pd.DataFrame(imputer.fit_transform(reduced_X),
                                 columns=reduced_X.columns)
reduced_imputed_X_test = pd.DataFrame(imputer.transform(reduced_X_test),
                                      columns=reduced_X_test.columns)

# Train on all labelled rows and predict the test set.
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(reduced_imputed_X, y)
preds_test = model.predict(reduced_imputed_X_test)

# Write the submission file, pairing each test Id with its predicted price.
output = pd.DataFrame({'Id': reduced_X_test.index, 'SalePrice': preds_test})
output.to_csv('submission_drop_columns.csv', index=False)

Score: 16381.48041

In [8]:
#Question 5
# Median imputation fitted on the full training predictors, reused for the test set.
imputer = SimpleImputer(strategy='median')
imputed_X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
imputed_X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Train on all labelled rows and predict the test set.
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)
rf_model.fit(imputed_X, y)
predictions = rf_model.predict(imputed_X_test)

# Write the submission file, pairing each test Id with its predicted price.
output = pd.DataFrame({
    'Id': X_test.index,
    'SalePrice': predictions,
})
output.to_csv('submission_impute_median.csv', index=False)

Score: 16452.43726

In [9]:
#Question 6
from sklearn.ensemble import RandomForestRegressor

# Work on copies so X and X_test stay untouched.
X_plus = X.copy()
X_test_plus = X_test.copy()

# Compute the incomplete columns of the FULL training data locally. The shared
# name `cols_with_missing` is bound by more than one earlier cell (once from the
# training split, once from the full data), so relying on it would make this
# cell's feature set depend on which of those cells ran last.
full_cols_with_missing = [col for col in X.columns if X[col].isnull().any()]

# Add one boolean indicator per incomplete column, in both train and test data.
for col in full_cols_with_missing:
    X_plus[col + '_was_missing'] = X_plus[col].isnull()
    X_test_plus[col + '_was_missing'] = X_test_plus[col].isnull()

# Median imputation fitted on the augmented training data, reused for the test data;
# column labels are restored on the imputed output.
my_imputer = SimpleImputer(strategy="median")
imputed_X_plus = pd.DataFrame(my_imputer.fit_transform(X_plus), columns=X_plus.columns)
imputed_X_test_plus = pd.DataFrame(my_imputer.transform(X_test_plus), columns=X_test_plus.columns)

# Train on all labelled rows and predict the test set.
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(imputed_X_plus, y)
predictions = model.predict(imputed_X_test_plus)

# Write the submission file, pairing each test Id with its predicted price.
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': predictions})
output.to_csv('submission_impute_plus_median.csv', index=False)


Score: 16451.15081