In [105]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

### Function to calculate Mean Absolute Error

In [106]:
def calculate_mae(Xtrain, ytrain, Xtest, ytest):
    model = RandomForestRegressor()
    model.fit(Xtrain, ytrain)
    predictions = model.predict(Xtest)
    return model, mean_absolute_error(ytest, predictions)

### Setup

In [107]:
data = pd.read_csv('./train.csv')

X = data.select_dtypes(exclude=['object']).drop(['SalePrice'], axis=1)
y = data.SalePrice

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20)

### Test dropping columns

In [108]:
na_cols = [col for col in X.columns if X[col].isna().sum()]

In [109]:
reduced_Xtrain = Xtrain.drop(na_cols, axis=1)
reduced_Xtest = Xtest.drop(na_cols, axis=1)

In [110]:
reduced_model, mae = calculate_mae(reduced_Xtrain, ytrain, reduced_Xtest, ytest)
mae

19829.45890410959

### Test imputing rows

In [111]:
imputer = Imputer()
imputed_Xtrain = imputer.fit_transform(Xtrain)
imputed_Xtest = imputer.fit_transform(Xtest)
imputed_model, mae = calculate_mae(imputed_Xtrain, ytrain, imputed_Xtest, ytest)
mae

19747.848287671233

## Submission 2

In [121]:
test_data = pd.read_csv('./test.csv')
imputer = Imputer()
submission_X = imputer.fit_transform(X)

In [123]:
model = RandomForestRegressor()
model.fit(submission_X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [127]:
sub_Xtest = test_data.select_dtypes(exclude=['object'])

In [129]:
sub_Xtest = test_data.select_dtypes(exclude=['object'])
sub_Xtest = imputer.fit_transform(sub_Xtest)
predicted_prices = model.predict(sub_Xtest)
submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predicted_prices})
submission.to_csv('submission2.csv', index=False)