In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.tree import DecisionTreeRegressor

# Load the Melbourne housing CSV and print a per-column summary
# (dtype and non-null count) so missing data is visible up front.
melbourne_data = pd.read_csv("melb_data.csv")
melbourne_data.info()

# Separate the prediction target (Price) from every other column.
price_data = melbourne_data['Price']
features_data = melbourne_data.drop('Price', axis=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
Suburb           13580 non-null object
Address          13580 non-null object
Rooms            13580 non-null int64
Type             13580 non-null object
Price            13580 non-null float64
Method           13580 non-null object
SellerG          13580 non-null object
Date             13580 non-null object
Distance         13580 non-null float64
Postcode         13580 non-null float64
Bedroom2         13580 non-null float64
Bathroom         13580 non-null float64
Car              13518 non-null float64
Landsize         13580 non-null float64
BuildingArea     7130 non-null float64
YearBuilt        8205 non-null float64
CouncilArea      12211 non-null object
Lattitude        13580 non-null float64
Longtitude       13580 non-null float64
Regionname       13580 non-null object
Propertycount    13580 non-null float64
dtypes: float64(12), int64(1), object(8)
memory usage: 2.2+ MB


In [10]:
# Keep only the non-object (numeric) columns as model inputs.
numeric_features_data = features_data.select_dtypes(exclude=['object'])

# Hold out a validation split; random_state=0 pins which rows land where.
train_features, validation_features, train_price, validation_price = train_test_split(
    numeric_features_data,
    price_data,
    random_state=0
)

In [16]:
def score_dataset(train_features, validation_features, train_price, validation_price, random_state=None):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    train_features, validation_features
        Feature matrices passed to ``fit`` and ``predict`` respectively.
    train_price, validation_price
        Targets used for fitting and for scoring.
    random_state : int or None, optional
        Forwarded to ``RandomForestRegressor``. The default (None) keeps the
        original unseeded behaviour, so scores vary from run to run. Pass an
        int when comparing datasets so differences are not just forest noise.

    Returns
    -------
    The mean absolute error between ``validation_price`` and the forest's
    predictions for ``validation_features``.
    """
    forest_model = RandomForestRegressor(random_state=random_state)
    forest_model.fit(train_features, train_price)
    forest_prediction = forest_model.predict(validation_features)
    return mean_absolute_error(validation_price, forest_prediction)

In [17]:
# Approach 1: drop every column that has a missing value in the training split.
features_with_missing = [col for col in train_features.columns if train_features[col].isnull().any()]
reduced_train_features = train_features.drop(features_with_missing, axis=1)
# Drop the same columns from validation so both splits keep identical columns.
reduced_validation_features = validation_features.drop(features_with_missing, axis=1)
drop_col_mae = score_dataset(reduced_train_features, reduced_validation_features, train_price, validation_price)
# Call form of print works on Python 2 and 3; the bare statement is Python-2-only.
print(drop_col_mae)

185379.83504874114


In [20]:
# Approach 2: keep every numeric column and fill the gaps by imputation.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# sklearn.impute.SimpleImputer is its replacement if the environment is upgraded.
my_imputer = Imputer()
# Fit on the training split only, then apply the fitted imputer to validation,
# so validation rows do not influence the fill values.
imputed_train_features = my_imputer.fit_transform(train_features)
imputed_validation_features = my_imputer.transform(validation_features)
imputed_mae = score_dataset(imputed_train_features, imputed_validation_features, train_price, validation_price)
# Call form of print works on Python 2 and 3; the bare statement is Python-2-only.
print(imputed_mae)

184047.36780559647


In [23]:
# Approach 3: impute as before, but first add a boolean "<col>_was_missing"
# column for each feature that had gaps, so the model can see what was imputed.
# Work on copies so re-running this cell always starts from the raw splits.
imputed_train_features_plus = train_features.copy()
imputed_validation_features_plus = validation_features.copy()

for col in features_with_missing:
    # Record missingness before imputation overwrites the NaNs.
    imputed_train_features_plus[col + '_was_missing'] = imputed_train_features_plus[col].isnull()
    # Fixed: the original read from the misspelled `imputed_validation_featurest_plus` (NameError).
    imputed_validation_features_plus[col + '_was_missing'] = imputed_validation_features_plus[col].isnull()

# Re-fits my_imputer on the widened training frame, then applies it to validation.
imputed_train_features_plus = my_imputer.fit_transform(imputed_train_features_plus)
imputed_validation_features_plus = my_imputer.transform(imputed_validation_features_plus)

imputed_plus_mae = score_dataset(imputed_train_features_plus, imputed_validation_features_plus, train_price, validation_price)
# Call form of print works on Python 2 and 3; the bare statement is Python-2-only.
print(imputed_plus_mae)

179156.85753559152


In [29]:
# Show each feature column's dtype; the `object` columns are the non-numeric
# ones that need encoding before a model can use them.
features_data.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [31]:
# One-hot encode the object-dtype columns; other columns pass through unchanged.
# NOTE(review): features_data still includes free-text-like columns such as
# Address and Date, so this presumably yields a very wide frame. Consider
# restricting the encoding to low-cardinality columns.
one_hot_features_data = pd.get_dummies(data=features_data)

In [None]:
def get_mae(features, price):
    """Return the cross-validated mean absolute error of a 50-tree random forest.

    Missing values are imputed inside a pipeline, so the imputer is fitted on
    each fold's training portion only. The frames this notebook passes in
    still contain NaNs (e.g. Car, BuildingArea, YearBuilt), which the forest
    cannot be fitted on directly in scikit-learn releases that ship ``Imputer``.

    Parameters
    ----------
    features
        Feature matrix; may contain missing numeric values.
    price
        Target values aligned with ``features``.

    Returns
    -------
    The mean over folds of the (positive) mean absolute error.
    """
    model = make_pipeline(Imputer(), RandomForestRegressor(n_estimators=50))
    fold_scores = cross_val_score(model, features, price, scoring='neg_mean_absolute_error')
    # scikit-learn reports this metric negated (higher is better); flip the sign back.
    return -1 * fold_scores.mean()

# Score the numeric-only features against the one-hot-encoded features
# with the same cross-validated model.
mae_numerical_features_only = get_mae(features=numeric_features_data, price=price_data)
mae_one_hot_encoded = get_mae(features=one_hot_features_data, price=price_data)

# Numeric-only MAE first, then one-hot-encoded MAE.
print(mae_numerical_features_only)
print(mae_one_hot_encoded)

In [None]:
# Encode the training and test predictors separately, then align their columns
# so both frames expose the same one-hot columns in the same order.
# NOTE(review): train_predictors / test_predictors are not defined in the
# cells visible here. Confirm they exist before this cell runs.
one_hot_encoded_training_predictors = pd.get_dummies(data=train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(data=test_predictors)
# join='left' keeps the training frame's columns; any of them absent from the
# test encoding appear there filled with NaN.
final_train, final_test = one_hot_encoded_training_predictors.align(
    other=one_hot_encoded_test_predictors,
    join='left',
    axis=1
)
