In [2]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

In [3]:
# Relative locations of the training and test CSV files.
train_data_path = 'data/train.csv'
test_data_path = 'data/test.csv'

# Read both files, training set first, each into its own DataFrame.
train_data, test_data = [
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
]

In [7]:
# Rows without a SalePrice cannot be used for training, so remove them (in place).
train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)
target = train_data.SalePrice

# Names of the training columns that contain at least one missing value.
has_missing = train_data.isnull().any()
cols_with_missing = list(has_missing[has_missing].index)

# Remove the identifier, the target and every incomplete column from the
# training frame; the test frame loses the same columns (it is only asked to
# drop 'Id' and the incomplete columns, not 'SalePrice').
# NOTE(review): the missing-value scan only inspects the training data, so a
# column that is complete in train but has gaps in test is kept - confirm
# that this is intended before predicting on test_predictors.
excluded_from_train = ['Id', 'SalePrice'] + cols_with_missing
excluded_from_test = ['Id'] + cols_with_missing
candidate_train_predictors = train_data.drop(excluded_from_train, axis=1)
candidate_test_predictors = test_data.drop(excluded_from_test, axis=1)

# Walk the remaining columns once and sort them into two groups:
#   * text (object) columns with fewer than 10 distinct values,
#   * numeric (int64 / float64) columns.
# Anything else (e.g. high-cardinality text) is left out.
low_cardinality_cols = []
numeric_cols = []
for cname in candidate_train_predictors.columns:
    column = candidate_train_predictors[cname]
    if column.dtype == 'object':
        if column.nunique() < 10:
            low_cardinality_cols.append(cname)
    elif column.dtype in ['int64', 'float64']:
        numeric_cols.append(cname)

# Final feature list: categorical columns first, then numeric ones.
my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

In [8]:
# Peek at the dtypes of 10 randomly chosen predictor columns (unseeded, so the
# columns shown change from run to run).
train_predictors.dtypes.sample(n=10)

LotArea           int64
Functional       object
Heating          object
BedroomAbvGr      int64
ScreenPorch       int64
OpenPorchSF       int64
OverallQual       int64
SaleCondition    object
LowQualFinSF      int64
ExterCond        object
dtype: object

In [10]:
# One-hot encode the training predictors with pandas' get_dummies; the result
# is stored for the model comparison and deliberately not displayed here.
one_hot_encoded_training_predictors = pd.get_dummies(data=train_predictors)

In [13]:
from sklearn.model_selection import cross_val_score

In [16]:
# Multiply by -1 to get a positive MAE: scikit-learn's 'neg_mean_absolute_error'
# scorer returns negated values by convention.
def get_mae(X, y, n_estimators=50, random_state=0):
    """Return the cross-validated mean absolute error of a random forest.

    Parameters
    ----------
    X : feature matrix handed to ``cross_val_score``.
    y : target values handed to ``cross_val_score``.
    n_estimators : int, default 50
        Number of trees in the ``RandomForestRegressor``.
    random_state : int or None, default 0
        Seed forwarded to the forest so that repeated calls, and calls made
        on different feature sets, are scored with the same source of
        randomness. Pass ``None`` to get an unseeded forest.

    Returns
    -------
    float
        Mean of the per-fold scores with the sign flipped, i.e. a positive
        MAE where lower is better.
    """
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  random_state=random_state)
    # The cross-validation splitter is left at the library default.
    neg_mae_per_fold = cross_val_score(model, X, y,
                                       scoring='neg_mean_absolute_error')
    return -1 * neg_mae_per_fold.mean()
# Baseline feature set: the selected predictors with every object (text)
# column removed.
predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])

# Score both feature sets against the same target.
mae_without_categoricals = get_mae(predictors_without_categoricals, target)
mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)

# Report each score truncated to a whole number.
for label, mae in (
    ("Mean Absolute Error when Droppping Categoricals:", mae_without_categoricals),
    ("Mean Absolute Error with One_Hot_Encoding :", mae_one_hot_encoded),
):
    print(label + str(int(mae)))

Mean Absolute Error when Droppping Categoricals:18516
Mean Absolute Error with One_Hot_Encoding :18343


In [None]:
def 