In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Melbourne housing data from the Kaggle input directory.
df = pd.read_csv('../input/melbourne-housing/melb_data.csv')
# To inspect summary statistics, run `df.describe()` as the last line of its
# own cell: only a cell's final expression is rendered, so calling it
# mid-cell computes the table and throws it away.

# The `Price` column is the prediction target; all other columns are candidate features.
y = df.Price
X = df.drop(['Price'], axis=1)

# 80/20 train/validation split; the fixed random_state keeps it reproducible.
train_x, validate_x, train_y, validate_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [10]:
# Columns that contain at least one missing value in EITHER split.
# Checking only the training split would let a column through that happens to
# be NaN-free in train but not in validation, and the model would then fail
# (or silently misbehave) at prediction time.
col_with_missing = [
    c for c in train_x.columns
    if train_x[c].isnull().any() or validate_x[c].isnull().any()
]

# Rebind to new frames instead of dropping in place: the split outputs are
# derived from `X`, and in-place mutation of such frames can trigger
# SettingWithCopyWarning. Re-running this cell is a harmless no-op.
train_x = train_x.drop(columns=col_with_missing)
validate_x = validate_x.drop(columns=col_with_missing)

In [14]:
# Text (object-dtype) columns with at most 10 distinct values in the training split.
low_cardinality_cols = [
    name
    for name, column in train_x.items()
    if column.dtype == 'object' and column.nunique() <= 10
]

# Columns stored as 64-bit integers or floats.
numeric_cols = [
    name
    for name, column in train_x.items()
    if column.dtype in ('int64', 'float64')
]

# Features used from here on: low-cardinality text columns first, then numeric ones.
final_cols = low_cardinality_cols + numeric_cols

# Independent copies restricted to the selected features.
train_x_final = train_x.loc[:, final_cols].copy()
validate_x_final = validate_x.loc[:, final_cols].copy()

In [15]:
# Display the list of low-cardinality text column names.
# NOTE(review): the saved output under this cell is a DataFrame preview rather
# than a list, so the cell looks edited after its last run; re-run to refresh it.
low_cardinality_cols

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [17]:
# Display the dtype of every column kept in the final training frame.
train_x_final.dtypes

Type              object
Method            object
Regionname        object
Rooms              int64
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Landsize         float64
Lattitude        float64
Longtitude       float64
Propertycount    float64
dtype: object

In [23]:
# Boolean mask indexed by column name: True where the column's dtype is object.
s = train_x_final.dtypes.eq('object')
# Names of the object-dtype columns, in the frame's column order.
obj_cols = [name for name, is_object in s.items() if is_object]

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score(train_x, train_y, validate_x, validate_y):
    """Fit a random forest on the training data and return its validation MAE.

    Args:
        train_x: Feature matrix used to fit the model.
        train_y: Target values aligned with ``train_x``.
        validate_x: Feature matrix to predict on.
        validate_y: True target values aligned with ``validate_x``.

    Returns:
        The mean absolute error between ``validate_y`` and the model's
        predictions for ``validate_x``.
    """
    # Fixed tree count and seed so different preprocessing variants are
    # compared against the same model configuration.
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(train_x, train_y)
    return mean_absolute_error(validate_y, forest.predict(validate_x))

In [26]:
# Baseline approach: remove the object-dtype columns entirely and model on what is left.
drop_train_x = train_x_final.drop(columns=obj_cols)
drop_validate_x = validate_x_final.drop(columns=obj_cols)

print("Dropping columns and testing")
print(
    score(
        drop_train_x,
        train_y,
        drop_validate_x,
        validate_y,
    )
)

Dropping columns and testing
175703.48185157913


In [27]:
from sklearn.preprocessing import LabelEncoder

# Work on copies so the un-encoded frames stay available to the other approaches.
label_train_x = train_x_final.copy()
label_validate_x = validate_x_final.copy()

# One fitted encoder per column, kept so each column's category <-> code
# mapping can still be inspected or inverted later (a single shared encoder
# would only remember the last column it was fitted on).
label_encoders = {}
for c in obj_cols:
    label_encoder = LabelEncoder()
    label_train_x[c] = label_encoder.fit_transform(train_x_final[c])
    # Encode validation values by their position in the classes learned from
    # the training split. This yields the same codes as
    # `label_encoder.transform` for known values, but maps values that never
    # appeared in training to -1 instead of raising ValueError.
    label_validate_x[c] = pd.Categorical(
        validate_x_final[c], categories=label_encoder.classes_
    ).codes.astype('int64')
    label_encoders[c] = label_encoder

print("Labeling object columns")
print(score(label_train_x, train_y, label_validate_x, validate_y))

Labeling object columns
165936.40548390493


In [31]:
from sklearn.preprocessing import OneHotEncoder

# Keep the encoder's default (sparse) output and densify with .toarray().
# The keyword that requested dense output was renamed from `sparse` to
# `sparse_output` across scikit-learn releases, so passing either spelling
# ties this cell to a particular version range.
oh_encoder = OneHotEncoder(handle_unknown='ignore')

# Fit on the training split only; categories unseen in training encode as
# all-zero rows in validation because of handle_unknown='ignore'.
oh_train_values = oh_encoder.fit_transform(train_x_final[obj_cols]).toarray()
oh_validate_values = oh_encoder.transform(validate_x_final[obj_cols]).toarray()

# Give the indicator columns string names of the form "<column>_<category>".
# Default integer names (0, 1, 2, ...) would be mixed with the string names of
# the numeric columns after concatenation, which newer scikit-learn versions
# reject when fitting on a DataFrame.
oh_col_names = [
    f"{col}_{cat}"
    for col, cats in zip(obj_cols, oh_encoder.categories_)
    for cat in cats
]

# Re-attach the original row index so the concat below aligns row by row.
oh_train_x = pd.DataFrame(oh_train_values, index=train_x_final.index, columns=oh_col_names)
oh_validate_x = pd.DataFrame(oh_validate_values, index=validate_x_final.index, columns=oh_col_names)

# Numeric part of the features: everything except the object-dtype columns.
temp_train_x = train_x_final.drop(obj_cols, axis=1)
temp_validate_x = validate_x_final.drop(obj_cols, axis=1)

# Final design matrices: numeric columns followed by the one-hot indicators.
train_x_with_oh = pd.concat([temp_train_x, oh_train_x], axis=1)
validate_x_with_oh = pd.concat([temp_validate_x, oh_validate_x], axis=1)

print("OneHotEncoder object columns")
print(score(train_x_with_oh, train_y, validate_x_with_oh, validate_y))

OneHotEncoder object columns
166089.4893009678
