In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the housing CSV into a DataFrame.
df = pd.read_csv("../input/melbourne-housing/melb_data.csv")

# Split the frame into the prediction target (Price) and the feature columns.
y = df["Price"]
X = df.drop(columns=["Price"])

# Hold out 20% of the rows for validation; the seed is fixed so the split is
# reproducible across runs.
train_x, validate_x, train_y, validate_y = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)


In [4]:
# Inspect the dtype of every feature column in the training split.
train_x.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [10]:
# Numeric features: every training column stored as float64 or int64.
numeric_cols = [
    name
    for name, dtype in train_x.dtypes.items()
    if dtype in ('float64', 'int64')
]

# Categorical features: object-typed columns with fewer than 10 distinct
# values in the training split. Object columns with 10 or more distinct
# values end up in neither list.
categorical_cols = [
    name
    for name, dtype in train_x.dtypes.items()
    if dtype == 'object' and train_x[name].nunique() < 10
]

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns: replace missing values with a constant. No fill_value is
# passed, so the imputer's default constant is used (scikit-learn documents
# this as 0 for numeric data).
numeric_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at transform
# time instead of raising.
# NOTE(review): the variable name is misspelled ("transfomer"); it is kept
# as-is because other cells may refer to it.
categorical_transfomer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num_cols', numeric_transformer, numeric_cols),
    ('cat_cols', categorical_transfomer, categorical_cols),
])

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Chain the preprocessing step and a 100-tree random forest (fixed seed) into
# a single estimator, so preprocessing is fitted together with the model.
final_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=0)),
])

# Fit on the training split. As the last expression of the cell, the value
# returned by fit() is what gets displayed.
final_pipeline.fit(train_x, train_y)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num_cols',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['Rooms', 'Distance',
                                                   'Postcode', 'Bedroom2',
                                                   'Bathroom', 'Car'

In [15]:
from sklearn.metrics import mean_absolute_error

# Predict prices for the held-out validation rows with the fitted pipeline.
predicted = final_pipeline.predict(validate_x)

# scikit-learn metric functions take (y_true, y_pred) in that order. MAE is
# symmetric, so the printed value does not change, but using the documented
# order prevents silent mistakes if this call is later swapped for an
# asymmetric metric such as r2_score.
print(mean_absolute_error(validate_y, predicted))

156312.91707447925


In [19]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the whole pipeline on the full data set.
# The scorer reports negative MAE, so the result is negated to obtain
# positive per-fold MAE values.
# NOTE(review): an integer cv appears to use consecutive, unshuffled folds for
# regressors; the spread between fold scores suggests the rows may be ordered.
# Consider KFold(shuffle=True, random_state=...) after confirming.
scores = -cross_val_score(
    final_pipeline,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

scores

array([207273.036228  , 195544.72890525, 186907.59467108, 152084.99219493,
       158236.49133232])

In [20]:
# Average of the per-fold MAE values from the cross-validation run.
scores.mean()

180009.368666316