In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the dataset from 'melb_data.csv' (path is relative to the working directory)

melb = pd.read_csv(filepath_or_buffer='melb_data.csv')

In [3]:
# Split the frame into the feature matrix (everything except 'Price')
# and the prediction target (the 'Price' column).

X = melb.drop(columns=['Price'])

y = melb.Price

In [4]:
# Numerical columns (int64 / float64) that have fewer than 100 missing entries

numerical_cols = [
    name
    for name, series in X.items()
    if series.dtype in ('int64', 'float64') and series.isnull().sum() < 100
]

In [5]:
# Categorical (object-dtype) columns with low cardinality: fewer than 10 distinct values

categorical_cols = [
    name
    for name, series in X.items()
    if series.dtype == 'object' and series.nunique() < 10
]

In [6]:
# Define the column transformers (the preprocessing steps)

# Numerical transformer: replace missing entries with the column median

numerical_transformer = SimpleImputer(strategy='median')


# Categorical transformer: one-hot encode, ignoring categories that were not
# seen during fit, and return a dense array rather than a sparse matrix.
#
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and 1.4
# removed the old name entirely, so `sparse=False` raises TypeError on current
# releases. Try the new keyword first and fall back to the old one so the
# notebook runs on both old and new scikit-learn versions.

try:
    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [7]:
# Bundle the per-column preprocessing into one transformer:
#   'num' -> median imputation on the selected numerical columns
#   'cat' -> one-hot encoding on the selected categorical columns
# Columns not listed in either group are dropped (ColumnTransformer's default remainder).

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [8]:
# The model: a random forest regressor.
# random_state is fixed so repeated runs build the same forest.

model = RandomForestRegressor(random_state=0)

In [9]:
# Chain preprocessing and the model into a single estimator, so that
# fit/predict apply the preprocessing steps before the model automatically.

the_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

In [10]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [11]:
# Fit the pipeline on the raw (unpreprocessed) training features.
# The preprocessing steps are fitted on the training split as part of this call,
# and the fitted pipeline is displayed as the cell output.

the_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0),
                                                  ['Rooms', 'Distance',
                                                   'Postcode', 'Bedroom2',
                                                   'Bathroom', 'Car',
   

In [12]:
# Predict on the held-out validation features.
# The pipeline applies the already-fitted preprocessing before calling the model.

predictions = the_pipeline.predict(X_valid)

In [13]:
# Evaluate the model: mean absolute error on the validation set.
# scikit-learn metrics take their arguments as (y_true, y_pred). MAE is
# symmetric, so the value is the same either way, but using the documented
# order keeps this cell correct if the metric is later replaced by an
# asymmetric one.

mae = mean_absolute_error(y_valid, predictions)

mae

169059.8916520268