In [1]:
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the housing data from a CSV file in the working directory
melb = pd.read_csv('melb_data.csv')

# Preview the first five rows as a quick sanity check of the load
melb.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/2/2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/3/2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,4/3/2017,2.5,3067,...,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,4/6/2016,2.5,3067,...,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [3]:
# Prediction target: the Price column
y = melb['Price']

# Features: every remaining column (melb itself is left unmodified)
X = melb.drop(columns=['Price'])

In [4]:
# Names of the int64/float64 feature columns that have fewer than 100 missing values.
# select_dtypes keeps the original column order, and isnull().sum() gives one
# missing-value count per remaining column.
numerical_cols = [
    name
    for name, n_missing in X.select_dtypes(include=['int64', 'float64']).isnull().sum().items()
    if n_missing < 100
]

numerical_cols

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [5]:
# Names of the object-dtype feature columns with fewer than 10 distinct values,
# i.e. low-cardinality columns that are cheap to one-hot encode
categorical_cols = [
    name
    for name, dtype in X.dtypes.items()
    if dtype == 'object' and X[name].nunique() < 10
]

categorical_cols

['Type', 'Method', 'Regionname']

In [6]:
# Numerical columns: fill missing values with the column mean
# ('mean' is SimpleImputer's default strategy, spelled out here for readers)
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: one-hot encode; handle_unknown='ignore' keeps transform
# from raising on categories that were not present when the encoder was fitted
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [7]:
# Bundle the preprocessing steps: each transformer is applied only to its own
# list of columns. Columns in neither list are dropped (remainder='drop' default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [8]:
# Decision tree regressor with a fixed random_state for reproducible results
model = DecisionTreeRegressor(random_state=0)

# Pipeline that chains preprocessing and modelling into a single estimator:
# the data passes through the preprocessor first, then reaches the model
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

In [9]:
# Hold out 20% of the raw (not yet preprocessed) rows for validation;
# preprocessing happens later, inside the pipeline
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Fit the preprocessing steps and the model on the training split only
my_pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['Rooms', 'Distance',
                                                   'Postcode', 'Bedroom2',
                                                   'Bathroom', 'Car',
     

In [11]:
# Predict on the held-out validation features and score the predictions
# with mean absolute error

predictions = my_pipeline.predict(X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric,
# so the value is unchanged, but the documented order stays correct if the
# metric is later swapped for an asymmetric one or sample weights are added.
mae = mean_absolute_error(y_valid, predictions)

mae

228636.7260677467