In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [9]:
path = "./archive/melb_data.csv"
df = pd.read_csv(path)

# Split the frame into the prediction target and the predictor columns.
y = df["Price"]
X = df.drop(columns=["Price"])

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Categorical features: object-dtype columns with fewer than 10 distinct
# values in the training split.
categorical_cols = [
    col
    for col in X_train_full.select_dtypes(include=["object"]).columns
    if X_train_full[col].nunique() < 10
]

# Numerical features: every int64/float64 column of the training split.
numerical_cols = list(
    X_train_full.select_dtypes(include=["int64", "float64"]).columns
)

# Restrict both splits to the selected columns, working on independent copies.
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [10]:
# Preview the first rows of the training features (rendered as the cell output).
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12796,h,S,Eastern Metropolitan,4,14.2,3149.0,4.0,2.0,2.0,695.0,160.0,1970.0,-37.86127,145.14271,13366.0
9642,h,S,Eastern Metropolitan,3,14.2,3149.0,3.0,1.0,2.0,810.0,,,-37.86838,145.14664,13366.0
3207,u,S,Southern Metropolitan,2,4.6,3122.0,2.0,1.0,1.0,82.0,,,-37.818,145.0268,11308.0
1698,u,S,Northern Metropolitan,2,3.2,3054.0,2.0,1.0,1.0,0.0,76.0,1975.0,-37.7902,144.97,3106.0
761,h,S,Southern Metropolitan,4,13.0,3204.0,4.0,2.0,1.0,292.0,,,-37.9148,145.0243,6795.0


## Step 1: Define Preprocessing Steps
- Use the `ColumnTransformer` class to bundle together the different preprocessing steps:
  - Impute missing values in numerical data.
  - Impute missing values and apply one-hot encoding to categorical data.

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: replace missing entries with a constant. No fill_value is
# passed, so SimpleImputer uses its default for the "constant" strategy.
numerical_transformer = SimpleImputer(strategy="constant")

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" is passed so categories not seen during
# fitting do not raise at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])


## Define Model
- Create a random forest model

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 trees; the seed is fixed so repeated runs
# build the same forest.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

## Create & Evaluate Pipeline
- With the pipeline, we preprocess the training data and fit the model in a single line of code. (In contrast, without a pipeline, we have to do imputation, one-hot encoding, and model training in separate steps. This becomes especially messy if we have to deal with both numerical and categorical variables!)
- With the pipeline, we supply the unprocessed features in X_valid to the predict() command, and the pipeline automatically preprocesses the features before generating predictions. (However, without a pipeline, we have to remember to preprocess the validation data before making predictions.)

In [None]:
# Fixed: the module is `sklearn.metrics` (was misspelled `sklearn.metircs`).
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling into a single estimator. Each step is a
# (name, estimator) tuple; the first tuple was missing its opening parenthesis.
my_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model),
])

# Fit: the pipeline preprocesses the training features, then trains the model.
my_pipeline.fit(X_train, y_train)

# Predict: the raw validation features are passed in; the pipeline applies the
# fitted preprocessing before the model generates predictions.
preds = my_pipeline.predict(X_valid)

# evalu