# Sklearn pipelines

The pipeline steps are executed one after another, in the order in which they are listed.

Each step needs a unique name, which is used to refer to that step (for example when setting its parameters). Fitted steps can also be cached, which is advantageous when fitting is time-consuming.

The last step of the pipeline should be the estimator (the model).

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
def get_data(path='data/melb_data.csv', target='Price', max_cardinality=10):
    """Load a CSV and split it into model features and the target column.

    Only columns the preprocessing pipeline can handle are kept:
    text columns with fewer than ``max_cardinality`` distinct values
    (so one-hot encoding stays small) and ``int64``/``float64`` columns.

    Parameters
    ----------
    path : str or path-like, default 'data/melb_data.csv'
        Location of the CSV file to read.
    target : str, default 'Price'
        Name of the column to predict; it is removed from the features.
    max_cardinality : int, default 10
        A text column is kept only if it has strictly fewer distinct
        values than this.

    Returns
    -------
    X : pandas.DataFrame
        Selected feature columns, categorical columns first.
    y : pandas.Series
        The target column.
    categorical_cols : list of str
        Names of the selected text columns, in file order.
    numerical_cols : list of str
        Names of the selected numeric columns, in file order.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the CSV.
    """
    data = pd.read_csv(path)
    y = data[target]
    X = data.drop(columns=[target])

    def _is_text(column):
        # Accept both plain object columns and pandas' dedicated string
        # dtypes, so detection does not depend on how pandas stores text.
        return (pd.api.types.is_object_dtype(column)
                or pd.api.types.is_string_dtype(column))

    categorical_cols = [
        cname for cname in X.columns
        if _is_text(X[cname]) and X[cname].nunique() < max_cardinality
    ]
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    filtered_cols = categorical_cols + numerical_cols

    return X[filtered_cols], y, categorical_cols, numerical_cols

In [3]:
X, y, categorical_cols, numerical_cols = get_data()
# Seed the shuffle so the train/test split, and therefore every score
# computed from it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
6120,h,VB,Southern Metropolitan,3,11.2,3127.0,3.0,1.0,1.0,763.0,180.0,1930.0,-37.8148,145.0965,5457.0
8779,h,PI,Northern Metropolitan,4,20.6,3064.0,4.0,2.0,2.0,756.0,236.0,2007.0,-37.58885,144.90135,15510.0
10556,h,S,Northern Metropolitan,5,16.5,3049.0,5.0,2.0,2.0,626.0,,,-37.67688,144.88996,2474.0
8504,u,PI,Western Metropolitan,1,8.2,3012.0,1.0,1.0,1.0,0.0,,,-37.798,144.8672,5058.0
13178,h,S,Northern Metropolitan,2,3.4,3031.0,2.0,1.0,1.0,193.0,,,-37.78749,144.93203,3593.0


In [4]:
# Text columns: fill gaps with the most common value, then one-hot encode.
# handle_unknown='ignore' is set so categories unseen during fitting do not
# raise at prediction time.
categorical_transformer = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('cat_onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: fill gaps with SimpleImputer's default constant, then
# standardise each column.
numerical_transformer = Pipeline([
    ('num_imputer', SimpleImputer(strategy='constant')),
    ('num_scaler', StandardScaler()),
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [5]:
# Final estimator of the pipeline; random_state is pinned so repeated fits
# on the same data build the same forest.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

In [6]:
# Chain preprocessing and the regressor into one estimator, so a single
# fit() call fits both on the training split.
# The step names are the keys used to address steps later
# (named_steps, '<step>__<param>'), so they are spelled carefully.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
model_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preeprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('num_imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('num_scaler',
                                                                   StandardScaler())]),
                                                  ['Rooms', 'Distance',
                                                   'Postcode', 'Bedroom2',
                                                   'Bathroom', 'Car',
                                                   'Landsize', 'BuildingArea',
                                                   'YearBuilt', 'Lattitude',
                                                   'Longtitude',
                                                   'Propertycount']),
                                              

In [7]:
# Evaluate the fitted pipeline on the held-out split with its default score
# (presumably R^2 for a regressor; confirm against the scikit-learn docs).
model_pipeline.score(X_test, y_test)

0.7704886350707709

In [8]:
# Predict on the held-out split and report the mean absolute error,
# which is in the same units as the target.
preds = model_pipeline.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=preds)

165831.50440367486