# Constructing a Simple Pipeline with a Preprocessing Step and Classifier

In this step, we will construct a simple pipeline with a preprocessing step and a classifier, and display its visual representation.

First, we import the necessary modules:

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import set_config

Next, we define the steps of the pipeline:

In [3]:
steps = [
    ('preprocessing', StandardScaler()),
    ('classifier', LogisticRegression())
]

Then, we create the pipeline:

In [4]:
pipe = Pipeline(steps=steps)

Finally, we display the visual representation of the pipeline:

In [5]:
set_config(display='diagram')
pipe

0,1,2
,steps,"[('preprocessing', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


# Constructing a Pipeline Chaining Multiple Preprocessing Steps & Classifier

In this step, we will construct a pipeline with multiple preprocessing steps and a classifier, and display its visual representation.

First, we import the necessary modules:

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression

Next, we define the steps of the pipeline:

In [7]:
steps = [
    ('standard_scaler', StandardScaler()),
    ('polynomial', PolynomialFeatures(degree=3)),
    ('classifier', LogisticRegression(C=2.0))
]

Then, we create the pipeline:

In [8]:
pipe = Pipeline(steps=steps)

Finally, we display the visual representation of the pipeline:

In [9]:
pipe

0,1,2
,steps,"[('standard_scaler', ...), ('polynomial', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,degree,3
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,2.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


# Constructing a Pipeline with Dimensionality Reduction and Classifier

In this step, we will construct a pipeline with a dimensionality reduction step and a classifier, and display its visual representation.

First, we import the necessary modules:

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

Next, we define the steps of the pipeline:

In [11]:
steps = [
    ('reduce_dim', PCA(n_components=4)),
    ('classifier', SVC(kernel='linear'))
]

Then, we create the pipeline:

In [12]:
pipe = Pipeline(steps=steps)

Finally, we display the visual representation of the pipeline:

In [13]:
pipe

0,1,2
,steps,"[('reduce_dim', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,n_components,4
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


# Constructing a Complex Pipeline Chaining a Column Transformer

In this step, we will construct a complex pipeline with a column transformer and a classifier, and display its visual representation.

First, we import the necessary module

In [14]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

Next, we define the preprocessing steps for the numerical and categorical features:

In [15]:
numeric_preprocessor = Pipeline(
    steps=[
        ('imputation_mean', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

categorical_preprocessor = Pipeline(
    steps=[
        (
            'imputation_constant',
            SimpleImputer(fill_value='missing', strategy='constant'),
        ),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

Then, we create the column transformer:

In [16]:
preprocessor = ColumnTransformer(
    [
        ('categorical', categorical_preprocessor, ['state', 'gender']),
        ('numerical', numeric_preprocessor, ['age', 'weight']),
    ]
)

Next, we create the pipeline:

In [17]:
pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

Finally, we display the visual representation of the pipeline:

In [18]:
pipe

0,1,2
,steps,"[('columntransformer', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical', ...), ('numerical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,500


# Constructing a Grid Search over a Pipeline with a Classifier

In this step, we will construct a grid search over a pipeline with a classifier, and display its visual representation.

First, we import the necessary modules:

In [19]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

Next, we define the preprocessing steps for the numerical and categorical features:

In [20]:
numeric_preprocessor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

categorical_preprocessor = Pipeline(
    steps=[
        (
            "imputation_constant",
            SimpleImputer(fill_value="missing", strategy="constant"),
        ),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

Then, we create the column transformer:

In [21]:
preprocessor = ColumnTransformer(
    [
        ("categorical", categorical_preprocessor, ["state", "gender"]),
        ("numerical", numeric_preprocessor, ["age", "weight"]),
    ]
)

Next, we create the pipeline

In [22]:
pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier())]
)

Then, we define the parameter grid for the grid search:

In [23]:
param_grid = {
    "classifier__n_estimators": [200, 500],
    "classifier__max_features": ["auto", "sqrt", "log2"],
    "classifier__max_depth": [4, 5, 6, 7, 8],
    "classifier__criterion": ["gini", "entropy"],
}

Finally, we create the grid search:

In [24]:
grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=1)

And display the visual representation of the grid search:

In [25]:
grid_search

0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'classifier__criterion': ['gini', 'entropy'], 'classifier__max_depth': [4, 5, ...], 'classifier__max_features': ['auto', 'sqrt', ...], 'classifier__n_estimators': [200, 500]}"
,scoring,
,n_jobs,1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('categorical', ...), ('numerical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# Summary

This lab provided a step-by-step guide on how to construct and display pipelines in Scikit-Learn. We covered simple pipelines with a preprocessing step and classifier, pipelines chaining multiple preprocessing steps and a classifier, pipelines with dimensionality reduction and a classifier, complex pipelines chaining a column transformer and a classifier, and grid searches over pipelines with a classifier.