Pipeline Implementation

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
# Ordered (name, estimator) pairs: features are standardized first, then fed
# to the logistic-regression classifier.
# NOTE(review): the step name "classifer" is misspelled; it is kept as-is
# because the name is a runtime key used to address the step.
steps = [
    ("standard scaler", StandardScaler()),
    ("classifer", LogisticRegression()),
]

In [3]:
steps

[('standard scaler', StandardScaler()), ('classifer', LogisticRegression())]

In [4]:
# Chain the steps into a single estimator; ending the cell with the object
# displays it.
pipe = Pipeline(steps=steps)
pipe

In [5]:
# Ask scikit-learn to render estimators as an HTML diagram in the notebook.
from sklearn import set_config
set_config(display='diagram')

In [6]:
pipe

In [7]:
from sklearn.datasets import make_classification
# Synthetic binary-classification data. A fixed random_state makes the dataset,
# and therefore every downstream prediction and score, identical on each
# Restart & Run All; without a seed the data is resampled on every run.
# NOTE(review): outputs recorded from an unseeded run will differ after re-execution.
x, y = make_classification(n_samples=1000, random_state=42)

In [8]:
# Confirm the generated feature matrix is (n_samples, n_features).
x.shape

(1000, 20)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# itself reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# With test_size=0.2, 80% of the rows should land in the training split.
x_train.shape

(800, 20)

In [11]:
# Fit the scaler and the classifier on the training split only.
pipe.fit(x_train, y_train)

In [12]:
# Predict labels for the first five held-out samples.
pipe.predict(x_test[:5])

array([1, 0, 0, 1, 0])

In [13]:
# True labels of the first five held-out samples, for comparison with the predictions.
y_test[:5]

array([1, 0, 0, 1, 0])

In [14]:
# Mean accuracy of the scaler + logistic-regression pipeline on the held-out test set.
pipe.score(x_test, y_test)

0.915

Displaying a pipeline with standard scaler, dimensionality reduction and estimator

In [15]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [16]:
# Three stages: standardize, project onto 3 principal components, then
# classify with a support-vector machine.
# Rebinds the name `steps` to a new list; the earlier `pipe` keeps its own steps.
steps = [
    ("scaling", StandardScaler()),
    ("PCA", PCA(n_components=3)),
    ("SVC", SVC()),
]

In [17]:
# Build the scaling -> PCA -> SVC pipeline and display it.
pipes = Pipeline(steps=steps)
pipes

In [18]:
# Preview the scaling stage on its own. Indexing the pipeline by step name
# returns the StandardScaler instance itself, so this call fits that step on
# x_train in place and returns the standardized training features.
# NOTE(review): this displays the whole array; consider slicing (e.g. [:5]) to keep output short.
pipes['scaling'].fit_transform(x_train)

array([[ 0.9080621 ,  1.30761044, -0.45139942, ...,  1.14008211,
        -0.43256049,  1.12409123],
       [-0.51242105, -0.55430042,  0.86628548, ..., -0.23158571,
        -1.19877112,  2.04490877],
       [-2.08044803, -0.03033317,  1.06856085, ..., -1.72210714,
        -0.06814326, -1.79012069],
       ...,
       [-1.1315219 ,  1.62389716,  0.15286269, ..., -0.81152976,
        -0.16266323,  0.05047587],
       [ 1.06240672, -0.28550869,  1.12314851, ...,  1.52419792,
         1.49926546,  0.04558354],
       [-1.69995252, -2.17358156,  0.55990215, ..., -1.47275267,
        -1.35818769, -1.03379401]])

In [19]:
# Fit all three stages (scaler, PCA, SVC) on the training split only.
pipes.fit(x_train, y_train)

In [20]:
# Predict labels for the first five held-out samples with the PCA + SVC pipeline.
pipes.predict(x_test[:5])

array([1, 0, 0, 1, 1])

In [21]:
# True labels of the first five held-out samples, for comparison with the predictions.
y_test[:5]

array([1, 0, 0, 1, 0])

In [22]:
# Mean accuracy of the scaler + PCA + SVC pipeline on the held-out test set.
pipes.score(x_test, y_test)

0.9

Applying column transformer

In [23]:
from sklearn.impute import SimpleImputer

In [24]:
## numerical processing pipeline
import numpy as np

# Numeric columns: replace NaN entries with the column mean, then standardize.
numeric_processor = Pipeline(
    [
        ("Impute_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [25]:
numeric_processor

In [26]:
## categorical processing pipeline
from sklearn.preprocessing import OneHotEncoder

# Categorical columns: fill missing entries with the literal string 'None',
# then one-hot encode. Categories not seen during fit are ignored at transform
# time (encoded as all zeros) instead of raising an error.
categorical_processor = Pipeline(
    [
        ("Impute_const", SimpleImputer(strategy="constant", fill_value="None")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [27]:
categorical_processor

In [28]:
## combine the categorical and numerical processing pipelines into a single transformer
from sklearn.compose import ColumnTransformer

In [29]:
# Route each group of columns to its own preprocessing pipeline. Columns not
# listed here are dropped (ColumnTransformer's default remainder='drop').
# NOTE(review): no DataFrame with these column names is defined in the visible
# cells, so this transformer is only constructed and displayed, never fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_processor, ["gender", "City"]),
        ("numerical", numeric_processor, ["age", "height"]),
    ]
)

In [30]:
preprocessor

In [31]:
from sklearn.pipeline import make_pipeline

In [32]:
# Full model: column-wise preprocessing followed by logistic regression.
# make_pipeline generates the step names automatically.
f_pipe= make_pipeline(preprocessor, LogisticRegression())

In [33]:
f_pipe