## Scikit-learn pipelines

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

### Linear regression

In [2]:
# Load the income dataset from a CSV file located next to the notebook.
data = pd.read_csv(filepath_or_buffer="income_data.csv")

In [3]:
# The "Income" column is the regression target; every remaining column is kept as a feature.
y = data["Income"]
X = data.drop(columns="Income")

In [4]:
# Numeric branch: replace NaNs with each column's most frequent value,
# then rescale the columns with StandardScaler.
numeric_preprocessor = Pipeline(
    [
        ("imputation_mode", SimpleImputer(strategy="most_frequent", missing_values=np.nan)),
        ("scaler", StandardScaler()),
    ]
)

In [5]:
# Categorical branch: fill gaps with the column's most frequent category,
# then one-hot encode the result.
# SimpleImputer only consults `fill_value` when strategy="constant", so that
# argument is not passed here; with "most_frequent" it would be a no-op.
categorical_preprocessor = Pipeline(
    steps=[
        ("imputation_most_frequent", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": categories not seen during fit do not raise
        # at transform time.
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [6]:
# Route each group of columns through its own sub-pipeline:
# "Residence" through the categorical branch, "Age" and "Education" through the numeric one.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, ["Residence"]),
        ("numerical", numeric_preprocessor, ["Age", "Education"]),
    ]
)

In [7]:
# Full regression pipeline: preprocessing followed by ordinary least squares.
# Step names are spelled out as the lowercased class names.
pipe = Pipeline(
    steps=[
        ("columntransformer", preprocessor),
        ("linearregression", LinearRegression()),
    ]
)
# Last expression of the cell: displays the assembled pipeline.
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('imputation_most_frequent',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Residence']),
                                                 ('numerical',
                                                  Pipeline(steps=[('imputation_mode',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaler',
  

#### Let's try the pipeline

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# fit() returns the fitted pipeline, so training and predicting on the
# held-out rows can be chained into a single expression.
predictions = pipe.fit(X_train, y_train).predict(X_test)

In [9]:
# Display the model's predictions for the held-out test rows
# (the full array is echoed as the cell output).
predictions

array([ 44.95786203,  93.84557872,  68.96051507, 100.84674326,
        81.72148509,  89.88273781,  34.51706837,  65.01828657,
       106.02650466,  56.28714862,  83.47177623,  53.41740473,
        55.14737815, 105.44646057,  55.70710453,  64.43824248,
        77.11160893, 106.56591332,  65.05892201,  78.31866526,
        34.51706837,  78.31233254,  73.67197981,  79.4317853 ,
        72.47125619,  63.23751887,  44.33718251,  89.87257896,
        39.69682977,  68.49855113,  82.37264119,  41.43696205,
        52.25731655,  51.71790789,  56.90782814,  93.86589644,
        88.17308211,  83.54288823,  60.41856927,  79.4317853 ,
        45.49727069,  49.98793447,  70.15107983,  75.34100008,
        67.2914948 ,  59.17721022,  75.9616796 ,  88.03497895,
        89.91321439,  84.11277347,  54.56733406,  70.1104444 ,
       102.58687554,  66.75841885, 105.43630171, 102.00683144,
        60.97829565, 100.82642554,  54.5571752 ,  91.56603779,
        81.77227938,  54.00760768, 104.3066901 , 107.75

In [10]:
# Evaluate the regression pipeline with 5-fold cross-validation on the training split.
# For a regressor the default cross_val_score metric is R^2 (coefficient of
# determination), not accuracy; the scoring is named explicitly so the printed
# label and the metric cannot drift apart.
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring="r2")
print("Cross-validation R^2:", cv_scores.mean())

Cross-validation accuracy: 0.7931547125497007


### Classification

In [11]:
# Load and prepare the data
data = load_iris(as_frame=True)
X = data.data
y = data.target

In [12]:
# Numeric branch for the classifier: mean-impute missing values, then
# rescale the columns with StandardScaler.
numeric_preprocessor = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [13]:
# Every column of X goes through the numeric branch.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_preprocessor, X.columns),
    ]
)

In [14]:
# Classification pipeline: preprocessing followed by logistic regression
# with its default settings.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

In [15]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [16]:
# Fit the pipeline on the training split. As the last expression in the cell,
# the value returned by fit() is what the notebook displays.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object'))])),
                ('classifier', LogisticRegression())])

In [17]:
# Accuracy on the held-out test set.
# accuracy_score takes (y_true, y_pred), so the ground-truth labels go first
# and the pipeline's predictions second.
accuracy_score(y_test, pipeline.predict(X_test))

1.0