# Scikit-Learn: Transformers and Pipelines

In [None]:
# Import libraries
%matplotlib inline
import pandas as pd
import numpy as np

In [None]:
# Load the cars dataset from the local data directory
cars = pd.read_csv(filepath_or_buffer='data/cars.csv')

# Preview the first five rows
cars.head(n=5)

**Question 1** Using `Binarizer`, transform the `city-mpg` and `highway-mpg` columns to 0 if the mpg is less than or equal to 25 and 1 if it is greater than 25.

In [None]:
# Import Binarizer
from sklearn.preprocessing import Binarizer

In [None]:
# Select the two mpg columns to binarize
X = cars[['city-mpg', 'highway-mpg']]

# Initialize Binarizer: values greater than 25 map to 1, all others to 0
binarizer = Binarizer(threshold=25)

# Fit and transform in one step. Binarizer learns nothing from the data
# (fit only validates its input), but calling it follows the standard
# transformer API and lets the estimator record the input column names
# before transform is applied to the same DataFrame.
binarizer.fit_transform(X)

**Question 2** Using `FunctionTransformer`, transform the `city-mpg` and `highway-mpg` columns to a log-scale.

In [None]:
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Pick out the two mpg columns to rescale
X = cars[['city-mpg', 'highway-mpg']]

# Wrap NumPy's natural logarithm so it can be used as a transformer
transformer = FunctionTransformer(func=np.log)

# No fit call is made; the wrapped function is applied directly
transformer.transform(X)

**Question 3** Using `OneHotEncoder`, one-hot encode the `body-style` column.

In [None]:
# Import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Keep body-style as a one-column DataFrame (2-D input for the encoder)
X = cars[['body-style']]

# Create the encoder with default settings and fit it in one expression;
# fit returns the estimator itself. (handle_unknown='ignore' is an option
# when unseen categories must be tolerated at transform time.)
ohe = OneHotEncoder().fit(X)

# Encode the column with the fitted encoder
ohe.transform(X)

In [None]:
# View output as array
# The transform result is converted with .toarray() so that the individual
# 0/1 indicator values are shown in the cell output.
ohe.transform(X).toarray()

In [None]:
# View names of one-hot encoded columns
# get_feature_names_out() returns one output name per encoded column.
ohe.get_feature_names_out()
# Alternative view: the categories found during fit, per input column
# ohe.categories_

**Question 4** Using `ColumnTransformer`, create a transformer pipeline for the following transformations:

- Transform the quantitative features `wheel-base`, `length` and `width` using standard scaling
- Transform the ordinal feature `num-of-doors` using an ordinal encoder (alternatively, you can replace the values with `{'two': 2, 'four': 4}`)
- Transform the nominal features `body-style`, `drive-wheels`, `engine-location` and `fuel-type` using a one-hot encoder

In [None]:
# Import classes
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Group the feature columns by the kind of preprocessing they need
quantitative_cols = [
    'wheel-base',
    'length',
    'width',
]
ordinal_cols = [
    'num-of-doors',
]
nominal_cols = [
    'body-style',
    'drive-wheels',
    'engine-location',
    'fuel-type',
]

In [None]:
# Define custom ordinal encoder
def str_to_int(s):
    """Replace the door-count words 'two' and 'four' with the integers 2 and 4.

    Parameters
    ----------
    s : object exposing a pandas-style ``replace`` method
        The data whose values should be mapped (e.g. a Series or DataFrame).

    Returns
    -------
    The result of ``s.replace`` with the word-to-number mapping applied;
    values not present in the mapping are left as they are.
    """
    door_word_to_number = {'two': 2, 'four': 4}
    return s.replace(door_word_to_number)

In [None]:
# Build one single-step Pipeline per feature group
quantitative_pipeline = Pipeline(steps=[
    ('std', StandardScaler()),
])
ordinal_pipeline = Pipeline(steps=[
    # The category order is given explicitly: 'two' comes before 'four'.
    # Alternative step: FunctionTransformer(str_to_int)
    ('ord', OrdinalEncoder(categories=[['two', 'four']])),
])
nominal_pipeline = Pipeline(steps=[
    ('ohe', OneHotEncoder()),
])

In [None]:
# Route each group of columns to its own preprocessing pipeline.
# Each entry is (name, transformer, columns).
feature_eng_pipeline = ColumnTransformer(
    transformers=[
        ('quant', quantitative_pipeline, quantitative_cols),
        ('ordinal', ordinal_pipeline, ordinal_cols),
        ('nominal', nominal_pipeline, nominal_cols),
    ],
)

In [None]:
# Fit and transform
# Fits the three sub-pipelines on their configured columns of `cars` and
# returns the combined feature matrix as the cell output.
feature_eng_pipeline.fit_transform(cars)

In [None]:
# Check ordinal encoding
# Look the fitted ordinal pipeline up by the name it was registered under
# ('ordinal') rather than by list position, so this check keeps working if
# the order of transformers in the ColumnTransformer changes.
feature_eng_pipeline.named_transformers_['ordinal'].transform(cars[ordinal_cols])

Put everything together: combine the feature-engineering transformer with a linear regression model that predicts `price`. Fit the pipeline and predict with it.

In [None]:
# Import classes
from sklearn.linear_model import LinearRegression

In [None]:
# Feature matrix: every column handled by the feature-engineering step
X = cars[[*quantitative_cols, *ordinal_cols, *nominal_cols]]

# Target vector
y = cars['price']

# Chain feature engineering and the regression model
pipe = Pipeline(steps=[
    ('feat_eng', feature_eng_pipeline),
    ('lr', LinearRegression()),
])

# Fit both steps on the selected data
pipe.fit(X, y)

In [None]:
# Predict
# Runs X through the fitted feature-engineering step and then the regression
# model; X here is the same data the pipeline was fitted on.
pipe.predict(X)

In [None]:
# Evaluate model
# NOTE: X and y are the same data the pipeline was fitted on, so this is a
# training score and not an estimate of performance on unseen data.
pipe.score(X, y)

In [None]:
# Inspect the pipeline's steps by name ('feat_eng' and 'lr')
pipe.named_steps