## Phase 3.32
# Building a Machine Learning Pipeline

## Common Preprocessing Tools

- Scalers
    - StandardScaler, MinMaxScaler
- Encoders
    - OneHotEncoder, LabelEncoder
- Imputers
    - SimpleImputer

We use these tools all the time. Let's have a look at a common workflow using these tools individually.

In [1]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Loading Data

In [2]:
# Loading data.
df = sns.load_dataset('mpg')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [3]:
# Dropping the unique identifier column.
df.drop('name', axis=1, inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [5]:
# Conduct train-test-split.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('mpg', axis=1),
    df['mpg'],
    random_state=51)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((298, 7), (100, 7), (298,), (100,))

In [6]:
def print_mae(y_train, y_test, y_pred_train, y_pred_test):
    """Print MAE for both train and test data."""
    
    print(f'Train MAE:\n\t{mean_absolute_error(y_train, y_pred_train)}')
    print(f'Test MAE:\n\t{mean_absolute_error(y_test, y_pred_test)}')

# The most Basic Pipeline

In [7]:
from sklearn.pipeline import Pipeline
from sklearn import set_config

In [8]:
# Pipeline()

In [9]:
# Get mini-example data.
little_x_cols = ['cylinders', 'displacement']

X_train_little = X_train[little_x_cols]
X_test_little = X_test[little_x_cols]

In [10]:
# Create and fit a pipeline.
little_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('clf', RandomForestRegressor(random_state=51))
    ]
)

little_pipeline

Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', RandomForestRegressor(random_state=51))])

In [11]:
# Setting for visual.
set_config(display='diagram')

little_pipeline

In [12]:
# Fit and predict.
little_pipeline.fit(X_train_little, y_train)

y_pred_train_little = little_pipeline.predict(X_train_little)
y_pred_test_little = little_pipeline.predict(X_test_little)

In [13]:
# Get metrics.
print_mae(y_train, y_test, y_pred_train_little, y_pred_test_little)

Train MAE:
	2.3301444868433525
Test MAE:
	3.080961193638119


In [14]:
X_train_little

Unnamed: 0,cylinders,displacement
338,4,135.0
188,8,318.0
45,6,258.0
299,4,141.0
273,4,119.0
...,...,...
222,8,260.0
16,6,199.0
197,4,90.0
224,8,302.0


In [15]:
y_train

338    27.2
188    16.0
45     18.0
299    27.2
273    23.9
       ... 
222    17.0
16     18.0
197    29.0
224    15.0
57     24.0
Name: mpg, Length: 298, dtype: float64

In [16]:
y_pred_train_little

array([30.63626551, 15.1240219 , 17.39391169, 25.73222942, 30.30647262,
       15.69745493, 26.61218522, 13.72526407, 26.63136667, 29.73188095,
       33.16469896, 29.66072626, 24.10812183, 20.9357896 , 30.86667212,
       23.60014515, 23.60014515, 20.26333333, 26.61218522, 22.9505    ,
       19.49207934, 29.66072626, 15.1240219 , 25.60965952, 14.33986544,
       30.86667212, 15.47490497, 29.66072626, 12.18499134, 31.50028308,
       20.52796548, 15.69745493, 20.52796548, 24.38709524, 23.7775    ,
       13.72526407, 15.1240219 , 29.66072626, 22.24960925, 19.49207934,
       17.9994381 , 34.70081825, 14.16384524, 23.45240041, 15.47490497,
       13.72526407, 33.72020238, 13.35495238, 19.51708333, 30.86667212,
       23.45240041, 18.5512275 , 20.03896905, 23.45240041, 23.45240041,
       29.92063135, 14.63544838, 23.60014515, 13.571     , 16.90205528,
       30.30647262, 15.69745493, 22.79566667, 23.60014515, 30.30647262,
       19.49207934, 18.5512275 , 13.72526407, 24.38709524, 20.93

## Long-Hand Workflow

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 298 entries, 338 to 57
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cylinders     298 non-null    int64  
 1   displacement  298 non-null    float64
 2   horsepower    293 non-null    float64
 3   weight        298 non-null    int64  
 4   acceleration  298 non-null    float64
 5   model_year    298 non-null    int64  
 6   origin        298 non-null    object 
dtypes: float64(3), int64(3), object(1)
memory usage: 18.6+ KB


In [18]:
X_train['cylinders'] = X_train['cylinders'].astype('object')
X_test['cylinders'] = X_test['cylinders'].astype('object')
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 298 entries, 338 to 57
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cylinders     298 non-null    object 
 1   displacement  298 non-null    float64
 2   horsepower    293 non-null    float64
 3   weight        298 non-null    int64  
 4   acceleration  298 non-null    float64
 5   model_year    298 non-null    int64  
 6   origin        298 non-null    object 
dtypes: float64(3), int64(2), object(2)
memory usage: 18.6+ KB


In [19]:
# Splitting data into feature-types.
numerical_cols = X_train.select_dtypes('number').columns.tolist()
categorical_cols = X_train.select_dtypes('object').columns.tolist()

print(f'Numerical Columns:\n\t{numerical_cols}')
print(f'Categorical Columns:\n\t{categorical_cols}')
print()

# Sanity check.
if len(numerical_cols + categorical_cols) == \
   len(set(numerical_cols + categorical_cols)):
    print('No overlap!')
else:
    print('Something is wrong...')

Numerical Columns:
	['displacement', 'horsepower', 'weight', 'acceleration', 'model_year']
Categorical Columns:
	['cylinders', 'origin']

No overlap!


In [20]:
# Create objects. (Only processing X-data.)
imputer = SimpleImputer(missing_values=np.nan)
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)


# Process data.
X_train_num_processed = imputer.fit_transform(X_train[numerical_cols])
X_test_num_processed = imputer.transform(X_test[numerical_cols])

X_train_num_processed = scaler.fit_transform(X_train_num_processed)
X_test_num_processed = scaler.transform(X_test_num_processed)

X_train_cat_processed = ohe.fit_transform(X_train[categorical_cols])
X_test_cat_processed = ohe.transform(X_test[categorical_cols])

# Join data back together to look at.
X_train_processed_df = pd.DataFrame(
    np.concatenate([X_train_num_processed, X_train_cat_processed], axis=1),
    columns=numerical_cols + ohe.get_feature_names().tolist())
X_test_processed_df = pd.DataFrame(
    np.concatenate([X_test_num_processed, X_test_cat_processed], axis=1),
    columns=numerical_cols + ohe.get_feature_names().tolist())

# Sanity check.
X_train_processed_df.shape, X_test_processed_df.shape

((298, 13), (100, 13))

In [21]:
X_train_processed_df.head()

Unnamed: 0,displacement,horsepower,weight,acceleration,model_year,x0_3,x0_4,x0_5,x0_6,x0_8,x1_europe,x1_japan,x1_usa
0,-0.567179,-0.556096,-0.581727,0.050686,1.30827,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.211255,1.184281,1.437699,-0.946427,-0.0178,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.628162,0.129507,-0.021039,-0.761777,-1.34387,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-0.508869,-0.898898,0.249801,3.411327,0.777842,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.72267,-0.213295,-0.682698,-0.244755,0.512628,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
# Create and train model.
clf = RandomForestRegressor(random_state=51)
clf.fit(X_train_processed_df, y_train)

In [23]:
# Check scores.
y_pred_train = clf.predict(X_train_processed_df)
y_pred_test = clf.predict(X_test_processed_df)

print(f'Train MAE:\n\t{mean_absolute_error(y_train, y_pred_train)}')
print(f'Test MAE:\n\t{mean_absolute_error(y_test, y_pred_test)}')

Train MAE:
	0.7030771812080531
Test MAE:
	1.9541100000000002


## Using a Pipeline

In [24]:
from sklearn.compose import ColumnTransformer

In [25]:
# Same process, using pipelines.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan)),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

# Combine into ColumnTransformer
transformer = ColumnTransformer(transformers=[
    ('num', num_pipe, numerical_cols),
    ('cat', cat_pipe, categorical_cols) # ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

# Master Pipeline
master_pipe = Pipeline(steps=[
    ('ColumnTransformer', transformer),
    ('Classifier', RandomForestRegressor(random_state=51))
])
master_pipe

In [26]:
# Using the original, untransformed data!
master_pipe.fit(X_train, y_train)

print_mae(
    y_train, 
    y_test, 
    master_pipe.predict(X_train), 
    master_pipe.predict(X_test)
    )

Train MAE:
	0.7030771812080531
Test MAE:
	1.9541100000000002


### Access Pipeline Stages

In [27]:
master_pipe

In [28]:
# Getting individual elements.
master_pipe.named_steps

{'ColumnTransformer': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('imputer', SimpleImputer()),
                                                  ('scaler', StandardScaler())]),
                                  ['displacement', 'horsepower', 'weight',
                                   'acceleration', 'model_year']),
                                 ('cat',
                                  Pipeline(steps=[('ohe',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  ['cylinders', 'origin'])]),
 'Classifier': RandomForestRegressor(random_state=51)}

In [29]:
master_pipe.named_steps['ColumnTransformer'].named_transformers_

{'num': Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]),
 'cat': Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore'))])}

In [30]:
master_pipe.named_steps['ColumnTransformer'].named_transformers_['cat']

In [31]:
master_pipe.named_steps['ColumnTransformer'].named_transformers_['cat'].named_steps['ohe']

In [32]:
(master_pipe
 .named_steps['ColumnTransformer']
 .named_transformers_['cat']
 .named_steps['ohe'])

# Time to practice!