# 19장 파이프라인

> 파이프라인 기능을 알아봅니다.
- author: "Chansung Park"
- toc: true
- comments: true
- permalink: /chapter19/
- badges: true
- sticky_rank: 19

In [1]:
#hide

!pip install tpot==0.9.5
!pip install dtreeviz==0.8.2
!pip install xgbfir
!pip install yellowbrick==0.9
!pip install lime
!pip install treeinterpreter
!pip install pdpbox
!pip install shap==0.25.2
!pip install scprep
!pip install phate==0.4.2
!pip install pydotplus
!pip install scikit-plot==0.3.7
!pip install scikit-learn==0.21.0
!pip install pandas==0.23.4

Collecting tpot==0.9.5
[?25l  Downloading https://files.pythonhosted.org/packages/c5/6c/f1806e5b31d65a195bcd94fd80079318d5a9e54c2239b395404c25496792/TPOT-0.9.5.tar.gz (891kB)
[K     |████████████████████████████████| 901kB 5.5MB/s 
Collecting deap>=1.0
[?25l  Downloading https://files.pythonhosted.org/packages/99/d1/803c7a387d8a7e6866160b1541307f88d534da4291572fb32f69d2548afb/deap-1.3.1-cp37-cp37m-manylinux2010_x86_64.whl (157kB)
[K     |████████████████████████████████| 163kB 9.3MB/s 
[?25hCollecting update_checker>=0.16
  Downloading https://files.pythonhosted.org/packages/0c/ba/8dd7fa5f0b1c6a8ac62f8f57f7e794160c1f86f31c6d0fb00f582372a3e4/update_checker-0.18.0-py3-none-any.whl
Collecting stopit>=1.1.1
  Downloading https://files.pythonhosted.org/packages/35/58/e8bb0b0fb05baf07bbac1450c447d753da65f9701f551dca79823ce15d50/stopit-1.1.2.tar.gz
Building wheels for collected packages: tpot, stopit
  Building wheel for tpot (setup.py) ... [?25l[?25hdone
  Created wheel for tpot: file

Collecting lime
[?25l  Downloading https://files.pythonhosted.org/packages/f5/86/91a13127d83d793ecb50eb75e716f76e6eda809b6803c5a4ff462339789e/lime-0.2.0.1.tar.gz (275kB)
[K     |█▏                              | 10kB 15.2MB/s eta 0:00:01[K     |██▍                             | 20kB 10.5MB/s eta 0:00:01[K     |███▋                            | 30kB 8.6MB/s eta 0:00:01[K     |████▊                           | 40kB 7.2MB/s eta 0:00:01[K     |██████                          | 51kB 4.4MB/s eta 0:00:01[K     |███████▏                        | 61kB 4.7MB/s eta 0:00:01[K     |████████▎                       | 71kB 5.1MB/s eta 0:00:01[K     |█████████▌                      | 81kB 5.0MB/s eta 0:00:01[K     |██████████▊                     | 92kB 5.2MB/s eta 0:00:01[K     |███████████▉                    | 102kB 5.5MB/s eta 0:00:01[K     |█████████████                   | 112kB 5.5MB/s eta 0:00:01[K     |██████████████▎                 | 122kB 5.5MB/s eta 0:00:01[K    

In [1]:
#hide

import warnings
warnings.filterwarnings('ignore')

# 19.1 분류 파이프라인

In [3]:
import pandas as pd
from sklearn.experimental import (
    enable_iterative_imputer,
)
from sklearn import (
    ensemble,
    impute,
    model_selection,    
    preprocessing,
    tree,
)
from sklearn.base import (
    BaseEstimator,
    TransformerMixin,
)
from sklearn.ensemble import (
    RandomForestClassifier,
)
from sklearn.pipeline import Pipeline
def tweak_titanic(df):
    df = df.drop(
        columns=[
            "name",
            "ticket",
            "home.dest",
            "boat",
            "body",
            "cabin",
        ]
    ).pipe(pd.get_dummies, drop_first=True)
    return df

class TitanicTransformer(
    BaseEstimator, TransformerMixin
):
    def transform(self, X):
        # assumes X is output
        # from reading Excel file
        X = tweak_titanic(X)
        X = X.drop(columns="survived")
        return X
    def fit(self, X, y):
        return self
pipe = Pipeline(
    [
        ("titan", TitanicTransformer()),
        ("impute", impute.IterativeImputer()),
        (
            "std",
            preprocessing.StandardScaler(),
        ),
        ("rf", RandomForestClassifier()),
    ]
)

In [6]:
from sklearn.model_selection import (
    train_test_split,
)

url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"

df = pd.read_excel(url)
orig_df = df

In [7]:
from sklearn.model_selection import (
  train_test_split,
)

X_train2, X_test2, y_train2, y_test2 = train_test_split(
  orig_df,
  orig_df.survived,
  test_size=0.3,
  random_state=42,
)

pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

0.7938931297709924

In [8]:
params = {
  "rf__max_features": [0.4, "auto"],
  "rf__n_estimators": [15, 200],
}

grid = model_selection.GridSearchCV(
    pipe, cv=3, param_grid=params
)

grid.fit(orig_df, orig_df.survived)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('titan', TitanicTransformer()),
                                       ('impute',
                                        IterativeImputer(add_indicator=False,
                                                         estimator=None,
                                                         imputation_order='ascending',
                                                         initial_strategy='mean',
                                                         max_iter=10,
                                                         max_value=None,
                                                         min_value=None,
                                                         missing_values=nan,
                                                         n_nearest_features=None,
                                                         random_state=None,
          

In [9]:
grid.best_params_

{'rf__max_features': 'auto', 'rf__n_estimators': 200}

In [11]:
pipe.fit(X_train2, y_train2)
pipe.score(X_test2, y_test2)

0.7608142493638677

In [13]:
from sklearn import metrics

metrics.roc_auc_score(
  y_test2, pipe.predict(X_test2)
)

0.7429601648351648

# 19.2 회귀 파이프라인

In [16]:
#hide

import pandas as pd
from sklearn.datasets import load_boston
from sklearn import (
  model_selection,
  preprocessing,
)
b = load_boston()
bos_X = pd.DataFrame(
  b.data, columns=b.feature_names
)
bos_y = b.target

bos_X_train, bos_X_test, bos_y_train, bos_y_test = model_selection.train_test_split(
  bos_X,
  bos_y,
  test_size=0.3,
  random_state=42,
)

bos_sX = preprocessing.StandardScaler().fit_transform(bos_X)
bos_sX_train, bos_sX_test, bos_sy_train, bos_sy_test = model_selection.train_test_split(
  bos_sX,
  bos_y,
  test_size=0.3,
  random_state=42,
)

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

reg_pipe = Pipeline(
  [
    (
      "std",
      preprocessing.StandardScaler(),
    ),
    ("lr", LinearRegression()),
  ]
)

reg_pipe.fit(bos_X_train, bos_y_train)
reg_pipe.score(bos_X_test, bos_y_test)

0.7112260057484932

In [18]:
reg_pipe.named_steps["lr"].intercept_

23.01581920903956

In [19]:
reg_pipe.named_steps["lr"].coef_

array([-1.10834602,  0.80843998,  0.34313466,  0.81386426, -1.79804295,
        2.913858  , -0.29893918, -2.94251148,  2.09419303, -1.44706731,
       -2.05232232,  1.02375187, -3.88579002])

In [20]:
metrics.mean_squared_error(
  bos_y_test, reg_pipe.predict(bos_X_test)
)

21.517444231177215

# 19.3 PCA 파이프라인

In [25]:
#hide

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
from sklearn.experimental import (
    enable_iterative_imputer,
)
from sklearn import (
    ensemble,
    impute,
    model_selection,    
    preprocessing,
    tree,
)
from sklearn.ensemble import (
    RandomForestClassifier,
)
from yellowbrick.model_selection import (
    ValidationCurve,
)

url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"

df = pd.read_excel(url)
def tweak_titanic(df):
    df = df.drop(
        columns=[
            "name",
            "ticket",
            "home.dest",
            "boat",
            "body",
            "cabin",
        ]
    ).pipe(pd.get_dummies, drop_first=True)
    return df

def get_train_test_X_y(
    df, y_col, size=0.3, std_cols=None
):
    y = df[y_col]
    X = df.drop(columns=y_col)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=size, random_state=42
    )
    cols = X.columns
    num_cols = [
        "pclass",
        "age",
        "sibsp",
        "parch",
        "fare",
    ]
    fi = impute.IterativeImputer()

    fitted = fi.fit_transform(X_train[num_cols])
    X_train = X_train.assign(**{c:fitted[:,i] for i, c in enumerate(num_cols)})
    test_fit = fi.transform(X_test[num_cols])
    X_test = X_test.assign(**{c:test_fit[:,i] for i, c in enumerate(num_cols)})
    if std_cols:
        std = preprocessing.StandardScaler()
        fitted = std.fit_transform(X_train[std_cols])
        X_train = X_train.assign(**{c:fitted[:,i] for i, c in enumerate(std_cols)})
        test_fit = std.transform(X_test[std_cols])
        X_test = X_test.assign(**{c:test_fit[:,i] for i, c in enumerate(std_cols)})

    return X_train, X_test, y_train, y_test

ti_df = tweak_titanic(df)
std_cols = "pclass,age,sibsp,fare".split(",")
X_train, X_test, y_train, y_test = get_train_test_X_y(
    ti_df, "survived", std_cols=std_cols
)

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [26]:
from sklearn.decomposition import PCA


pca_pipe = Pipeline(
  [
    (
      "std",
      preprocessing.StandardScaler(),
    ),
    ("pca", PCA()),
  ]
)

X_pca = pca_pipe.fit_transform(X)

In [27]:
pca_pipe.named_steps["pca"].explained_variance_ratio_

array([0.23922833, 0.21616853, 0.1923158 , 0.10464906, 0.08154797,
       0.0727221 , 0.05130716, 0.04206107])

In [28]:
pca_pipe.named_steps["pca"].components_[0]

array([-0.63274156,  0.39602149,  0.00653646,  0.11500362,  0.5815031 ,
       -0.19764926, -0.20422289, -0.10304598])