## 파이썬 머신러닝
# Pipeline

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset and hold out a test split (train_test_split's default
# proportions are used).
iris = load_iris()
# Fixed seed: without random_state the split is reshuffled on every run, so the
# scores computed from it further down would not be reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)

In [10]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Chain standardization, a 2-component PCA projection and a k-nearest-neighbours
# classifier into a single estimator; each step is addressed by its name.
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=2)),
        ('model', KNeighborsClassifier()),
    ]
)
# Alternative to try: PCA with default settings followed by logistic regression.
#pipe = Pipeline([('scaler', StandardScaler()), ('pca', PCA()), ('model', LogisticRegression())])

In [11]:
# Fit every step on the training split (fit returns the pipeline itself),
# then score the fitted pipeline on the training and the held-out data.
s_train = pipe.fit(X_train, y_train).score(X_train, y_train)
s_test = pipe.score(X_test, y_test)

# Show both scores as separate outputs: training first, test second.
display(s_train, s_test)

0.9285714285714286

0.9210526315789473

In [5]:
# Class probabilities for a single hand-made sample with four feature values;
# the input is pushed through all pipeline steps before the final model predicts.
pipe.predict_proba([[3,3,3,3]])

array([[0., 0., 1.]])

- 중간 단계(변환기)는 fit()과 transform() 메서드를 모두 가져야 한다.
- 마지막 단계(추정기)는 fit() 메서드만 가지면 된다.
- 파이프라인 각 단계의 객체(변환기와 마지막 추정기)는 steps 와 named_steps 속성으로 가져올 수 있다.

In [6]:
# Inspect the pipeline's steps as a list of (name, estimator) pairs.
# NOTE(review): the recorded output below lists MinMaxScaler and has no 'pca'
# step, which does not match the StandardScaler/PCA pipeline defined earlier in
# this notebook — it appears to come from an earlier run; re-execute the
# notebook top to bottom to refresh it.
pipe.steps

[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
 ('model',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=5, p=2,
             weights='uniform'))]

In [7]:
# Same steps, but as a mapping keyed by step name.
# NOTE(review): the recorded output below shows MinMaxScaler rather than the
# StandardScaler used in the current pipeline definition — looks stale; re-run
# to refresh.
pipe.named_steps

{'scaler': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'model': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=5, p=2,
            weights='uniform')}

In [8]:
# Look up the 'scaler' step by name and apply only that step to one sample.
# NOTE(review): the recorded output below was presumably produced by an earlier
# MinMaxScaler-based pipeline — confirm by re-executing the notebook.
pipe.named_steps['scaler'].transform([[3,3,3,3,]])

array([[-0.36111111,  0.36363636,  0.33898305,  1.20833333]])

- 아래는 make_pipeline() 으로 단계 이름을 직접 지정하지 않고 파이프라인을 간단하게 만드는 방법이다.

In [9]:
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

# Load the breast-cancer dataset and hold out a test split.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which held the iris
# split earlier in the notebook; re-running the earlier fit/score cell after
# this one would therefore use the cancer data.
cancer = load_breast_cancer()
# Fixed seed so the split, and the score printed at the end of this cell, are
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)

# make_pipeline builds a Pipeline without explicit (name, estimator) tuples;
# step names are generated automatically.
pipe2 = make_pipeline(StandardScaler(), PCA(n_components=2), SVC(C=100, gamma=0.01))
pipe2.fit(X_train, y_train)
# Last expression of the cell: score on the held-out split is displayed.
pipe2.score(X_test, y_test)

0.951048951048951