## 7.3 파이프라인
- 데이터 전처리와 학습모형을 연결해 코드 간결화

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

#### 파이프라인 적용 전

In [None]:
# Standardization scaling without a pipeline.
# NOTE(review): X_tn, X_te and y_tn are not defined in this cell; presumably
# they come from an earlier train_test_split -- confirm before running.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# The scaler has to be instantiated before it can be fitted. It is fitted on
# the training split only and then reused to transform the test split.
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Training on the standardized training features
clf_linear = LinearRegression()
clf_linear.fit(X_tn_std, y_tn)

#### 파이프라인 적용 후

In [None]:
# Same scaling + regression workflow expressed as a single Pipeline.
# NOTE(review): X_tn and y_tn are not defined in this cell; presumably they
# come from an earlier train_test_split -- confirm before running.
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Pipeline: each entry is a (step name, estimator) pair, applied in order
linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_regression', LinearRegression()),
])

# Training: fitting the pipeline fits the scaler and then the regressor,
# so the raw (unscaled) training features are passed in
linear_pipeline.fit(X_tn, y_tn)

#### 파이프라인 사용 전 전체코드

In [7]:
# Full example WITHOUT a pipeline: split, standardize, fit a linear
# regression, predict, and evaluate with mean squared error.
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version still provides it before running.
raw_boston = datasets.load_boston()

X = raw_boston.data
y = raw_boston.target

# Train / test split (fixed random_state so the split is reproducible)
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=7)

# Standardization scaling: fit on the training split, reuse on the test split
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Training
clf_linear = LinearRegression()
clf_linear.fit(X_tn_std, y_tn)

# Prediction
pred_linear = clf_linear.predict(X_te_std)

# Evaluation (bare expression so the notebook displays the value)
mean_squared_error(y_te, pred_linear)

29.515137790197556

In [8]:
# Full example WITH a pipeline: the scaler and the regressor are fitted and
# applied together, so only the raw features are handled explicitly.

# Train/test split
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=7)

# Pipeline steps, applied in order: standardize, then linear regression
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('linear_regression', LinearRegression()),
]
linear_pipeline = Pipeline(steps=pipeline_steps)

# Training
linear_pipeline.fit(X_tn, y_tn)

# Prediction on the raw test features; the pipeline applies the scaler itself
pred_linear = linear_pipeline.predict(X_te)

# Evaluation (bare expression so the notebook displays the value)
mean_squared_error(y_te, pred_linear)

29.515137790197556

## 7.4 그리드 서치

In [10]:
# Manual grid search: try several values of n_neighbors for a k-NN classifier
# on the iris data and keep the one with the highest accuracy.
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

raw_iris = datasets.load_iris()

# Features and target
X = raw_iris.data
y = raw_iris.target

# Train/test split
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardization: fit on the training split, then transform both splits
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

# NOTE(review): k is chosen by its score on the test split, so the reported
# accuracy is an optimistic estimate; a validation split or cross-validation
# would avoid that.
best_accuracy = 0
final_k = None  # stays None only if no candidate scores above 0

for k in range(1, 11):
    clf_knn = KNeighborsClassifier(n_neighbors=k)
    clf_knn.fit(X_tn_std, y_tn)
    knn_pred = clf_knn.predict(X_te_std)
    accuracy = accuracy_score(y_te, knn_pred)
    # Strict '>' keeps the smallest k when several candidates tie.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        final_k = {'k': k}

print(final_k)
# Report the score of the selected k, not the score of the last k tried.
print(best_accuracy)

{'k': 3}
0.9736842105263158


## 7.6.2 분류문제에서의 성능 평가

In [11]:
# Accuracy: with normalize=True (the default) the fraction of matching labels
# is returned, with normalize=False the number of matching labels.
from sklearn.metrics import accuracy_score
y_pred = [0, 2, 1, 3]
y_true = [0, 1, 2, 3]
for normalize in (True, False):
    print(accuracy_score(y_true, y_pred, normalize=normalize))

0.5
2


In [14]:
# F1 score, averaged over classes with weights proportional to class support
from sklearn.metrics import f1_score
y_pred = [0, 2, 1, 3]
y_true = [0, 1, 2, 3]
weighted_f1 = f1_score(y_true, y_pred, average='weighted')
print(weighted_f1)

0.5


In [16]:
# Confusion matrix: rows index the true labels, columns the predicted labels
from sklearn.metrics import confusion_matrix
y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]
# scikit-learn's signature is confusion_matrix(y_true, y_pred). The original
# cell passed (y_pred, y_true) with the two lists labelled the other way
# round; labels and argument order are now consistent, and the computed
# matrix is unchanged.
confusion_matrix(y_true, y_pred)

array([[2, 0, 0],
       [0, 0, 1],
       [1, 0, 2]], dtype=int64)

In [17]:
# Classification report: per-class precision, recall, F1 and support
from sklearn.metrics import classification_report
y_pred = [0, 1, 2, 2, 0]
y_true = [0, 0, 2, 1, 0]
target_names = [f'class {label}' for label in range(3)]
report = classification_report(y_true, y_pred, target_names=target_names)
print(report)

              precision    recall  f1-score   support

     class 0       1.00      0.67      0.80         3
     class 1       0.00      0.00      0.00         1
     class 2       0.50      1.00      0.67         1

    accuracy                           0.60         5
   macro avg       0.50      0.56      0.49         5
weighted avg       0.70      0.60      0.61         5



## 7.6.3 회귀 문제에서 성능 평가

In [19]:
# Mean absolute error between the true and the predicted values
from sklearn.metrics import mean_absolute_error
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
mae = mean_absolute_error(y_true, y_pred)
print(mae)

0.5


In [20]:
# Mean squared error between the true and the predicted values
from sklearn.metrics import mean_squared_error
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
mse = mean_squared_error(y_true, y_pred)
print(mse)

0.375


In [21]:
# R^2 (coefficient of determination) of the predictions
from sklearn.metrics import r2_score
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
r2 = r2_score(y_true, y_pred)
print(r2)

0.9486081370449679


## 7.6.4 군집문제에서 성능 평가

In [23]:
# Silhouette score of a clustering: each sample is listed with its cluster label
from sklearn.metrics import silhouette_score
labelled_points = [
    ([1, 2], 0),
    ([4, 5], 1),
    ([2, 1], 0),
    ([6, 7], 1),
    ([2, 3], 0),
]
X = [point for point, _ in labelled_points]
labels = [cluster for _, cluster in labelled_points]
sil_score = silhouette_score(X, labels)
print(sil_score)

0.5789497702625118
