## 보팅방식 모델 구현 <hr>
- 데이터 : sklearn.datasets 의 breast_cancer
- 유  형 : 지도학습 + 분류
- 방  법 : Voting방식으로 진행 => LogisticRegression, DecisionTreeClassifier, SVC
- 학습데이터셋 : 동일한 데이터셋으로 3개의 모델로 학습 진행

[1] 모듈 로딩 및 데이터 준비 <hr>

In [89]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

In [90]:
X, y = load_breast_cancer(as_frame=True, return_X_y=True)

In [91]:
X.shape, y.shape

((569, 30), (569,))

In [92]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=17, stratify=y)

[2] 학습 진행 <hr>

[2-1] 앙상블 보팅 학습에 사용할 모델 인스턴스 생성

In [93]:
# LogisticRegression 인스턴스

from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(X_train, y_train)

In [94]:
# DecisionTreeClassifier 인스턴스
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

In [95]:
# SVC 인스턴스
from sklearn.svm import SVC
svc_model = SVC(probability=True)
svc_model.fit(X_train, y_train)

[2-2] Ensemble 알고리즘 기반 분류

In [96]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

In [97]:
# 동일 데이터셋으로 병렬 학습 진행할 모델 리스트 선정 및 결과 결정 방법 설정
vt_models = VotingClassifier(estimators=[('lr_model', lr_model), 
                                        ('dt_model', dt_model), 
                                        ('svc_model', svc_model)],
                            verbose=True, voting='soft'
                            )

In [98]:
# 동일 데이터셋을 전달해서 3개의 모델 동시에 학습 진행
vt_models.fit(X_train, y_train)

[Voting] ................. (1 of 3) Processing lr_model, total=   0.0s
[Voting] ................. (2 of 3) Processing dt_model, total=   0.0s
[Voting] ................ (3 of 3) Processing svc_model, total=   0.0s


In [99]:
# 예측하기
new_data = pd.DataFrame([X_test.iloc[0]], columns=X_test.columns)

vt_models.predict_proba([X_test.iloc[0]])




array([[0.72446766, 0.27553234]])

In [100]:
X_test.iloc[0].to_frame().T

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072


In [101]:
# 예측하기
vt_models.predict(new_data)
# vt_models.predict_proba(new_data)           # hard가 아니라 soft일때만 proba 가능 : logistic은 잇으나, SVC는 메서드 등..

array([0])

In [102]:
# 보팅 인스턴스 내의 학습기들 => 접근 방법 (1)
# 
vt_models.estimators_

[LogisticRegression(solver='liblinear'),
 DecisionTreeClassifier(),
 SVC(probability=True)]

In [103]:
# 보팅 인스턴스 내의 학습기들 => 접근 방법 (2)
vt_models.named_estimators_.get('lr_model').predict(new_data), vt_models.named_estimators_.get('dt_model').predict(new_data), vt_models.named_estimators_.get('svc_model').predict(new_data)

(array([0], dtype=int64), array([0], dtype=int64), array([1], dtype=int64))

In [104]:
for key, value in vt_models.named_estimators_.items():
    print(key,value.predict(new_data)[0], value.predict_proba(new_data)[0])

lr_model 0 [0.93004942 0.06995058]
dt_model 0 [1. 0.]
svc_model 1 [0.24335357 0.75664643]
