# 보팅

In [10]:
# 필요한 모듈과 데이터 불러오기
import pandas as pd
import numpy  as np

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Load the breast cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()

# Wrap the feature matrix in a DataFrame with named columns,
# then preview the first three rows as the cell output.
data_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
data_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [5]:
# Base learners combined by the voting ensemble: logistic regression and KNN.
logistic_regression = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=8)

# Soft-voting ensemble built from the two base learners.
named_estimators = [
    ('LogisticRegression', logistic_regression),
    ('KNN', knn),
]
voting_model = VotingClassifier(estimators=named_estimators, voting='soft')

# Hold out 20% of the samples as a test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.2,
    random_state=156,
)

# Fit, predict and score the voting ensemble.
voting_model.fit(X_train, y_train)
pred = voting_model.predict(X_test)
voting_accuracy = accuracy_score(y_test, pred)
print('보팅 분류기의 정확도: {0: .4f}'.format(voting_accuracy))

# Fit, predict and score each base learner on its own for comparison.
classifiers = [logistic_regression, knn]
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    class_name = type(classifier).__name__
    print('{0} 정확도: {1:.4f}'.format(class_name, accuracy_score(y_test, pred)))

보팅 분류기의 정확도:  0.9474
LogisticRegression 정확도: 0.9386
KNeighborsClassifier 정확도: 0.9386


# 랜덤 포레스트 훈련

In [7]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Cross-validate a random forest on the training split, keeping the
# training-fold scores alongside the validation-fold scores.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
scores = cross_validate(rf, X_train, y_train, return_train_score=True, n_jobs=-1)

# Print the mean training score first, then the mean validation score.
train_mean, test_mean = (np.mean(scores[key]) for key in ('train_score', 'test_score'))
print(train_mean, test_mean)

1.0 0.9626373626373625


In [12]:
# Fit the forest on the full training split and print its per-feature importances.
rf.fit(X_train, y_train)
importances = rf.feature_importances_
print(importances)

[0.05552102 0.01504054 0.07287499 0.05176543 0.00535215 0.0096741
 0.04780472 0.09027433 0.00330074 0.0039983  0.01370201 0.00237361
 0.00899107 0.03353481 0.00511289 0.00654555 0.00362744 0.00302006
 0.00283228 0.00411576 0.10305531 0.01502146 0.08819606 0.16024923
 0.0115159  0.01755399 0.03542628 0.11520387 0.010327   0.00398912]


In [14]:
# Rebuild the forest with out-of-bag scoring enabled, fit it on the
# training split, and print the resulting OOB score.
rf = RandomForestClassifier(
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, y_train)
oob_accuracy = rf.oob_score_
print(oob_accuracy)

0.9604395604395605


# XGBoost vs LightGBM

In [16]:
from xgboost import XGBClassifier

In [18]:
# Cross-validate an XGBoost classifier (histogram tree method) on the training split.
xgb = XGBClassifier(tree_method='hist', random_state=42)
scores = cross_validate(xgb, X_train, y_train, return_train_score=True, n_jobs=-1)

# Print the mean training score first, then the mean validation score.
train_mean, test_mean = (np.mean(scores[key]) for key in ('train_score', 'test_score'))
print(train_mean, test_mean)

1.0 0.9516483516483516


In [20]:
from lightgbm import LGBMClassifier

In [22]:
# Cross-validate a LightGBM classifier on the training split.
lgb = LGBMClassifier(random_state=42)
scores = cross_validate(lgb, X_train, y_train, return_train_score=True, n_jobs=-1)

# Print the mean training score first, then the mean validation score.
train_mean, test_mean = (np.mean(scores[key]) for key in ('train_score', 'test_score'))
print(train_mean, test_mean)

1.0 0.9604395604395604
