# 실습문제: 유방암 분류

In [31]:
import pandas as pd
import seaborn as sns
from sklearn. datasets import load_breast_cancer
import warnings
warnings.filterwarnings('ignore')
from tqdm import trange, notebook 

In [32]:
breast_cancer = load_breast_cancer()

breast_cancer_df = pd.DataFrame(data = breast_cancer.data, columns = breast_cancer.feature_names)
breast_cancer_df['label'] = breast_cancer.target
breast_cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [33]:
breast_cancer_df.shape

(569, 31)

In [34]:
# 독립변수인 label 몇 개의 카테고리로 구성되었는지 확인
breast_cancer_df['label'].unique()

array([0, 1])

- 0, 1로 구성된 이진분류 문제 

In [35]:
# 0, 1 개수 파악 (0: 유방암, 1: 정상)
malignant = (breast_cancer_df['label'] == 0).sum()
benign = (breast_cancer_df['label'] == 1).sum()

print('유방암:{0}, 정상: {1}'.format(malignant, benign) )

유방암:212, 정상: 357


-  유방암 데이터가 더 적음
- 데이터 분할 시, 특정 데이터 유형에 쏠려서 분할이 되지 않도록 stratify 옵션

In [36]:
# 결측치 확인
breast_cancer_df.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
label                      0
dtype: int64

In [37]:
# 중복값
breast_cancer_df.duplicated().sum()

0

In [38]:
from sklearn.model_selection import train_test_split

# 독립변수와 종속변수의 분할
X= breast_cancer_df.iloc[:,:30]
y= breast_cancer_df['label']

In [39]:
# 학습용, 데스트용 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [41]:
tree_model = DecisionTreeClassifier(random_state = 42)
neighbor_model = KNeighborsClassifier(n_neighbors = 42) # n_neighbors: 이웃 수, 기본값 5
svm_model = SVC(random_state = 42)
forest_model = RandomForestClassifier(n_estimators = 300,random_state = 42) # n_estimators 생성할 의사결정나무의 수, 기본값 100
logistic_model = LogisticRegression(random_state=42)
gbm_model =  GradientBoostingClassifier(random_state = 42)
xgb_model = XGBClassifier(n_estimators = 300, random_state = 42)
lgb_model = LGBMClassifier(n_estimators = 300, random_state= 42)

In [42]:
model_list = [tree_model, neighbor_model, svm_model, forest_model, logistic_model, gbm_model, xgb_model, lgb_model]

In [43]:
import numpy as np
for model in model_list:
    model.fit(X_train,y_train)
    score = model.score(X_test,y_test)
    model_name = model.__class__.__name__
    print('{0} 정확도: {1:.2f}'.format(model_name, score))

DecisionTreeClassifier 정확도: 0.91
KNeighborsClassifier 정확도: 0.91
SVC 정확도: 0.90
RandomForestClassifier 정확도: 0.97
LogisticRegression 정확도: 0.96
GradientBoostingClassifier 정확도: 0.95
XGBClassifier 정확도: 0.98
LGBMClassifier 정확도: 0.97


In [52]:
# 분류모델의 평가지표 확인 classifier_evaluation
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def classifier_evaluation(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print('정확도:{0:.2f}, 정밀도{1:.2f}, 재현율{2:.2f}, F1-score:{3:.2f}'.format(accuracy, precision,recall, f1))
    print('혼동행렬', confusion, sep = '\n') # sep 구분자

In [57]:
for model in model_list:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_name = model.__class__.__name__
    print('\n{0} 평가지표:'.format(model_name))
    classifier_evaluation(y_test, y_pred)


DecisionTreeClassifier 평가지표:
정확도:0.91, 정밀도0.91, 재현율0.96, F1-score:0.93
혼동행렬
[[44  9]
 [ 4 86]]

KNeighborsClassifier 평가지표:
정확도:0.91, 정밀도0.90, 재현율0.97, F1-score:0.93
혼동행렬
[[43 10]
 [ 3 87]]

SVC 평가지표:
정확도:0.90, 정밀도0.86, 재현율1.00, F1-score:0.92
혼동행렬
[[38 15]
 [ 0 90]]

RandomForestClassifier 평가지표:
정확도:0.97, 정밀도0.97, 재현율0.99, F1-score:0.98
혼동행렬
[[50  3]
 [ 1 89]]

LogisticRegression 평가지표:
정확도:0.96, 정밀도0.94, 재현율1.00, F1-score:0.97
혼동행렬
[[47  6]
 [ 0 90]]

GradientBoostingClassifier 평가지표:
정확도:0.95, 정밀도0.94, 재현율0.99, F1-score:0.96
혼동행렬
[[47  6]
 [ 1 89]]

XGBClassifier 평가지표:
정확도:0.98, 정밀도0.97, 재현율1.00, F1-score:0.98
혼동행렬
[[50  3]
 [ 0 90]]

LGBMClassifier 평가지표:
정확도:0.97, 정밀도0.96, 재현율0.99, F1-score:0.97
혼동행렬
[[49  4]
 [ 1 89]]


In [66]:
# 모델 성능 개선 최적 파라미터
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators':[100],
    'max_depth':[6,8,10,12],
    'min_samples_leaf':[8,12,18],
    'min_samples_split':[8,16,20]
}

In [67]:
grid_cv = GridSearchCV(forest_model, param_grid = params, cv = 2)
grid_cv.fit(X_train, y_train)

print('최적조건:',grid_cv.best_params_)
print('\n -- 테스트 결과 --')
y_pred = model.predict(X_test)
classifier_evaluation(y_test, y_pred)

최적조건: {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}

 -- 테스트 결과 --
정확도:0.97, 정밀도0.96, 재현율0.99, F1-score:0.97
혼동행렬
[[49  4]
 [ 1 89]]
