In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Load the bundled breast-cancer dataset and pull out the feature matrix and labels.
breast_cancer = load_breast_cancer()
breast_cancer_data, breast_cancer_label = breast_cancer.data, breast_cancer.target
# Class names in label order: index 0 is 'malignant', index 1 is 'benign'.
print(breast_cancer.target_names)

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# identical across runs so every model below is scored on the same test set.
x_train, x_test, y_train, y_test = train_test_split(
    breast_cancer_data,
    breast_cancer_label,
    test_size=0.3,
    random_state=66,
)

['malignant' 'benign']


# 다양한 모델로 학습 + 평가하기
## Decision Tree

In [2]:
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a single decision tree, seeded so the fitted tree is reproducible.
decision_tree = DecisionTreeClassifier(random_state=77)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = decision_tree.fit(x_train, y_train).predict(x_test)

# Per-class precision/recall/F1 followed by the overall accuracy.
print(classification_report(y_true=y_test, y_pred=y_pred))
print("accuracy : ", accuracy_score(y_true=y_test, y_pred=y_pred))

# Confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91        61
           1       0.95      0.95      0.95       110

    accuracy                           0.94       171
   macro avg       0.93      0.93      0.93       171
weighted avg       0.94      0.94      0.94       171

accuracy :  0.935672514619883
[[ 56   5]
 [  6 104]]


## Random Forest

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters; the seed makes the ensemble reproducible.
random_forest = RandomForestClassifier(random_state=55)
random_forest.fit(X=x_train, y=y_train)
y_pred = random_forest.predict(X=x_test)

# Evaluate on the held-out split: text report, accuracy, then confusion matrix
# (rows = true label, columns = predicted label).
rf_accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print("accuracy : ", rf_accuracy)

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95        61
           1       0.97      0.97      0.97       110

    accuracy                           0.96       171
   macro avg       0.96      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171

accuracy :  0.9649122807017544
[[ 58   3]
 [  3 107]]


## SVM Support Vector Machine

In [4]:
from sklearn import svm

# Support vector classifier with all default settings.
# NOTE(review): the features are passed in unscaled; SVC is usually sensitive to
# feature scale, which may explain the weaker recall on class 0 - confirm by
# comparing against a standardized version.
svm_model = svm.SVC()
y_pred = svm_model.fit(x_train, y_train).predict(x_test)

# Same three-part evaluation as the other models so results are comparable.
svm_report = classification_report(y_test, y_pred)
print(svm_report)
print("accuracy : ", accuracy_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.75      0.83        61
           1       0.88      0.96      0.92       110

    accuracy                           0.89       171
   macro avg       0.90      0.86      0.87       171
weighted avg       0.89      0.89      0.89       171

accuracy :  0.8888888888888888
[[ 46  15]
 [  4 106]]


## SGDClassifier

In [5]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent.
# SGD shuffles the training data after each epoch, so an unseeded model yields
# different scores on every run. Fix the seed (as the tree-based models in this
# notebook already do) so Restart & Run All reproduces the same numbers.
# The previously recorded output came from an unseeded run and may differ slightly.
sgd_model = SGDClassifier(random_state=42)
sgd_model.fit(x_train, y_train)
y_pred = sgd_model.predict(x_test)

# Per-class precision/recall/F1 followed by the overall accuracy.
print(classification_report(y_test, y_pred))
print("accuracy : ", accuracy_score(y_test, y_pred))

# Confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.77      0.86        61
           1       0.89      0.99      0.94       110

    accuracy                           0.91       171
   macro avg       0.93      0.88      0.90       171
weighted avg       0.92      0.91      0.91       171

accuracy :  0.9122807017543859
[[ 47  14]
 [  1 109]]


## Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

# Logistic regression using the liblinear solver (chosen explicitly instead of
# the library default).
logistic_model = LogisticRegression(solver='liblinear')
y_pred = logistic_model.fit(x_train, y_train).predict(x_test)

# Text report and accuracy first, confusion matrix last
# (rows = true label, columns = predicted label).
logistic_accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print("accuracy : ", logistic_accuracy)

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93        61
           1       0.95      0.98      0.96       110

    accuracy                           0.95       171
   macro avg       0.96      0.94      0.95       171
weighted avg       0.95      0.95      0.95       171

accuracy :  0.9532163742690059
[[ 55   6]
 [  2 108]]


참고 : solver를 지정하지 않으면(기본값 lbfgs) 아래와 같은 경고(에러가 아님)가 발생하여 solver를 'liblinear'로 지정했다.
ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
solver : 최적화에 사용할 알고리즘을 결정한다('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga')
lbfgs : 준-뉴턴 방식(quasi-Newton methods)의 최적화 알고리즘으로, 제한된 컴퓨터 메모리를 이용하여 기존 BFGS 알고리즘을 속도 면에서 개선한 알고리즘이다.

# 정리

breast_cancer 데이터셋은 양성(benign)을 악성(malignant)으로 잘못 판단하는 것보다 악성을 양성으로 잘못 판단하여 놓치는 것이 더 위험하기 때문에, 악성을 얼마나 빠짐없이 찾아내는지를 나타내는 Recall 값이 큰 모델을 선택했다.

breast_cancer 데이터셋의 경우 Random Forest 모델의 Recall 값이 평균(macro/weighted avg) 0.96, 악성 클래스(0) 기준 0.95로 가장 높게 나왔다.

앞의 wine 데이터셋과 함께 생각해봤을 경우 target(분류할 클래스)의 수가 적고 feature가 많을수록 Random Forest와 Logistic Regression이 가장 높은 결과값을 보여주는 것으로 생각된다.


In [7]:
# Final code: train the selected model (random forest) and report its scores.

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier

# Reload the dataset so this cell runs on its own, independent of earlier cells.
breast_cancer = load_breast_cancer()
breast_cancer_data = breast_cancer.data
breast_cancer_label = breast_cancer.target
# Class names in label order: index 0 is 'malignant', index 1 is 'benign'.
print(breast_cancer.target_names)

# 70/30 split. The seed (22) differs from the one used in the comparison cells (66),
# so the scores below come from a different held-out set than the reports above.
x_train, x_test, y_train, y_test = train_test_split(breast_cancer_data, breast_cancer_label, test_size=0.3, random_state=22)

random_forest = RandomForestClassifier(random_state=55)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)

# F1 with the default pos_label=1, i.e. computed for the 'benign' class.
print(f1_score(y_test, y_pred))
# The model was selected on recall for catching malignant cases, so also report
# recall with the malignant class (label 0) as the positive class.
print("recall (malignant) : ", recall_score(y_test, y_pred, pos_label=0))

['malignant' 'benign']
0.9565217391304348
