In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier 
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression 

from sklearn.metrics import classification_report
from sklearn.metrics import recall_score

In [2]:
# Single random seed shared by the train/test split and every estimator below,
# so a fresh "Restart & Run All" reproduces the same numbers.
seed: int = 42

In [3]:
# Load the breast cancer dataset bundled with scikit-learn and pull out the
# feature matrix and the label vector in one step.
cancer = load_breast_cancer()
cancer_data, cancer_label = cancer.data, cancer.target

In [16]:
# Class names in label order: label 0 is 'malignant', label 1 is 'benign'.
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

malignant : 악성, 
benign : 양성

In [4]:
# Number of samples per class label (index 0 = malignant, index 1 = benign),
# used to check how imbalanced the dataset is.
np.bincount(cancer_label)

array([212, 357])

In [5]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_data,
    cancer_label,
    test_size=0.2,
    random_state=seed,
)

In [6]:
# Shapes of the train/test features and labels, in the order
# (X_train, y_train, X_test, y_test).
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((455, 30), (455,), (114, 30), (114,))

In [7]:
# Per-class sample counts for the training labels, then the test labels.
tuple(np.bincount(labels) for labels in (y_train, y_test))

(array([169, 286]), array([43, 71]))

In [8]:
# Decision tree baseline: fit on the training split, then report per-class
# precision / recall / f1 on the held-out test split.
decision_tree = DecisionTreeClassifier(random_state=seed).fit(X_train, y_train)
decision_y = decision_tree.predict(X_test)
decision_report = classification_report(y_test, decision_y)
print(decision_report)

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [9]:
# Random forest: fit on the training split and evaluate on the test split
# with the same report used for the other models.
random_forest = RandomForestClassifier(random_state=seed).fit(X_train, y_train)
random_y = random_forest.predict(X_test)
random_report = classification_report(y_test, random_y)
print(random_report)

              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [10]:
# Support vector classifier with scikit-learn's default kernel.
# A linear-kernel SVC used to be constructed on the line above and was
# overwritten before being fitted, so it never contributed to the report;
# that dead assignment has been removed. The results below come from this model.
svm_model = svm.SVC(random_state=seed)

svm_model.fit(X_train, y_train)
svm_y = svm_model.predict(X_test)
print(classification_report(y_test, svm_y))

              precision    recall  f1-score   support

           0       1.00      0.86      0.92        43
           1       0.92      1.00      0.96        71

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



In [11]:
# Linear classifier trained with stochastic gradient descent; random_state
# fixes the training randomness so the report is reproducible.
sgd_model = SGDClassifier(random_state=seed).fit(X_train, y_train)
sgd_y = sgd_model.predict(X_test)
sgd_report = classification_report(y_test, sgd_y)
print(sgd_report)

              precision    recall  f1-score   support

           0       1.00      0.74      0.85        43
           1       0.87      1.00      0.93        71

    accuracy                           0.90       114
   macro avg       0.93      0.87      0.89       114
weighted avg       0.92      0.90      0.90       114



In [12]:
# Logistic regression with a large iteration cap (max_iter=10000),
# presumably so the solver can converge on these unscaled features.
logistic_model = LogisticRegression(
    random_state=seed,
    max_iter=10000,
).fit(X_train, y_train)
log_y = logistic_model.predict(X_test)
log_report = classification_report(y_test, log_y)
print(log_report)

              precision    recall  f1-score   support

           0       0.97      0.91      0.94        43
           1       0.95      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



malignant(악성)와 benign(양성)의 비율이 약 37:63 이어서 각 평가 지표들의 weighted avg를 사용하였다.

암 진단에서 악성(0)을 양성(1)로 예측하면 안 되기에, 양성(1)으로 예측한 것 중 실제 양성의 비율인 precision이 가장 중요한 평가 지표이다.

Random Forest가 0.97의 precision으로 가장 높은 성능을 보였고, 

Random Forest와 Logistic Regression이 f1-score와 recall에서 0.96으로 가장 높은 성능을 보였다.

Random Forest가 최적의 모델이다.