In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import *
from sklearn.datasets import load_breast_cancer


In [2]:
breast = load_breast_cancer()

In [4]:
print(dir(breast))

['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'frame', 'target', 'target_names']


In [5]:
data = breast.data
label = breast.target

In [6]:
print(breast.target_names)

['malignant' 'benign']


In [7]:
print(breast.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size = 0.2, stratify = label, random_state = 42)

In [9]:
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
sgd = SGDClassifier()
ld = LogisticRegression()

In [10]:
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print(classification_report(y_test, dt_pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89        42
           1       0.96      0.90      0.93        72

    accuracy                           0.91       114
   macro avg       0.90      0.92      0.91       114
weighted avg       0.92      0.91      0.91       114



In [11]:
print('accuracy :', accuracy_score(y_test, dt_pred))
print('recall :', recall_score(y_test, dt_pred))
print('precision :',precision_score(y_test, dt_pred))
print('f1 score :', f1_score(y_test, dt_pred))

accuracy : 0.9122807017543859
recall : 0.9027777777777778
precision : 0.9558823529411765
f1 score : 0.9285714285714286


In [12]:
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print(classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92        42
           1       0.96      0.94      0.95        72

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114



In [13]:
print('accuracy :', accuracy_score(y_test, rf_pred))
print('recall :', recall_score(y_test, rf_pred))
print('precision :',precision_score(y_test, rf_pred))
print('f1 score :', f1_score(y_test, rf_pred))

accuracy : 0.9385964912280702
recall : 0.9444444444444444
precision : 0.9577464788732394
f1 score : 0.951048951048951


In [14]:
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print(classification_report(y_test, svm_pred))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90        42
           1       0.92      0.97      0.95        72

    accuracy                           0.93       114
   macro avg       0.93      0.91      0.92       114
weighted avg       0.93      0.93      0.93       114



In [15]:
print('accuracy :', accuracy_score(y_test, svm_pred))
print('recall :', recall_score(y_test, svm_pred))
print('precision :',precision_score(y_test, svm_pred))
print('f1 score :', f1_score(y_test, svm_pred))

accuracy : 0.9298245614035088
recall : 0.9722222222222222
precision : 0.9210526315789473
f1 score : 0.9459459459459458


In [16]:
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_test)
print(classification_report(y_test, sgd_pred))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90        42
           1       0.93      0.96      0.95        72

    accuracy                           0.93       114
   macro avg       0.93      0.92      0.92       114
weighted avg       0.93      0.93      0.93       114



In [17]:
print('accuracy :', accuracy_score(y_test, sgd_pred))
print('recall :', recall_score(y_test, sgd_pred))
print('precision :',precision_score(y_test, sgd_pred))
print('f1 score :', f1_score(y_test, sgd_pred))

accuracy : 0.9298245614035088
recall : 0.9583333333333334
precision : 0.9324324324324325
f1 score : 0.9452054794520548


In [18]:
ld.fit(X_train, y_train)
ld_pred = ld.predict(X_test)
print(classification_report(y_test, ld_pred))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94        42
           1       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
print('accuracy :', accuracy_score(y_test, ld_pred))
print('recall :', recall_score(y_test, ld_pred))
print('precision :',precision_score(y_test, ld_pred))
print('f1 score :', f1_score(y_test, ld_pred))

accuracy : 0.956140350877193
recall : 0.9722222222222222
precision : 0.958904109589041
f1 score : 0.9655172413793104


가장 성능이 좋은 모델은 Logistic Regression이다. 이는 binary classification이기 때문이지 않을까 생각된다.

이번 데이터의 경우 label이 212 - Malignant, 357 - Benign 로 약간의 불균형을 가지는 것을 확인 할 수 있다. 이 경우 accuracy로 진행하게 되면 한쪽에 편향된 결과에 대해 좋은 모델이라고 판단할 수 있기 때문에 recall과 precision을 이용한 조화평균인 f1 score를 이용하는 것이 적절하다고 생각된다.