In [10]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import pandas as pd

In [11]:
cancer = load_breast_cancer()
cancer_data = cancer.data
cancer_label = cancer.target


In [12]:
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [13]:
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [14]:
X_train, X_test, y_train, y_test = train_test_split(cancer_data,
                                                    cancer_label,
                                                    test_size=0.2,
                                                    random_state=15)

decision_tree = DecisionTreeClassifier(random_state=15)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("accuracy:",accuracy)
print(classification_report(y_test, y_pred))

accuracy: 0.9473684210526315
              precision    recall  f1-score   support

           0       0.97      0.87      0.92        39
           1       0.94      0.99      0.96        75

    accuracy                           0.95       114
   macro avg       0.95      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



In [15]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(cancer_data,
                                                    cancer_label,
                                                    test_size=0.2,
                                                    random_state=15)


random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("accuracy:",accuracy)
print(classification_report(y_test, y_pred))

accuracy: 0.9385964912280702
              precision    recall  f1-score   support

           0       0.94      0.87      0.91        39
           1       0.94      0.97      0.95        75

    accuracy                           0.94       114
   macro avg       0.94      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114



In [16]:
from sklearn import svm
svm_model = svm.SVC()

svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("accuracy:",accuracy)
print(classification_report(y_test, y_pred))

accuracy: 0.868421052631579
              precision    recall  f1-score   support

           0       0.93      0.67      0.78        39
           1       0.85      0.97      0.91        75

    accuracy                           0.87       114
   macro avg       0.89      0.82      0.84       114
weighted avg       0.88      0.87      0.86       114



In [17]:
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("accuracy:",accuracy)
print(classification_report(y_test, y_pred))

accuracy: 0.8421052631578947
              precision    recall  f1-score   support

           0       0.89      0.62      0.73        39
           1       0.83      0.96      0.89        75

    accuracy                           0.84       114
   macro avg       0.86      0.79      0.81       114
weighted avg       0.85      0.84      0.83       114



In [18]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(max_iter=3000)
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("accuracy:",accuracy)
print(classification_report(y_test, y_pred))

accuracy: 0.8947368421052632
              precision    recall  f1-score   support

           0       0.86      0.82      0.84        39
           1       0.91      0.93      0.92        75

    accuracy                           0.89       114
   macro avg       0.89      0.88      0.88       114
weighted avg       0.89      0.89      0.89       114



DecisionTreeClassifier  0.95
RandomForestClassifier  0.94 
svm                     0.87
SGDClassifier           0.84
LogisticRegression      0.89


DecisionTreeClassifier,RandomForestClassifier에서 상대적으로 높은 정확도를 보인다