## 모듈 import

In [26]:
import sklearn                      # 필요한 라이브러리와 함수들을 호출

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import pandas as pd                 # pandas 라이브러리를 pd라는 약칭으로 호출

## 데이터 준비

In [27]:
# Load the breast cancer (Wisconsin diagnostic) dataset that ships with scikit-learn.
breast_cancer = load_breast_cancer()    # store the loaded dataset object in `breast_cancer`

print(type(breast_cancer))              # check the type of the returned object
print(dir(breast_cancer))               # list the attributes the object exposes


<class 'sklearn.utils.Bunch'>
['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'frame', 'target', 'target_names']


In [28]:
breast_cancer.data.shape                # (rows, columns): 569 samples, each described by 30 feature values

(569, 30)

In [31]:
breast_cancer_data = breast_cancer.data # keep the feature matrix in its own variable

breast_cancer_data[0]                   # show the sample at index 0: one value for each of the 30 features counted above

array([1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
       3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
       8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
       3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
       1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01])

In [32]:
breast_cancer.feature_names             # names of the 30 features, i.e. what each value means (radius, texture, perimeter, ...)

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [33]:
breast_cancer.target_names              # the two classes each sample is assigned to: 'malignant' and 'benign'

array(['malignant', 'benign'], dtype='<U9')

In [34]:
breast_cancer_label = breast_cancer.target        # keep the target (label) array in its own variable

print(breast_cancer_label.shape)                  # one label per sample: 569 entries

breast_cancer_label                               # labels are encoded as 0 and 1 (0 = malignant, 1 = benign, following target_names order)

(569,)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

## 데이터 이해하기

* Feature Data 지정하기
* Label Data 지정하기
* Target Names 출력해 보기
* 데이터 Describe 해 보기

In [35]:
## Build the feature data / label data

# Convert the feature array into a DataFrame whose columns are the feature names.
breast_cancer_df = pd.DataFrame(data=breast_cancer_data, columns=breast_cancer.feature_names)
breast_cancer_df                             # display the frame: 569 rows and the 30 feature columns used for classification

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [36]:
breast_cancer_df["label"] = breast_cancer.target   # append breast_cancer.target to the frame as a new "label" column

breast_cancer_df                                   # display the frame again, now including the "label" column

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [37]:
## Print the target names

breast_cancer.target_names                 # class names of the target: malignant / benign

array(['malignant', 'benign'], dtype='<U9')

In [38]:
## Dataset description

print(breast_cancer.DESCR)                 # print the description text bundled with the dataset

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

## Train / Test 데이터 분리

In [39]:
# Split the features and labels into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data,         # features
                                                    breast_cancer_label,        # labels
                                                    test_size=0.2,              # 80% for training / 20% held out for evaluation
                                                    random_state=7)             # fixed seed: reproducible shuffle, so every model is evaluated on the same split


print('X_train 개수: ', len(X_train),', X_test 개수: ', len(X_test))            # confirm the train / test sizes via the array lengths
                                                                                # -> the data is now ready for training and evaluation
                                                                                # (same procedure as in the earlier projects)

X_train 개수:  455 , X_test 개수:  114


## 다양한 모델로 학습시켜보기

* Decision Tree 사용해 보기
* Random Forest 사용해 보기
* SVM 사용해 보기
* SGD Classifier 사용해 보기
* Logistic Regression 사용해 보기

In [40]:
## 학습에 필요한 다양한 모델 import

from sklearn.tree import DecisionTreeClassifier           # 의사결정나무
from sklearn.ensemble import RandomForestClassifier       # 랜덤포레스트
from sklearn import svm                                   # Support Vector Machine
from sklearn.linear_model import SGDClassifier            # 확률적 경사 하강
from sklearn.linear_model import LogisticRegression       # 로지스틱 회귀

In [41]:
## Decision Tree

# Create the decision-tree classifier with a fixed seed and train it on the
# training split in one step (fit() returns the fitted estimator itself).
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)

# Predict the labels of the held-out test features.
y_pred = decision_tree.predict(X_test)

# Report per-class precision / recall / f1 by comparing the predictions
# against the true test labels.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.82      0.87        40
           1       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114



In [42]:
## Random Forest

# Create the random-forest classifier with a fixed seed and train it on the
# training split in one step (fit() returns the fitted estimator itself).
random_forest = RandomForestClassifier(random_state=64).fit(X_train, y_train)

# Predict the labels of the held-out test features.
y_pred = random_forest.predict(X_test)

# Report per-class precision / recall / f1 for the predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       0.99      1.00      0.99        74

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114



In [43]:
## SVM

# Create a support vector classifier with default settings and train it on the
# training split in one step (fit() returns the fitted estimator itself).
svm_model = svm.SVC().fit(X_train, y_train)

# Predict the labels of the held-out test features.
y_pred = svm_model.predict(X_test)

# Report per-class precision / recall / f1 for the predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                           0.90       114
   macro avg       0.94      0.86      0.89       114
weighted avg       0.92      0.90      0.90       114



In [44]:
## SGD

# Create the SGD classifier with a fixed seed. SGDClassifier shuffles the
# training data between epochs, so without random_state the fitted model and
# the report below change from run to run, which defeats the purpose of the
# fixed train/test split used to compare the models.
# NOTE(review): the features are passed in unscaled; SGD is sensitive to
# feature scale, so standardizing them first may be worth trying -- confirm.
sgd_model = SGDClassifier(random_state=7)

sgd_model.fit(X_train, y_train)                       # train the SGD model on the training split
y_pred = sgd_model.predict(X_test)                    # predict the labels of the test split

print(classification_report(y_test, y_pred))          # report per-class precision / recall / f1

              precision    recall  f1-score   support

           0       1.00      0.70      0.82        40
           1       0.86      1.00      0.92        74

    accuracy                           0.89       114
   macro avg       0.93      0.85      0.87       114
weighted avg       0.91      0.89      0.89       114



In [45]:
# Logistic Regression

# Create the logistic-regression model (iteration cap raised to 5000) and
# train it on the training split in one step (fit() returns the fitted
# estimator itself).
logistic_model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Predict the labels of the held-out test features.
y_pred = logistic_model.predict(X_test)

# Report per-class precision / recall / f1 for the predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        40
           1       0.93      1.00      0.96        74

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



## 모델을 평가해 보기

* Breast cancer dataset은 유방암을 진단하기 위한 데이터이므로, 실제 악성 종양을 양성으로 잘못 판정하는 False Negative(FN)가 가장 치명적이다. 따라서 FN이 적을수록 높아지는 recall을 중점적으로 봐야 한다.
* 0 (악성, malignant) 클래스에 대한 예측 결과에서 recall이 0.97로 가장 높은 Random Forest 모델이 가장 적합하다고 생각한다.

## Discussion
* 이전의 두 프로젝트에서는 주어진 데이터를 예측한 뒤 accuracy만 보고 모델을 선택했다. 반면 breast_cancer dataset에서는 데이터를 적절히 처리하여 정해진 모델로 예측하는 것뿐만 아니라, 평가가 이뤄진 후에 그 결과를 분석하는 것 또한 중요하다는 것을 배울 수 있었다.
* 단순히 코드나 알고리즘에 대한 이해뿐만 아니라, 분석하려는 데이터에 대한 이해도 함께 수반되어야 함을 느낄 수 있었다.