## (1) 필요한 모듈 import하기

In [1]:
# 분리
from sklearn.model_selection import train_test_split

# 모델링
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

# report
from sklearn.metrics import classification_report

## (2) 데이터 준비

In [2]:
# 데이터셋 불러오기
from sklearn.datasets import load_breast_cancer

## (3) 데이터 이해하기

In [3]:
breast_cancer = load_breast_cancer()

In [4]:
breast_cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [5]:
breast_cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [6]:
data = breast_cancer.data
target = breast_cancer.target

## (4) train, test 데이터 분리

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=77)
X_train

array([[1.242e+01, 1.504e+01, 7.861e+01, ..., 4.052e-02, 2.901e-01,
        6.783e-02],
       [1.825e+01, 1.998e+01, 1.196e+02, ..., 1.932e-01, 3.063e-01,
        8.368e-02],
       [1.454e+01, 2.754e+01, 9.673e+01, ..., 1.712e-01, 4.218e-01,
        1.341e-01],
       ...,
       [1.727e+01, 2.542e+01, 1.124e+02, ..., 1.739e-01, 2.500e-01,
        7.944e-02],
       [1.185e+01, 1.746e+01, 7.554e+01, ..., 9.140e-02, 3.101e-01,
        7.007e-02],
       [1.403e+01, 2.125e+01, 8.979e+01, ..., 7.963e-02, 2.226e-01,
        7.617e-02]])

## (5) 다양한 모델로 학습시켜보기

In [8]:
# DecisionTree
model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)

DecisionTreeClassifier()

In [9]:
# RandomForest
model_random_forest = RandomForestClassifier()
model_random_forest.fit(X_train, y_train)

RandomForestClassifier()

In [10]:
# SVM
model_svc = SVC()
model_svc.fit(X_train, y_train)

SVC()

In [11]:
# SGD
model_sgd = SGDClassifier()
model_sgd.fit(X_train, y_train)

SGDClassifier()

In [12]:
# Logistic Regression
model_LR = LogisticRegression()
model_LR.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

## (6) 모델을 평가해 보기

In [13]:
# Decision Tree
y_pred = model_tree.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91        40
           1       0.95      0.96      0.95        74

    accuracy                           0.94       114
   macro avg       0.93      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114



In [14]:
# Random Forest
y_pred = model_random_forest.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.90      0.97      0.93        36
           1       0.99      0.95      0.97        78

    accuracy                           0.96       114
   macro avg       0.94      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114



In [15]:
# SVM
y_pred = model_svc.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.77      0.97      0.86        31
           1       0.99      0.89      0.94        83

    accuracy                           0.91       114
   macro avg       0.88      0.93      0.90       114
weighted avg       0.93      0.91      0.92       114



In [16]:
# SGD
y_pred = model_sgd.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.90      0.80      0.84        44
           1       0.88      0.94      0.91        70

    accuracy                           0.89       114
   macro avg       0.89      0.87      0.88       114
weighted avg       0.89      0.89      0.88       114



In [17]:
# Logistic Regression
y_pred = model_LR.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        37
           1       0.97      0.95      0.96        77

    accuracy                           0.95       114
   macro avg       0.94      0.95      0.94       114
weighted avg       0.95      0.95      0.95       114



Q 모델의 성능을 평가하는 지표로는 무엇이 좋을까요? 
sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 
선택하신 이유도 설명해 주세요.

A Random Forest, logistic Regression를 평가 지표로 사용하는 것이 가장 적절할 것 같다. 
암은 반드시 발견되어야 하고, 단 한 명의 환자도 놓치면 안되기 때문에 recall이 안정적인 모델을 택했다.