In [1]:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd

bc = datasets.load_breast_cancer()

df = pd.DataFrame(bc.data, columns=bc.feature_names)
print(df.shape)
df.head()

(569, 30)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
x_train, x_test, y_train, y_test = train_test_split(
    bc.data, bc.target, test_size=0.2, shuffle=True, stratify=bc.target, random_state=42
)

In [4]:
from collections import Counter

# 라벨 균형 확인
print(Counter(y_train))
print(Counter(y_test))

Counter({1: 285, 0: 170})
Counter({1: 72, 0: 42})


## Logistic Regression

In [5]:
# 모델 훈련
logistic_model = LogisticRegression(penalty='l1', solver='liblinear')
logistic_model.fit(x_train, y_train)

# 모델 테스트
lr_acc = logistic_model.score(x_test, y_test)
print("Logistic Regression 테스트셋 정확도 :", lr_acc)

Logistic Regression 테스트셋 정확도 : 0.9649122807017544




## SVM

In [6]:
from sklearn.svm import SVC

svm = SVC(C=10, kernel='linear')
svm.fit(x_train, y_train)

svm_acc = svm.score(x_test, y_test)
print("SVM 테스트셋 정확도 :", svm_acc)

SVM 테스트셋 정확도 : 0.9649122807017544


## 나이브 베이즈 분류

In [7]:
from sklearn.naive_bayes import GaussianNB

naive_model = GaussianNB()
naive_model.fit(x_train, y_train)

naive_acc = naive_model.score(x_test, y_test)
print("나이브 베이즈 테스트셋 정확도 :", naive_acc)

나이브 베이즈 테스트셋 정확도 : 0.9385964912280702


## KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

best_k = 0
best_score = 0

for k in range(5, 35, 5):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    knn_acc = knn.score(x_test, y_test)
    print("K :", k)
    print("KNN 테스트셋 정확도 :", knn_acc)

K : 5
KNN 테스트셋 정확도 : 0.9122807017543859
K : 10
KNN 테스트셋 정확도 : 0.9298245614035088
K : 15
KNN 테스트셋 정확도 : 0.9298245614035088
K : 20
KNN 테스트셋 정확도 : 0.9385964912280702
K : 25
KNN 테스트셋 정확도 : 0.9385964912280702
K : 30
KNN 테스트셋 정확도 : 0.9385964912280702


In [12]:
knn = KNeighborsClassifier(n_neighbors=20, weights='distance')
knn.fit(x_train, y_train)
knn_acc = knn.score(x_test, y_test)
print(knn_acc)

0.9473684210526315


In [13]:
df = pd.DataFrame(
    {
        "LR": [lr_acc],
        "SVM": [svm_acc],
        "NB": [naive_acc],
        "KNN": [knn_acc]
    },
    index=["Accuracy"]
)

df

Unnamed: 0,LR,SVM,NB,KNN
Accuracy,0.964912,0.964912,0.938596,0.947368


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from collections import defaultdict

def get_scores(model):
    pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    return [round(accuracy, 2), round(precision, 2), round(recall, 2), round(f1, 2)]

scores = []

for model in [logistic_model, svm, naive_model, knn]:
    score = get_scores(model)
    scores.append(score)

df = pd.DataFrame(
    {"LR": scores[0], "SVM": scores[1], "naive": scores[2], "knn": scores[3]}, 
    index=["accuracy", "precisoin", "recall", "f1"]
)
df

Unnamed: 0,LR,SVM,naive,knn
accuracy,0.96,0.96,0.94,0.95
precisoin,0.96,0.96,0.95,0.95
recall,0.96,0.96,0.95,0.95
f1,0.97,0.97,0.95,0.96


각 모델의 성능은 하이퍼 파라미터 튜닝에 따라서 달라질 수 있다.

예를 들어, svc의 경우 C, kernel과 같은 값을 조절하면 성능이 크게 달라진다.

하이퍼 파라미터들은 아래 링크 참고

* Logistic Regression : https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
* SVC : https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
* Naive Bayes : https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
* KNN : https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html