1. Membaca dataset (load dataset)

In [1]:
from sklearn import datasets
import pandas as pd

bc = datasets.load_breast_cancer()
df_bc = pd.DataFrame(bc.data, columns = bc.feature_names)
df_bc["target"] = bc.target

x_bc = df_bc.iloc[:, :-1]
y_bc = df_bc["target"]

In [2]:
from sklearn.model_selection import train_test_split

x_train_bc, x_test_bc, y_train_bc, y_test_bc = train_test_split(
    x_bc, 
    y_bc, 
    test_size = 0.2, 
    random_state = 568
)

2. Melakukan Pembelajaran
3. Simpan Model Hasil Pembelajaran
4. Proses Preditiction

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def get_fit_model(model, x_train, y_train):
    return model.fit(x_train, y_train)

def get_y_pred(fit_model, x_test):
    y_pred = fit_model.predict(x_test)
    return y_pred

def get_all_metrics_score(y_test, y_pred):
    return [
        accuracy_score(y_test, y_pred), 
        precision_score(y_test, y_pred, average = "micro"),
        recall_score(y_test, y_pred, average = "micro"),
        f1_score(y_test, y_pred, average = "micro")
    ]

def get_confusion_matrix(y_test, y_pred):
    return confusion_matrix(y_test, y_pred)

In [21]:
import pickle
def saveModel(model, destPath):
    # pkl extension
    pickle.dump(model, open(destPath, 'wb'))
    
def loadModel(srcPath):
    # pkl extension
    return pickle.load(open(srcPath, 'rb'))

A. DecisionTreeClassifier

In [22]:
# DecisionTreeClassifier

from sklearn import tree
from sklearn.tree import export_text

dt = tree.DecisionTreeClassifier(criterion = "entropy", max_features = "sqrt", random_state = 40)
dt_fit = get_fit_model(dt, x_train_bc, y_train_bc)
saveModel(dt_fit, "./model/dt_fit.pkl")

In [23]:
dt_fit = loadModel("./model/dt_fit.pkl")
y_pred = get_y_pred(dt_fit, x_test_bc)
y_pred = dt_fit.predict(x_test_bc)


dt_bc_scores = get_all_metrics_score(y_test_bc, y_pred)
dt_bc_confusion_mat = get_confusion_matrix(y_test_bc, y_pred)

print(f"{dt_bc_scores}\n")
print(f"{dt_bc_confusion_mat}\n")
print(export_text(dt))

[0.9122807017543859, 0.9122807017543859, 0.9122807017543859, 0.9122807017543859]

[[43  6]
 [ 4 61]]

|--- feature_13 <= 30.83
|   |--- feature_21 <= 25.83
|   |   |--- feature_23 <= 989.45
|   |   |   |--- feature_25 <= 0.50
|   |   |   |   |--- class: 1
|   |   |   |--- feature_25 >  0.50
|   |   |   |   |--- feature_20 <= 15.45
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_20 >  15.45
|   |   |   |   |   |--- class: 0
|   |   |--- feature_23 >  989.45
|   |   |   |--- class: 0
|   |--- feature_21 >  25.83
|   |   |--- feature_25 <= 0.35
|   |   |   |--- feature_7 <= 0.05
|   |   |   |   |--- feature_14 <= 0.00
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_14 >  0.00
|   |   |   |   |   |--- feature_22 <= 113.95
|   |   |   |   |   |   |--- feature_21 <= 33.27
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- feature_21 >  33.27
|   |   |   |   |   |   |   |--- feature_23 <= 643.25
|   |   |   |   |   |   |   |   |--- class: 1
| 

B. Id3Estimator

In [24]:
# Id3Estimator
import six
import sys
sys.modules['sklearn.externals.six'] = six
import mlrose
from id3 import Id3Estimator

id3_estimator = Id3Estimator(prune = True, gain_ratio = True)
id3_estimator_fit = get_fit_model(id3_estimator, x_train_bc, y_train_bc)
saveModel(id3_estimator_fit, "./model/id3_estimator_fit.pkl")

In [25]:
id3_estimator_fit = loadModel("./model/id3_estimator_fit.pkl")
y_pred = get_y_pred(id3_estimator_fit, x_test_bc)

id3_bc_scores = get_all_metrics_score(y_test_bc, y_pred)
id3_bc_confusion_mat = get_confusion_matrix(y_test_bc, y_pred)

print(f"{id3_bc_scores}\n")
print(f"{id3_bc_confusion_mat}\n")

[0.9298245614035088, 0.9298245614035088, 0.9298245614035088, 0.9298245614035088]

[[44  5]
 [ 3 62]]



C. K Means

In [26]:
# K Means
from sklearn.cluster import KMeans

k_means = KMeans(n_init = "auto", n_clusters = 2, max_iter = 5000, random_state = 15)
k_means_fit = get_fit_model(k_means, x_train_bc, y_train_bc)
saveModel(k_means_fit, "./model/k_means_fit.pkl")

In [27]:
k_means_fit = loadModel("./model/k_means_fit.pkl")
y_pred = get_y_pred(k_means_fit, x_test_bc)

k_means_bc_scores = get_all_metrics_score(y_test_bc, y_pred)
k_means_bc_confusion_mat = get_confusion_matrix(y_test_bc, y_pred)

print(f"{k_means_bc_scores}\n")
print(f"{k_means_bc_confusion_mat}\n")

[0.8421052631578947, 0.8421052631578947, 0.8421052631578947, 0.8421052631578947]

[[31 18]
 [ 0 65]]



D. LogisticRegression

In [28]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter = 5000)
log_reg_fit = get_fit_model(log_reg, x_train_bc, y_train_bc)
saveModel(log_reg_fit, "./model/log_reg_fit.pkl")

In [29]:
log_reg_fit = loadModel("./model/log_reg_fit.pkl")
y_pred = get_y_pred(log_reg_fit, x_test_bc)

log_reg_bc_scores = get_all_metrics_score(y_test_bc, y_pred)
log_reg_bc_confusion_mat = get_confusion_matrix(y_test_bc, y_pred)

print(f"{log_reg_bc_scores}\n")
print(f"{log_reg_bc_confusion_mat}\n")

[0.9736842105263158, 0.9736842105263158, 0.9736842105263158, 0.9736842105263158]

[[47  2]
 [ 1 64]]



E. Neural_network

In [30]:
# Neural_network

from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(max_iter = 5000, solver = "lbfgs")
mlp_fit = get_fit_model(mlp, x_train_bc, y_train_bc)
saveModel(mlp_fit, "./model/mlp_fit.pkl")

In [31]:
mlp_fit = loadModel("./model/mlp_fit.pkl")
y_pred = get_y_pred(mlp_fit, x_test_bc)

mlp_bc_scores = get_all_metrics_score(y_test_bc, y_pred)
mlp_bc_confusion_mat = get_confusion_matrix(y_test_bc, y_pred)

print(f"{mlp_bc_scores}\n")
print(f"{mlp_bc_confusion_mat}\n")

[0.9473684210526315, 0.9473684210526315, 0.9473684210526315, 0.9473684210526315]

[[45  4]
 [ 2 63]]



F. SVM

In [32]:
# SVM

from sklearn.svm import SVC

svc = SVC(kernel = 'linear')
svc_fit = get_fit_model(svc, x_train_bc, y_train_bc)
saveModel(svc_fit, "./model/svc_fit.pkl")

In [33]:
svc_fit = loadModel("./model/svc_fit.pkl")
y_pred = get_y_pred(svc_fit, x_test_bc)

svc_bc_scores = get_all_metrics_score(y_test_bc, y_pred)
svc_bc_confusion_mat = get_confusion_matrix(y_test_bc, y_pred)

print(f"{svc_bc_scores}\n")
print(f"{svc_bc_confusion_mat}\n")

[0.9649122807017544, 0.9649122807017544, 0.9649122807017544, 0.9649122807017544]

[[46  3]
 [ 1 64]]



In [34]:
index = [
    "DecisionTree",
    "K Means",
    "LogisticRegression",
    "Neural_network",
    "SVM"
]
columns = [
    "Accuracy", 
    "Precision", 
    "Recall", 
    "F1"
]

bc_algos_scores = [
    dt_bc_scores,
    k_means_bc_scores,
    log_reg_bc_scores,
    mlp_bc_scores,
    svc_bc_scores
]

bc_algos_metrics_score = pd.DataFrame(
    data = bc_algos_scores, 
    index = index, 
    columns = [["Breast Cancer" for i in range(len(columns))], columns])

In [35]:
bc_algos_metrics_score

Unnamed: 0_level_0,Breast Cancer,Breast Cancer,Breast Cancer,Breast Cancer
Unnamed: 0_level_1,Accuracy,Precision,Recall,F1
DecisionTree,0.912281,0.912281,0.912281,0.912281
K Means,0.842105,0.842105,0.842105,0.842105
LogisticRegression,0.973684,0.973684,0.973684,0.973684
Neural_network,0.947368,0.947368,0.947368,0.947368
SVM,0.964912,0.964912,0.964912,0.964912


In [36]:
print(f">>> TRAINING DATA <<<")

print(f"\n> SAMPLE\n")
print(x_train_bc)

print(f"\n> TARGET\n")
print(y_train_bc)

>>> TRAINING DATA <<<

> SAMPLE

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
49         13.49         22.30           86.91      561.0          0.08752   
564        21.56         22.39          142.00     1479.0          0.11100   
112        14.26         19.65           97.83      629.9          0.07837   
50         11.76         21.60           74.72      427.9          0.08637   
301        12.46         19.89           80.43      471.3          0.08451   
..           ...           ...             ...        ...              ...   
448        14.53         19.34           94.25      659.7          0.08388   
395        14.06         17.18           89.75      609.1          0.08045   
222        10.18         17.53           65.12      313.1          0.10610   
545        13.62         23.23           87.19      573.2          0.09246   
252        19.73         19.82          130.70     1206.0          0.10620   

     mean compactness  mean co

In [37]:
print(f">>> TESTING DATA <<<")

print(f"\n> SAMPLE\n")
print(x_test_bc)

print(f"\n> TARGET\n")
print(y_test_bc)

>>> TESTING DATA <<<

> SAMPLE

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
408        17.99         20.66          117.80      991.7          0.10360   
350        11.66         17.07           73.70      421.0          0.07561   
499        20.59         21.24          137.80     1320.0          0.10850   
73         13.80         15.79           90.43      584.1          0.10070   
414        15.13         29.81           96.71      719.5          0.08320   
..           ...           ...             ...        ...              ...   
417        15.50         21.08          102.90      803.1          0.11200   
333        11.25         14.78           71.38      390.0          0.08306   
213        17.42         25.56          114.50      948.0          0.10060   
14         13.73         22.61           93.60      578.3          0.11310   
65         14.78         23.94           97.40      668.3          0.11720   

     mean compactness  mean con