In [2]:
import six
import sys
sys.modules['sklearn.externals.six'] = six

import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import pickle

from id3 import Id3Estimator
import id3.export

ModuleNotFoundError: No module named 'pandas'

In [None]:
'''
    TASK 1
    Membaca dataset breast cancer
'''

# load dataset
data = load_breast_cancer()

df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Seperate feature and target
feat = df.iloc[:, :-1]
target = df['target']

# Check value for target function and feature
# print("x_bc", x_bc[:5])
# print("y_bc", y_bc.unique())


# Separate datasets into 80% training data and 20% test data
data_train, data_test, target_train, target_test = train_test_split(feat, target, test_size=0.2, random_state=420)

'''
    OUTPUT:
    data_train: data yang digunakan sebagai data training
    data_test: data yang digunakan sebagai data testing
    target_train: target atau label yang sesuai dengan data training
    target_test: target atau label yang sesuai dengan data testing

    INPUT:
    feat: atribut dari dataset yang akan dibagi menjadi data training dan data testing
    target: target atau label dari dataset yang akan dibagi menjadi data training dan data testing
    test_size=0.2: ukuran data testing, di sini diatur sebesar 20% dari seluruh dataset
    random_state=42: seed atau biji acak yang digunakan untuk memastikan hasil pengacakan selalu sama setiap kali kode dijalankan, 
    sehingga hasil yang diperoleh dapat direproduksi.
'''

#Checking the result
print(len(data.data)) 
print(len(data_test))
print(len(data_train))
print('test',len(data_test)/len(data.data))
print('train',len(data_train)/len(data.data))


569
114
455
test 0.20035149384885764
train 0.7996485061511424


In [None]:

'''
Function for a train and prediction
->def get_predictions(model,data_train, target_train, data_test)
    param for input :
    1. model -> learning model
    2. data_train -> data training
    3. target_train -> target training 
    4. data_test  -> data test
    This function will return the prediction of    
-> def get_score(model, data_train, target_train, data_test, target_test)
    param for input :
    1. model -> learning model
    2. data_train -> data training
    3. target_train -> target training
    4. data_test -> data test
    5. target_test -> target training
    This function will return the value of all the accuracy score and f1_score of the entire prediction 
    as dictionary
'''

def get_prediction(model, model_name, data_train, target_train, data_test):
    model  = model.fit(data_train, target_train)
    save_to_pickle(model, model_name)
    return model.predict(data_test)

def save_to_pickle(model,filename):
    with open(filename, 'wb') as file:
        pickle.dump(model,file)
        
def get_score(model, model_name, data_train, target_train, data_test, target_test):
    prediction = get_prediction(model, model_name, data_train, target_train, data_test)
    return {
        "accuracy_score": accuracy_score(target_test, prediction),
        "precision_score": precision_score(target_test, prediction),
        "recall_score": recall_score(target_test, prediction),
        "f1_score": f1_score(target_test, prediction, average='micro'),
        "confusion_matrix": confusion_matrix(target_test, prediction)

    }

In [None]:
'''
    DECISION TREE CLASSIFIER

    Create learning model with decision tree classfier
    Train the data and get score

    Param:
    1. criterion = entropy -> use Information Gain measurement in selecting 
    the best feature for splitting
    2. max_features = auto ->  select the best feature considering the square root 
    of the number of features
    3. random state -> set seed for the algorithm's randomization
'''

dtl = DecisionTreeClassifier(criterion="entropy", max_features="auto", random_state=33)

dt_score = get_score(dtl, "DTL.pkl", data_train, target_train, data_test, target_test)
print(dt_score)


{'accuracy_score': 0.9736842105263158, 'precision_score': 0.9710144927536232, 'recall_score': 0.9852941176470589, 'f1_score': 0.9736842105263158, 'confusion_matrix': array([[44,  2],
       [ 1, 67]], dtype=int64)}


In [None]:
'''
    ID3

    Create learning model with ID3
    Train the data and get score

    Param:
    1. prune = True -> the resulting decision tree will be pruned to prevent overfitting
    2. gain_ratio = True -> use the gain ratio metric to measure the information 
    value of each feature in splitting the dataset
'''

id3 = Id3Estimator(prune=True, gain_ratio=True)
id3_score = get_score(id3, "ID3.pkl", data_train, target_train, data_test, target_test)

print(id3_score)

{'accuracy_score': 0.9210526315789473, 'precision_score': 0.927536231884058, 'recall_score': 0.9411764705882353, 'f1_score': 0.9210526315789473, 'confusion_matrix': array([[41,  5],
       [ 4, 64]], dtype=int64)}


In [None]:
'''
    K-Means

    Create learning model with K-Means
    Train the data and get score

    Param:
    1. n_cluesters = 2 -> number of clusters = 2
    2. max_iter = 10000 -> maximum number of iterations the K-Means algorithm = 10000
    3. random_state = 13 ->  seed for the random number generator used by the K-Means algorithm = 13
'''

# setting the random_state will make the result remain the same for every run
kmeans = KMeans(n_clusters=2, max_iter=10000, random_state=13) 
kmeans_score = get_score(kmeans, "K-MEANS.pkl", data_train, target_train, data_test, target_test)

print(kmeans_score)

{'accuracy_score': 0.868421052631579, 'precision_score': 0.8271604938271605, 'recall_score': 0.9852941176470589, 'f1_score': 0.868421052631579, 'confusion_matrix': array([[32, 14],
       [ 1, 67]], dtype=int64)}


In [None]:
'''
    Logistic Regression

    Create learning model with K-Means
    Train the data and get score
    Param :
    max_iter = 10000 -> maximum number of iterations the Logistic Regression algorithm 

'''

logres = LogisticRegression(max_iter=10000)
logres_score = get_score(logres,'LOGRES.pkl', data_train, target_train, data_test, target_test)

print(logres_score)

{'accuracy_score': 0.9473684210526315, 'precision_score': 0.9558823529411765, 'recall_score': 0.9558823529411765, 'f1_score': 0.9473684210526315, 'confusion_matrix': array([[43,  3],
       [ 3, 65]], dtype=int64)}


In [None]:
'''
    Multilayer Perceptron (MLP)

    Create learning model with K-Means
    Train the data and get score

    Param:
    1. n_cluesters = 50000 -> maximum number of iterations for the solver to converge = 50000
    2. solver = lbfgs -> optimization solver algorithm to be used = lbfgs
    The 'lbfgs' solver is used to optimize the weights and bias parameters of the network

     "Limited-memory Broyden-Fletcher-Goldfarb-Shanno" 
     and is a quasi-Newton method to approximate the Newton-Raphson algorithm.

'''

mlp = MLPClassifier(max_iter=50000, solver="lbfgs")
mlp_score = get_score(mlp,'MLP.pkl',data_train, target_train, data_test, target_test)

print(mlp_score)


{'accuracy_score': 0.9649122807017544, 'precision_score': 0.9848484848484849, 'recall_score': 0.9558823529411765, 'f1_score': 0.9649122807017544, 'confusion_matrix': array([[45,  1],
       [ 3, 65]], dtype=int64)}


In [None]:
'''
    Support Vector Machine (SVM)

    Create learning model with SVM
    Train the data and get score

    Param:
    1. kernel = linear -> linear decision boundary will be used to separate the data into classes
'''

svc = SVC(kernel='linear')
svc_score = get_score(svc,'SVC.pkl', data_train, target_train, data_test, target_test)

print(svc_score)

{'accuracy_score': 0.9649122807017544, 'precision_score': 0.9705882352941176, 'recall_score': 0.9705882352941176, 'f1_score': 0.9649122807017544, 'confusion_matrix': array([[44,  2],
       [ 2, 66]], dtype=int64)}


In [None]:
cv_results = cross_validate(dtl, feat, target, cv=10, scoring=('accuracy', 'f1'))
print("Accuracy: ", cv_results['test_accuracy'].mean())
print("Precision: ", cv_results['test_precision'].mean())
print("Recall: ", cv_results['test_recall'].mean())
print("F1-Score: ", cv_results['test_f1'].mean())
print(cv_results)

{'fit_time': array([0.00401497, 0.00399876, 0.00208211, 0.00400805, 0.00195789,
       0.0030992 , 0.00288773, 0.00209188, 0.00308895, 0.00300312]), 'score_time': array([0.00199986, 0.00200272, 0.00153089, 0.00107574, 0.00196886,
       0.00100112, 0.00199842, 0.00191355, 0.00100803, 0.00089836]), 'test_accuracy': array([0.98245614, 0.92982456, 0.9122807 , 0.89473684, 0.94736842,
       0.96491228, 0.96491228, 0.92982456, 0.94736842, 0.91071429]), 'test_f1': array([0.98550725, 0.94444444, 0.92957746, 0.91666667, 0.96      ,
       0.97142857, 0.97297297, 0.94444444, 0.95652174, 0.92753623])}


In [None]:
'''
Analisis


'''