# Tugas 1 Machine Learning Eksplorasi algoritme pembelajaran

13520016 - Gagas Praharsa Bahar  
13520029 - Muhammad Garebaldhie ER Rahman

# Import modules and libraries

In [1]:
import six
import sys
sys.modules['sklearn.externals.six'] = six
import pickle


from sklearn import tree
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from id3 import Id3Estimator

# Load breast cancer dataset

In [2]:
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2)

# Learning Models

## Decision Tree Classifier

### Do the learning process

In [3]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train, y_train)

### Visualization of Decision Tree 

In [4]:
tree_text = tree.export_text(clf, feature_names=list(cancer.feature_names))
print(tree_text)

|--- worst perimeter <= 106.05
|   |--- worst concave points <= 0.16
|   |   |--- perimeter error <= 6.60
|   |   |   |--- worst concave points <= 0.13
|   |   |   |   |--- radius error <= 0.64
|   |   |   |   |   |--- smoothness error <= 0.00
|   |   |   |   |   |   |--- mean symmetry <= 0.17
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- mean symmetry >  0.17
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- smoothness error >  0.00
|   |   |   |   |   |   |--- worst texture <= 33.27
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- worst texture >  33.27
|   |   |   |   |   |   |   |--- worst texture <= 33.56
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- worst texture >  33.56
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- radius error >  0.64
|   |   |   |   |   |--- mean perimeter <= 78.51
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- mean perimeter >

### Saving the models as a file

In [5]:
pickle.dump(clf, open('models/tree_model.sav', 'wb'))

### Load the model and do the prediction

In [25]:
tree_model = pickle.load(open('models/tree_model.sav', 'rb'))
y_pred_tree = tree_model.predict(x_test)

### Result of prediction

In [24]:
print("Decision Tree Classifier")

print("Accuracy: ", accuracy_score(y_test, y_pred_tree))
print("Precision: ", precision_score(y_test, y_pred_tree))
print("Recall: ", recall_score(y_test, y_pred_tree))
print("F1 Score: ", f1_score(y_test, y_pred_tree))
print("Confusion matrix: ", confusion_matrix(y_test, y_pred_tree))

Decision Tree Classifier
Accuracy:  0.9122807017543859
Precision:  0.9696969696969697
Recall:  0.8888888888888888
F1 Score:  0.927536231884058
Confusion matrix:  [[40  2]
 [ 8 64]]


## ID3 Estimator

### Do the learning process

In [18]:
estimator = Id3Estimator()
estimator = estimator.fit(x_train, y_train)

### Saving the models as a file

In [19]:
pickle.dump(estimator, open("models/id3_estimator.sav", "wb"))

### Load the model and do the prediction

In [27]:
id3_model = pickle.load(open("models/id3_estimator.sav", "rb"))
y_pred_id3 = id3_model.predict(x_test)

### Result of prediction

In [26]:
print("Prediction using ID3 Estimator")

print("Accuracy: ", accuracy_score(y_test, y_pred_id3))
print("Precision: ", precision_score(y_test, y_pred_id3))
print("Recall: ", recall_score(y_test, y_pred_id3))
print("F1 Score: ", f1_score(y_test, y_pred_id3))
print("Confusion matrix: ", confusion_matrix(y_test, y_pred_id3))

Prediction using ID3 Estimator
Accuracy:  0.9035087719298246
Precision:  0.9420289855072463
Recall:  0.9027777777777778
F1 Score:  0.9219858156028369
Confusion matrix:  [[38  4]
 [ 7 65]]


## KMeans

### Do the learning process

In [8]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(x_train)

### Saving the models as a file

In [28]:
pickle.dump(kmeans, open('models/kmeans_model.sav', 'wb'))

### Load the model and do the prediction

In [30]:
kmeans_model = pickle.load(open('models/kmeans_model.sav', 'rb'))
y_pred_kmeans = kmeans_model.predict(x_test)



### Result of prediction

In [31]:
print("KMeans Classifier")

print("Accuracy: ", accuracy_score(y_test, y_pred_kmeans))
print("Precision: ", precision_score(y_test, y_pred_kmeans))
print("Recall: ", recall_score(y_test, y_pred_kmeans))
print("F1 Score: ", f1_score(y_test, y_pred_kmeans))
print("Confusion matrix: ", confusion_matrix(y_test, y_pred_kmeans))

KMeans Classifier
Accuracy:  0.13157894736842105
Precision:  0.0
Recall:  0.0
F1 Score:  0.0
Confusion matrix:  [[15 27]
 [72  0]]


## Logistic Regression

### Do the learning process

In [33]:
logisticRegr = LogisticRegression(max_iter=10000).fit(x_train, y_train)

### Saving the models as a file

In [32]:
pickle.dump(logisticRegr, open('models/logistic_model.sav', 'wb'))

### Load the model and do the prediction

In [11]:
logistic_model = pickle.load(open('models/logistic_model.sav', 'rb'))
y_pred_logistic = logistic_model.predict(x_test)

print("Logistic Regression Classifier")

print("Accuracy: ", accuracy_score(y_test, y_pred_logistic))
print("Precision: ", precision_score(y_test, y_pred_logistic))
print("Recall: ", recall_score(y_test, y_pred_logistic))
print("F1 Score: ", f1_score(y_test, y_pred_logistic))
print("Confusion matrix: ", confusion_matrix(y_test, y_pred_logistic))

Logistic Regression Classifier
Accuracy:  0.9473684210526315
Precision:  0.9583333333333334
Recall:  0.9583333333333334
F1 Score:  0.9583333333333334
Confusion matrix:  [[39  3]
 [ 3 69]]


### Result of prediction

In [None]:
print("Logistic Regression Classifier")

print("Accuracy: ", accuracy_score(y_test, y_pred_logistic))
print("Precision: ", precision_score(y_test, y_pred_logistic))
print("Recall: ", recall_score(y_test, y_pred_logistic))
print("F1 Score: ", f1_score(y_test, y_pred_logistic))
print("Confusion matrix: ", confusion_matrix(y_test, y_pred_logistic))

## Nerual Network

### Do the learning process

In [34]:
neural_network = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter=1000).fit(x_train, y_train)

### Saving the models as a file

In [12]:
pickle.dump(neural_network, open('models/neural_model.sav', 'wb'))

### Load the model and do the prediction

In [13]:
neural_model = pickle.load(open('models/neural_model.sav', 'rb'))
y_pred_neural = neural_model.predict(x_test)

Neural Network Classifier
Accuracy:  0.9298245614035088
Precision:  0.9444444444444444
Recall:  0.9444444444444444
F1 Score:  0.9444444444444444
Confusion matrix:  [[38  4]
 [ 4 68]]


### Result of prediction

In [None]:
print("Neural Network Classifier")

print("Accuracy: ", accuracy_score(y_test, y_pred_neural))
print("Precision: ", precision_score(y_test, y_pred_neural))
print("Recall: ", recall_score(y_test, y_pred_neural))
print("F1 Score: ", f1_score(y_test, y_pred_neural))
print("Confusion matrix: ", confusion_matrix(y_test, y_pred_neural))

## Support Vector Machines

### Do the learning process

In [35]:
svm = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(x_train, y_train)

### Saving the models as a file

In [36]:
pickle.dump(svm, open('models/svm_model.sav', 'wb'))

### Load the model and do the prediction

In [15]:
svm_model = pickle.load(open('models/svm_model.sav', 'rb'))
y_pred_svm = svm_model.predict(x_test)

SVM Classifier
Accuracy:  0.9912280701754386
Precision:  1.0
Recall:  0.9861111111111112
F1 Score:  0.993006993006993
Confusion matrix:  [[42  0]
 [ 1 71]]


### Result of prediction

In [37]:
print("SVM Classifier")

print("Accuracy: ", accuracy_score(y_test, y_pred_svm))
print("Precision: ", precision_score(y_test, y_pred_svm))
print("Recall: ", recall_score(y_test, y_pred_svm))
print("F1 Score: ", f1_score(y_test, y_pred_svm))
print("Confusion matrix: ", confusion_matrix(y_test, y_pred_svm))

SVM Classifier
Accuracy:  0.9912280701754386
Precision:  1.0
Recall:  0.9861111111111112
F1 Score:  0.993006993006993
Confusion matrix:  [[42  0]
 [ 1 71]]


## KFold Cross Validation

### Do the learning process

In [39]:
kfold = cross_validate(clf, x_train, y_train, cv=10, scoring=('accuracy', 'precision', 'recall', 'f1'))

### Result of prediction

In [38]:
print("Decision Tree Classifier Base (from before)")

print("Accuracy: ", accuracy_score(y_test, y_pred_tree))
print("Precision: ", precision_score(y_test, y_pred_tree))
print("Recall: ", recall_score(y_test, y_pred_tree))
print("F1 Score: ", f1_score(y_test, y_pred_tree))

print("Decision Tree Classifier KFold (from now)")
print("Accuracy: ", kfold['test_accuracy'].mean())
print("Precision: ", kfold['test_precision'].mean())
print("Recall: ", kfold['test_recall'].mean())
print("F1 Score: ", kfold['test_f1'].mean())

Decision Tree Classifier Base (from before)
Accuracy:  0.9122807017543859
Precision:  0.9696969696969697
Recall:  0.8888888888888888
F1 Score:  0.927536231884058
Decision Tree Classifier KFold (from now)
Accuracy:  0.9493719806763286
Precision:  0.9652535503881443
Recall:  0.9543103448275861
F1 Score:  0.9589031382577289


Dari hasil perbandingan Decision Tree Classifier dengan KFold dan tanpa KFold, terlihat perbedaan yang kecil namun tetap cukup signifikan. Hal ini berarti bahwa model yang sebelumnya dibangkitkan sudah cukup baik apabila dibandingkan dengan versi K-Fold.

K-Fold sendiri seringkali digunakan untuk menaikkan akurasi dari model yang memiliki akurasi yang rendah akibat beberapa hal, misal overfitting.