# CEN426 - Introduction to Machine Learning
---
## Homework 02
The student must use both ID3 and C4.5 to create a decision tree on a sample dataset, then interpret the results.

### Student ID: 2016556017
### Author: Furkan Çetinkaya <ctnky.frkn@gmail.com>

In [1]:
# Import Required Libraries
from chefboost import Chefboost                 # Includes ID3 and C4.5 Decision Trees
from sklearn.metrics import accuracy_score      # For computing accuracy score
from sklearn.model_selection import KFold       # For 2-fold cross validation
import pandas as pd                             # Pandas DataFrame lib
import numpy as np                              # Numpy Numerical lib

## Read the dataset

In [2]:
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
X = pd.read_csv(data_url, header=None)

## Remove NaN objects in the dataset

In [3]:
# drop rows with missing values, missing = ?
X = X.replace("?", np.nan)
X = X.dropna()
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3


## Add column names

In [4]:
# rename known columns
columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
           'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'Decision']
X.columns = columns
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,Decision
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3


## Convert numeric elements to categories

In [5]:
# The Class Labels
X['Decision'] = X.Decision.replace([1,2,3,4], 'Yes')
X['Decision'] = X.Decision.replace([0], 'No')
# Other Feature's Labels
for col in X.columns.values[:12]:
    unique_labels = np.unique(X[col])
    for cat in unique_labels:
        X[col] = X[col].replace(cat, 'cat'+str(cat))
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,Decision
0,cat63.0,cat1.0,cat1.0,cat145.0,cat233.0,cat1.0,cat2.0,cat150.0,cat0.0,cat2.3,cat3.0,cat0.0,6.0,No
1,cat67.0,cat1.0,cat4.0,cat160.0,cat286.0,cat0.0,cat2.0,cat108.0,cat1.0,cat1.5,cat2.0,cat3.0,3.0,Yes
2,cat67.0,cat1.0,cat4.0,cat120.0,cat229.0,cat0.0,cat2.0,cat129.0,cat1.0,cat2.6,cat2.0,cat2.0,7.0,Yes
3,cat37.0,cat1.0,cat3.0,cat130.0,cat250.0,cat0.0,cat0.0,cat187.0,cat0.0,cat3.5,cat3.0,cat0.0,3.0,No
4,cat41.0,cat0.0,cat2.0,cat130.0,cat204.0,cat0.0,cat2.0,cat172.0,cat0.0,cat1.4,cat1.0,cat0.0,3.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,cat57.0,cat0.0,cat4.0,cat140.0,cat241.0,cat0.0,cat0.0,cat123.0,cat1.0,cat0.2,cat2.0,cat0.0,7.0,Yes
298,cat45.0,cat1.0,cat1.0,cat110.0,cat264.0,cat0.0,cat0.0,cat132.0,cat0.0,cat1.2,cat2.0,cat0.0,7.0,Yes
299,cat68.0,cat1.0,cat4.0,cat144.0,cat193.0,cat1.0,cat0.0,cat141.0,cat0.0,cat3.4,cat2.0,cat2.0,7.0,Yes
300,cat57.0,cat1.0,cat4.0,cat130.0,cat131.0,cat0.0,cat0.0,cat115.0,cat1.0,cat1.2,cat2.0,cat1.0,7.0,Yes


## 2-k Cross Splitting the Dataset and Model Training for ID3 and C4.5

In [6]:
k = 2
kfold = KFold(n_splits=k, random_state=None, shuffle=False)

predlistc45 = []
predlistid3 = []
iter = 1
for train_index , test_index in kfold.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    print("Fold %d: Training C4.5\n++++++++++++++++++++++++++++"%iter)
    c45model = Chefboost.fit(X_train.copy(), {'algorithm': 'C4.5'})
    print("\n\nFold %d: Training ID3\n++++++++++++++++++++++++++++"%iter)
    id3model = Chefboost.fit(X_train.copy(), {'algorithm': 'ID3'})
    predid3 = []
    predc45 = []
    for j in range(X_test.shape[0]):
        predc45.append(Chefboost.predict(c45model, X_test.iloc[j]))
        predid3.append(Chefboost.predict(id3model, X_test.iloc[j]))
    predlistc45.append(accuracy_score(predc45, X_test['Decision']))
    predlistid3.append(accuracy_score(predid3, X_test['Decision']))
    print("===================================")
    print("Iter %d: C4.5 DT Accuracy: %.3f  =="%(iter, predlistc45[iter-1]))
    print("Iter %d: ID3  DT Accuracy: %.3f  =="%(iter, predlistid3[iter-1]))
    print("===================================")
    iter = iter+1
print("===========================")
print("Avg C4.5 Accuracy: %.3f =="%(np.sum(predlistc45)/2))
print("Avg ID3  Accuracy: %.3f =="%(np.sum(predlistid3)/2))
print("===========================")

Fold 1: Training C4.5
++++++++++++++++++++++++++++
C4.5  tree is going to be built...
-------------------------
finished in  8.842073440551758  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  100.0 % on  148  instances
Labels:  ['No' 'Yes']
Confusion matrix:  [[78, 0], [0, 70]]
Precision:  100.0 %, Recall:  100.0 %, F1:  100.0 %


Fold 1: Training ID3
++++++++++++++++++++++++++++
ID3  tree is going to be built...
-------------------------
finished in  8.697105169296265  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  100.0 % on  148  instances
Labels:  ['No' 'Yes']
Confusion matrix:  [[78, 0], [0, 70]]
Precision:  100.0 %, Recall:  100.0 %, F1:  100.0 %
Iter 1: C4.5 DT Accuracy: 0.550  ==
Iter 1: ID3  DT Accuracy: 0.550  ==
Fold 2: Training C4.5
++++++++++++++++++++++++++++
C4.5  tree is going to be built...
-------------------------
finished in  7.9799652099609375  seconds
-------------------------
E