### Import pandas and NumPy

In [1]:
import pandas as pd
import numpy as np 

### Import the dataset

In [5]:
# Input file, resolved relative to the notebook's working directory.
HEART_CSV = "heart.csv"

# The encoding is passed explicitly rather than relying on pandas' default.
data = pd.read_csv(HEART_CSV, encoding='cp1252')

# Bare last expression: the notebook renders the frame as the cell output.
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [7]:
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Class labels; anything ``np.unique`` accepts.

    Returns
    -------
    number
        ``sum(-p * log2(p))`` over the distinct values, where ``p`` is the
        relative frequency of each value. An empty input yields ``0``.
    """
    # Only the frequencies matter; the distinct values themselves are unused.
    _, frequencies = np.unique(target_col, return_counts=True)
    total = np.sum(frequencies)

    bits = 0
    for frequency in frequencies:
        share = frequency / total
        bits += -share * np.log2(share)
    return bits
# Entropy of the label column over the whole dataset.
target_entropy = entropy(data['target'])
print("Entropy of target:", target_entropy)

Entropy of target: 0.9994994187527655


### Define a function to calculate information gain

In [10]:
def information_gain(data, split_attribute, target):
    """Return the information gain from splitting ``data`` on ``split_attribute``.

    The gain is the entropy of ``data[target]`` minus the weighted average of
    the entropies of ``target`` inside each group of rows that share one value
    of ``split_attribute``. Each group's weight is its share of the rows.

    Parameters
    ----------
    data : table indexable by column name and by boolean mask
        The notebook passes a pandas DataFrame.
    split_attribute : column label
        Column whose distinct values define the groups.
    target : column label
        Column holding the class labels.
    """
    parent_entropy = entropy(data[target])

    levels, level_counts = np.unique(data[split_attribute], return_counts=True)
    n_rows = np.sum(level_counts)

    children_entropy = 0
    for level, level_count in zip(levels, level_counts):
        rows_at_level = data[data[split_attribute] == level]
        weight = level_count / n_rows
        children_entropy += weight * entropy(rows_at_level[target])

    return parent_entropy - children_entropy
# Spot-check the gain of two attributes against the label column.
sex_gain = information_gain(data, "sex", "target")
print("Information Gain for 'sex':", sex_gain)
cp_gain = information_gain(data, "cp", "target")
print("Information Gain for 'cp':", cp_gain)

Information Gain for 'sex': 0.057984148016857584
Information Gain for 'cp': 0.20848408129015084


### Implement the ID3 algorithm

In [11]:
def _majority_class(labels):
    """Return the most frequent value in ``labels``.

    Ties go to the value that sorts first, because ``np.unique`` returns
    sorted values and ``np.argmax`` picks the first maximum.
    """
    classes, counts = np.unique(labels, return_counts=True)
    return classes[np.argmax(counts)]


def id3(data, features, target):
    """Build an ID3 decision tree as nested dictionaries.

    Every feature is treated as categorical: the chosen feature gets one
    branch per distinct value present in ``data``, including for numeric
    columns.

    Parameters
    ----------
    data : table indexable by column name and by boolean mask
        The notebook passes a pandas DataFrame.
    features : sequence of column labels
        Candidate split columns. A feature used at a node is not offered to
        that node's descendants.
    target : column label
        Column holding the class labels.

    Returns
    -------
    dict or label
        A class label for a leaf, otherwise
        ``{best_feature: {value: subtree_or_label, ...}}``.
    """
    labels = data[target]
    classes = np.unique(labels)

    # Pure node: every row carries the same label.
    if len(classes) == 1:
        return classes[0]

    # Nothing left to split on: fall back to the most common label.
    if len(features) == 0:
        return _majority_class(labels)

    gains = [information_gain(data, f, target) for f in features]
    best_feature = features[np.argmax(gains)]

    # Identical for every branch, so build it once rather than per value.
    remaining_features = [f for f in features if f != best_feature]

    branches = {}
    for value in np.unique(data[best_feature]):
        subset = data[data[best_feature] == value]

        if subset.shape[0] == 0:
            # Reached only when ``value`` matches no row under ``==`` (for
            # example NaN, which never compares equal to itself). Label the
            # branch with this node's majority class.
            branches[value] = _majority_class(labels)
        else:
            branches[value] = id3(subset, remaining_features, target)

    return {best_feature: branches}


### Use ID3

In [16]:
# Name the label column first and derive the feature list from it, instead of
# assuming the label happens to be the last column of the frame.
target = "target"
features = [column for column in data.columns if column != target]

tree = id3(data, features, target)

### Print Tree

In [17]:
# Plain-text dump of the nested dict returned by id3: inner nodes look like
# {feature: {value: subtree}} and leaves are class labels.
print(tree)

{'chol': {126: 1, 131: 0, 141: 1, 149: {'age': {49: 0, 71: 1}}, 157: 1, 160: 1, 164: 0, 166: 0, 167: 0, 168: 1, 169: 0, 172: 0, 174: 0, 175: 1, 176: 0, 177: {'age': {43: 0, 46: 1, 59: 0, 65: 1}}, 178: 1, 180: 1, 182: 1, 183: 1, 184: 0, 185: 0, 186: 1, 187: 0, 188: 0, 192: 1, 193: {'age': {56: 1, 68: 0}}, 195: 1, 196: 1, 197: {'age': {44: 0, 46: 1, 53: 1, 58: 1, 63: 0, 76: 1}}, 198: {'age': {35: 0, 41: 1}}, 199: 1, 200: 0, 201: 1, 203: {'age': {41: 1, 53: 0, 61: 0}}, 204: {'age': {29: 1, 41: 1, 46: 1, 47: 1, 52: 0, 59: 0}}, 205: {'age': {52: 1, 55: 0}}, 206: 0, 207: {'age': {57: 1, 61: 0}}, 208: 1, 209: 1, 210: 1, 211: 1, 212: {'age': {52: 0, 59: 1, 64: 0, 66: 0, 67: 0}}, 213: 1, 214: 1, 215: 1, 216: {'age': {53: 1, 58: 0}}, 217: 0, 218: 0, 219: {'age': {39: 0, 44: 1, 50: 1}}, 220: 1, 221: 1, 222: 1, 223: {'age': {40: 0, 52: 1, 67: 1}}, 224: 0, 225: 0, 226: 1, 227: 1, 228: {'sex': {0: 0, 1: 1}}, 229: 0, 230: 0, 231: {'age': {38: 0, 46: 0, 62: 1}}, 232: {'age': {54: 1, 57: 0}}, 233: {'ag