In [1]:
import pandas as pd
import numpy as np

# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv("Experiment3.csv")

# Feature matrix: every column except the identifier, the target, and
# 'Unnamed: 32' (presumably an empty trailing column from the CSV export;
# confirm against the file).
X = df.drop(['id', 'diagnosis', 'Unnamed: 32'], axis=1)
# Target vector: 'M' is encoded as 1 and 'B' as 0; any other value maps to NaN.
y = df['diagnosis'].map({'M': 1, 'B': 0})

def _entropy(labels):
    """Return the Shannon entropy (in bits) of the values in *labels*."""
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / counts.sum()
    # Every count returned by np.unique is >= 1, so log2 never sees zero.
    return float(-np.sum(probs * np.log2(probs)))


def id3(X, y):
    """Build a decision tree by recursively splitting on the best gain ratio.

    Each feature is treated as categorical: a split creates one branch per
    distinct value present in the column. Columns with many distinct values
    (e.g. continuous measurements) should therefore be discretised before
    calling this function, otherwise nearly every row gets its own branch.

    The split criterion is the gain ratio,
    ``(H(y) - H(y | feature)) / split_info(feature)``, where ``split_info``
    is the entropy of the feature's own value distribution.

    Args:
        X: pandas DataFrame of features, sharing its index with ``y``.
        y: pandas Series of class labels.

    Returns:
        Either a single label (leaf), or a nested dict of the form
        ``{feature: {value: subtree_or_label, ...}}``.

    Raises:
        ValueError: if ``y`` is empty.
    """
    if len(y) == 0:
        raise ValueError("id3() requires at least one training sample")
    # Pure node: every sample has the same label.
    if len(np.unique(y)) == 1:
        return y.iloc[0]
    # No features left to split on: fall back to the majority label.
    if len(X.columns) == 0:
        return y.mode()[0]

    total = len(y)
    # Entropy of the labels before splitting. The information gain of a
    # feature is measured against this, not against the entropy of the
    # feature's own values (which is the split information).
    parent_entropy = _entropy(y)

    gain_ratios = []
    for feature in X.columns:
        column = X[feature]
        conditional_entropy = 0.0
        split_info = 0.0
        for value in np.unique(column):
            subset = y[column == value]
            weight = len(subset) / total
            # A zero weight can occur for values that never compare equal
            # to themselves (NaN); skip them to avoid log2(0).
            if weight > 0:
                conditional_entropy += weight * _entropy(subset)
                split_info -= weight * np.log2(weight)
        # A feature with a single distinct value has zero split information
        # and cannot partition the data, so it is not a candidate.
        if split_info > 0:
            info_gain = parent_entropy - conditional_entropy
            gain_ratios.append((feature, info_gain / split_info))

    # No feature can split the data any further: majority label.
    if not gain_ratios:
        return y.mode()[0]

    # max() keeps the first feature (in column order) among ties.
    best_feature = max(gain_ratios, key=lambda pair: pair[1])[0]

    # Recurse into one branch per distinct value of the chosen feature.
    tree = {best_feature: {}}
    best_column = X[best_feature]
    for value in np.unique(best_column):
        mask = best_column == value
        sub_X = X[mask].drop(columns=[best_feature])
        sub_y = y[mask]
        tree[best_feature][value] = id3(sub_X, sub_y)
    return tree

# Fit the tree on the full dataset and show the resulting nested dict.
tree = id3(X=X, y=y)
print(tree)


{'concave points_mean': {0.0: 0, 0.001852: 0, 0.002404: 0, 0.002924: 0, 0.002941: 0, 0.003125: 0, 0.003261: 0, 0.003333: 0, 0.003472: 0, 0.004167: 0, 0.004419: 0, 0.005051: 0, 0.005128: 0, 0.005159: 0, 0.005449: 0, 0.005495: 0, 0.005592: 0, 0.005664: 0, 0.005769: 0, 0.005917: 0, 0.00625: 0, 0.006423: 0, 0.006434: 0, 0.006588: 0, 0.007246: 0, 0.007583: 0, 0.007799: 0, 0.007875: 0, 0.007937: 0, 0.00816: 0, 0.008488: 0, 0.008507: 0, 0.008535: 0, 0.008829: 0, 0.008907: 0, 0.009259: 0, 0.009615: 0, 0.009937: 0, 0.01043: 0, 0.01071: 0, 0.01076: 0, 0.01105: 0, 0.01108: 0, 0.01111: 0, 0.01115: 0, 0.01116: 0, 0.01117: 0, 0.01141: 0, 0.01148: 0, 0.01149: 0, 0.01162: 0, 0.01171: 0, 0.01201: 0, 0.01216: 0, 0.01238: 0, 0.01256: 0, 0.01257: 0, 0.01261: 0, 0.01282: 0, 0.0129: 0, 0.01313: 0, 0.01329: 0, 0.0133: 0, 0.01339: 0, 0.01349: 0, 0.01364: 0, 0.01369: 0, 0.0137: 0, 0.01374: 0, 0.01393: 0, 0.01404: 0, 0.01406: 0, 0.01407: 0, 0.01428: 0, 0.01471: 0, 0.01473: 0, 0.01499: 0, 0.01502: 0, 0.01504: 0,