# ID3, C4.5 and Gini Index Calculation

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import math

In [2]:
# Load the workbook tree_example.xlsx, then keep every column except the first one
df = pd.read_excel('tree_example.xlsx')
df = df.iloc[:, 1:]
df

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


## Calculate ID3 (Information Gain)

In [3]:
# Hand-computed ID3 quantities using math.log with base 2. Every fraction below is a
# hard-coded count ratio; nothing in this cell is derived from df.

# Entropy of the parent (root) node: class proportions 5/14 and 9/14.
parent_entropy=-(5/14)* math.log(5/14,2)-(9/14)*math.log(9/14,2)
# Weighted entropy after splitting on Outlook: two branches of weight 5/14, each with a
# 2/5 vs 3/5 class mix. No third term is written, i.e. the remaining branch is treated as
# contributing zero entropy (presumably the pure Overcast branch -- confirm against the data).
Outlook_entropy=5/14*(-(2/5)*math.log(2/5,2)-(3/5)*math.log(3/5,2))+5/14*(-(3/5)*math.log(3/5,2)-(2/5)*math.log(2/5,2))
# Information gain of the Outlook split = parent entropy - weighted child entropy.
Outlook_infogain=parent_entropy-Outlook_entropy
# Entropy inside the Outlook == Sunny branch (2/5 vs 3/5 class mix).
Outlook_Sunny_entropy=-(2/5)*math.log(2/5,2)-(3/5)*math.log(3/5,2)
# Gain of splitting the Sunny branch on Temperature: the three subtracted terms are the
# weighted entropies of the Temperature values; only one of them (weight 2/5, entropy 1)
# is non-zero.
Outlook_Sunny_Temp_gain=Outlook_Sunny_entropy-0-2/5*1-0


# Print the hand-computed values so they can be compared with the function-based results.
print('parent_entropy:',parent_entropy)
print('Outlook_entropy:',Outlook_entropy)
print('Outlook_infogain:',Outlook_infogain)
print('Outlook_Sunny_entropy:',Outlook_Sunny_entropy)
print('Outlook_Sunny_Temp_gain:',Outlook_Sunny_Temp_gain)


parent_entropy: 0.9402859586706309
Outlook_entropy: 0.6935361388961918
Outlook_infogain: 0.2467498197744391
Outlook_Sunny_entropy: 0.9709505944546686
Outlook_Sunny_Temp_gain: 0.5709505944546686


### Build Function

In [4]:
# Use the last column of df as the target (class label) attribute for the
# information-gain calculations in this cell.
target_attribute = df.columns[-1]

def entropy(target_col):
    """Return the base-2 Shannon entropy of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Class labels (anything ``np.unique`` accepts).

    Returns
    -------
    numpy floating scalar
        ``sum(-p * log2(p))`` over the relative frequency ``p`` of each
        distinct value; 0.0 for an empty input.
    """
    _, freq = np.unique(target_col, return_counts=True)
    n_total = np.sum(freq)
    # One term per distinct label, in the sorted order np.unique yields.
    terms = [(-c / n_total) * np.log2(c / n_total) for c in freq]
    return np.sum(terms)

def InfoGain(data, split_attribute_name, target_name):
    """Return the information gain obtained by splitting ``data`` on one attribute.

    gain = entropy(target) - sum_v (|S_v| / |S|) * entropy(target restricted to S_v)

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the split attribute and the target column.
    split_attribute_name : str
        Column whose distinct values define the branches.
    target_name : str
        Column holding the class labels.

    Returns
    -------
    numpy floating scalar
        Entropy of the target minus the weighted entropy of the branches.
    """
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    n_rows = np.sum(counts)
    # Select each branch's target values with a boolean mask. The earlier
    # data.where(mask).dropna() form also discarded rows that merely had a
    # missing value in some unrelated column, while `counts` still included
    # them, so branch weight and branch entropy could disagree.
    Weighted_Entropy = np.sum([
        (count / n_rows) * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, count in zip(vals, counts)
    ])
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

# Step 1: score every non-target column of the full table by its information gain
info_gains = dict(
    (col, InfoGain(df, col, target_attribute))
    for col in df.columns
    if col != target_attribute
)
print("Information Gain for each attribute:", info_gains, sep="\n")

# Step 2: the first split uses the column with the largest gain
# (on ties, the column that appears first wins)
best_attribute = max(info_gains, key=lambda name: info_gains[name])
print(f"\nBest attribute for first split: {best_attribute}: {info_gains[best_attribute]:.3f}")

# Step 3: Split the dataset and calculate Information Gain for the second layer
def split_dataset(df, attribute):
    """Partition ``df`` by the distinct values of ``attribute``.

    Returns a dict mapping each distinct value (in order of first appearance)
    to the rows of ``df`` whose ``attribute`` equals that value.
    """
    partitions = {}
    for value in df[attribute].unique():
        partitions[value] = df[df[attribute] == value]
    return partitions

subsets = split_dataset(df, best_attribute)
# For every branch of the first split, score the remaining columns
# (everything except the target and the attribute already used).
info_gains_subsets = {
    subset_key: {
        col: InfoGain(subset, col, target_attribute)
        for col in subset.columns
        if col != target_attribute and col != best_attribute
    }
    for subset_key, subset in subsets.items()
}
print("\nInformation Gain for second layer splits:")
for subset_key, gains in info_gains_subsets.items():
    print(f"{subset_key}: {gains}")

# Step 4: Choose the gain>0 across all subsets
# Branches whose best gain is 0 are skipped (nothing left to separate). The
# `gains and ...` guard avoids max() raising on an empty dict when no
# candidate columns remain.
best_attribute_subsets = {
    subset_key: max(gains, key=gains.get)
    for subset_key, gains in info_gains_subsets.items()
    if gains and max(gains.values()) > 0
}
print("\nBest attribute for third split:")
# Use a dedicated loop name: a `for` target is an ordinary assignment in the
# enclosing scope, so reusing `best_attribute` here would overwrite the
# first-split choice made above.
for subset_key, subset_best in best_attribute_subsets.items():
    print(f"{subset_key}: {subset_best} {info_gains_subsets[subset_key][subset_best]:.3f}")


Information Gain for each attribute:
{'Outlook': 0.24674981977443933, 'Temperature': 0.02922256565895487, 'Humidity': 0.15183550136234159, 'Wind': 0.04812703040826949}

Best attribute for first split: Outlook: 0.247

Information Gain for second layer splits:
Sunny: {'Temperature': 0.5709505944546686, 'Humidity': 0.9709505944546686, 'Wind': 0.01997309402197489}
Overcast: {'Temperature': 0.0, 'Humidity': 0.0, 'Wind': 0.0}
Rain: {'Temperature': 0.01997309402197489, 'Humidity': 0.01997309402197489, 'Wind': 0.9709505944546686}

Best attribute for third split:
Sunny: Humidity 0.971
Rain: Wind 0.971


## Calculate C4.5 (Gain Ratio)

### Build Function

In [7]:
# Use the last column of df as the target (class label) attribute for the
# gain-ratio calculations in this cell.
target_attribute = df.columns[-1]

def entropy(target_col):
    """Base-2 Shannon entropy of the label distribution in ``target_col``.

    Accepts anything ``np.unique`` can count and returns a numpy floating
    scalar; an empty input yields 0.0.
    """
    _, label_counts = np.unique(target_col, return_counts=True)
    n_total = np.sum(label_counts)
    contributions = []
    for count in label_counts:
        share = count / n_total
        contributions.append(-share * np.log2(share))
    return np.sum(contributions)

def GainRatio(data, split_attribute_name, target_name):
    """Return the gain ratio of splitting ``data`` on one attribute.

    gain ratio = information gain / split information, where split information
    is the entropy of the split attribute's own value distribution.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the split attribute and the target column.
    split_attribute_name : str
        Column whose distinct values define the branches.
    target_name : str
        Column holding the class labels.

    Returns
    -------
    numpy floating scalar
        The gain ratio, or 0.0 when the split information is zero (the
        attribute takes a single value in ``data``, or ``data`` is empty), in
        which case the attribute cannot separate anything.
    """
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    n_rows = np.sum(counts)
    # Boolean-mask the target column directly; where(mask).dropna() would also
    # drop rows that only have a missing value in an unrelated column.
    Weighted_Entropy = np.sum([
        (count / n_rows) * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, count in zip(vals, counts)
    ])
    Information_Gain = total_entropy - Weighted_Entropy
    SplitInfo = entropy(data[split_attribute_name])
    if SplitInfo == 0:
        # Dividing by zero here would yield NaN (plus a RuntimeWarning), and
        # NaN makes later max(...) comparisons order-dependent.
        return np.float64(0.0)
    Ratio = Information_Gain / SplitInfo
    return Ratio

# Step 1: score every non-target column of the full table by its gain ratio
gain_ratios = dict(
    (col, GainRatio(df, col, target_attribute))
    for col in df.columns
    if col != target_attribute
)
print("GainRatio for each attribute:", gain_ratios, sep="\n")

# Step 2: the first split uses the column with the largest gain ratio
# (on ties, the column that appears first wins)
best_attribute = max(gain_ratios, key=lambda name: gain_ratios[name])
print(f"\nBest attribute for first split: {best_attribute}: {gain_ratios[best_attribute]:.3f}")

# Step 3: Split the dataset and calculate Gain Ratio for the second layer
def split_dataset(df, attribute):
    """Group the rows of ``df`` by the value they hold in ``attribute``.

    The result maps each distinct value, in order of first appearance, to the
    sub-frame of rows carrying that value.
    """
    column = df[attribute]
    by_value = {}
    for value in column.unique():
        matches = column == value
        by_value[value] = df[matches]
    return by_value

subsets = split_dataset(df, best_attribute)
# For every branch of the first split, compute the gain ratio of the remaining
# columns. NOTE: despite its name, info_gains_subsets holds gain ratios in this
# cell; the name is kept so the cell's variables stay unchanged.
info_gains_subsets = {
    subset_key: {
        col: GainRatio(subset, col, target_attribute)
        for col in subset.columns
        if col != target_attribute and col != best_attribute
    }
    for subset_key, subset in subsets.items()
}
print("\nGainRatio for second layer splits:")
for subset_key, gains in info_gains_subsets.items():
    print(f"{subset_key}: {gains}")

# Step 4: Choose the gain>0 across all subsets
# Branches whose best gain ratio is 0 are skipped. The `gains and ...` guard
# avoids max() raising on an empty dict when no candidate columns remain.
best_attribute_subsets = {
    subset_key: max(gains, key=gains.get)
    for subset_key, gains in info_gains_subsets.items()
    if gains and max(gains.values()) > 0
}
print("\nBest attribute for third split:")
# Use a dedicated loop name: a `for` target is an ordinary assignment in the
# enclosing scope, so reusing `best_attribute` here would overwrite the
# first-split choice made above.
for subset_key, subset_best in best_attribute_subsets.items():
    print(f"{subset_key}: {subset_best} {info_gains_subsets[subset_key][subset_best]}")


GainRatio for each attribute:
{'Outlook': 0.15642756242117528, 'Temperature': 0.018772646222418813, 'Humidity': 0.15183550136234159, 'Wind': 0.048848615511520824}

Best attribute for first split: Outlook: 0.156

GainRatio for second layer splits:
Sunny: {'Temperature': 0.37514952012034747, 'Humidity': 1.0, 'Wind': 0.020570659450692974}
Overcast: {'Temperature': 0.0, 'Humidity': 0.0, 'Wind': 0.0}
Rain: {'Temperature': 0.020570659450692974, 'Humidity': 0.020570659450692974, 'Wind': 1.0}

Best attribute for third split:
Sunny: Humidity 1.0
Rain: Wind 1.0


## Calculate Gini Index

### Build Function

In [8]:
# Use the last column of df as the target (class label) attribute for the
# Gini calculations in this cell.
target_attribute = df.columns[-1]

def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Parameters
    ----------
    groups : iterable of list
        One list of class labels per branch; it is iterated twice and each
        member must support ``len`` and ``.count``.
    classes : iterable
        The class labels to measure.

    Returns
    -------
    float
        ``sum_g (|g| / N) * (1 - sum_c p_{g,c}^2)``; empty branches are
        ignored and 0.0 is returned when there are no branches at all.
    """
    total_rows = float(sum(len(members) for members in groups))
    weighted_impurity = 0.0
    for members in groups:
        n_members = float(len(members))
        if n_members > 0:
            # Fraction of this branch belonging to each class.
            shares = [members.count(label) / n_members for label in classes]
            purity = sum((share * share for share in shares), 0.0)
            weighted_impurity += (1.0 - purity) * (n_members / total_rows)
    return weighted_impurity

def calculate_gini_index(df, target_attribute, exclude_attribute=None):
    """Score each candidate column of ``df`` by the Gini impurity of splitting on it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the candidate columns and the target column.
    target_attribute : str
        Column holding the class labels; never scored.
    exclude_attribute : str, optional
        An additional column to leave out of the scoring.

    Returns
    -------
    dict
        ``{column: weighted Gini impurity rounded to 3 decimals}`` in column order.
    """
    labels = list(df[target_attribute].unique())
    return {
        column: round(
            gini_index(df.groupby(column)[target_attribute].apply(list), labels),
            3,
        )
        for column in df.columns
        if column != target_attribute and column != exclude_attribute
    }

def split_dataset(df, attribute):
    """Return ``{value: rows of df where attribute == value}``.

    Keys follow the order in which the values first appear in ``df``.
    """
    return dict(
        (value, df[df[attribute] == value])
        for value in df[attribute].unique()
    )

def calculate_gini_index_for_subsets(subsets, target_attribute, exclude_attribute):
    """Apply ``calculate_gini_index`` to every sub-frame in ``subsets``.

    ``subsets`` maps a branch key to a DataFrame; the result maps the same keys
    (in the same order) to that branch's per-column Gini scores, with
    ``target_attribute`` and ``exclude_attribute`` left out of the scoring.
    """
    return {
        branch: calculate_gini_index(frame, target_attribute, exclude_attribute)
        for branch, frame in subsets.items()
    }

# Step 1: weighted Gini impurity of splitting the full table on each column
gini_scores = calculate_gini_index(df, target_attribute)
print("Gini scores for each attribute:", gini_scores, sep="\n")

# Step 2: the first split uses the column with the smallest impurity
# (on ties, the column that appears first wins)
best_attribute = min(gini_scores, key=lambda name: gini_scores[name])
print(f"\nBest attribute for first split: {best_attribute}: {gini_scores[best_attribute]:.3f}")

# Step 3: Split the dataset and calculate Gini Index for the second layer, excluding the best attribute
subsets = split_dataset(df, best_attribute)
gini_scores_subsets = calculate_gini_index_for_subsets(subsets, target_attribute, best_attribute)
print("\nGini scores for second layer splits:")
for subset_key, scores in gini_scores_subsets.items():
    print(f"{subset_key}: {scores}")

# Step 4: Choose the min giniscore and make sure no further split
# A branch is skipped when every remaining column scores 0. The comprehension
# uses `scores` rather than reusing the global name `gini_scores`.
best_attribute_subsets = {
    subset_key: min(scores, key=scores.get)
    for subset_key, scores in gini_scores_subsets.items()
    if any(score != 0 for score in scores.values())
}
print("\nBest attribute for third split:")
# Use a dedicated loop name: a `for` target is an ordinary assignment in the
# enclosing scope, so reusing `best_attribute` here would overwrite the
# first-split choice made above.
for subset_key, subset_best in best_attribute_subsets.items():
    print(f"{subset_key}: {subset_best} {gini_scores_subsets[subset_key][subset_best]}")


Gini scores for each attribute:
{'Outlook': 0.343, 'Temperature': 0.44, 'Humidity': 0.367, 'Wind': 0.429}

Best attribute for first split: Outlook: 0.343

Gini scores for second layer splits:
Sunny: {'Temperature': 0.2, 'Humidity': 0.0, 'Wind': 0.467}
Overcast: {'Temperature': 0.0, 'Humidity': 0.0, 'Wind': 0.0}
Rain: {'Temperature': 0.467, 'Humidity': 0.467, 'Wind': 0.0}

Best attribute for third split:
Sunny: Humidity 0.0
Rain: Wind 0.0


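ID3 (information gain), C4.5 (gain ratio) and the Gini index all choose the same attributes: Outlook for the first split, then Humidity within the Sunny branch and Wind within the Rain branch.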