In [1]:
import numpy as np
import pandas as pd

In [2]:
def entropy(column):
    values, counts = np.unique(column, return_counts=True)
    probabilities = counts / np.sum(counts)
    entropy_val = -np.sum(probabilities * np.log2(probabilities))
    return entropy_val


In [3]:
data = pd.read_csv("exp5.csv")

In [4]:
data

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rainy,Mild,High,Weak,Yes
4,Rainy,Cool,Normal,Weak,Yes
5,Rainy,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rainy,Mild,Normal,Weak,Yes


In [5]:
# Calculate entropy for the target variable (PlayTennis)
target_entropy = entropy(data['PlayTennis'])
print("Entropy of target variable (PlayTennis):", target_entropy)

Entropy of target variable (PlayTennis): 0.9402859586706311


In [6]:
def conditional_entropy(data, attribute, target_attribute):
    # Calculate the conditional entropy of an attribute given the target attribute
    conditional_entropy_val = 0
    for value in data[attribute].unique():
        subset = data[data[attribute] == value]
        subset_entropy = entropy(subset[target_attribute])
        conditional_probability = len(subset) / len(data)
        conditional_entropy_val += conditional_probability * subset_entropy
        print("Conditional entropy of", attribute, "|", target_attribute, "=", value, ":", subset_entropy)
    return conditional_entropy_val

def information_gain(data, attribute, target_attribute):
    # Calculate the information gain of an attribute
    attribute_entropy = entropy(data[target_attribute])
    conditional_entropy_val = conditional_entropy(data, attribute, target_attribute)
    information_gain_val = attribute_entropy - conditional_entropy_val
    return information_gain_val

In [7]:
# Calculate information gain for all predictor attributes
target_attribute = 'PlayTennis'
information_gains = {}
for column in data.columns[:-1]:  # Exclude the last column (target variable)
    information_gain_val = information_gain(data, column, target_attribute)
    information_gains[column] = information_gain_val
    print("Information gain of", column, ":", information_gain_val)

Conditional entropy of Outlook | PlayTennis = Sunny : 0.9709505944546686
Conditional entropy of Outlook | PlayTennis = Overcast : -0.0
Conditional entropy of Outlook | PlayTennis = Rainy : 0.9709505944546686
Information gain of Outlook : 0.24674981977443933
Conditional entropy of Temperature | PlayTennis = Hot : 1.0
Conditional entropy of Temperature | PlayTennis = Mild : 0.9182958340544896
Conditional entropy of Temperature | PlayTennis = Cool : 0.8112781244591328
Information gain of Temperature : 0.02922256565895487
Conditional entropy of Humidity | PlayTennis = High : 0.9852281360342515
Conditional entropy of Humidity | PlayTennis = Normal : 0.5916727785823275
Information gain of Humidity : 0.15183550136234159
Conditional entropy of Wind | PlayTennis = Weak : 0.8112781244591328
Conditional entropy of Wind | PlayTennis = Strong : 1.0
Information gain of Wind : 0.04812703040826949


In [8]:
root_attribute = max(data.columns[:-1], key=lambda col: information_gain(data, col, target_attribute))
print("Root Node Attribute:", root_attribute)


Conditional entropy of Outlook | PlayTennis = Sunny : 0.9709505944546686
Conditional entropy of Outlook | PlayTennis = Overcast : -0.0
Conditional entropy of Outlook | PlayTennis = Rainy : 0.9709505944546686
Conditional entropy of Temperature | PlayTennis = Hot : 1.0
Conditional entropy of Temperature | PlayTennis = Mild : 0.9182958340544896
Conditional entropy of Temperature | PlayTennis = Cool : 0.8112781244591328
Conditional entropy of Humidity | PlayTennis = High : 0.9852281360342515
Conditional entropy of Humidity | PlayTennis = Normal : 0.5916727785823275
Conditional entropy of Wind | PlayTennis = Weak : 0.8112781244591328
Conditional entropy of Wind | PlayTennis = Strong : 1.0
Root Node Attribute: Outlook


In [15]:
def decision_tree(data, target_attribute):
    # Recursive function to build the decision tree
    root_node = {}
    
    # Base case: if all instances have the same target value, return the target value
    if len(data[target_attribute].unique()) == 1:
        return data[target_attribute].iloc[0]
    
    # Find the attribute with the highest information gain
    best_attribute = max(data.columns[:-1], key=lambda col: information_gain(data, col, target_attribute))
    root_node['attribute'] = best_attribute
    root_node['branches'] = {}
    
    # Split the dataset based on the chosen attribute
    for value in data[best_attribute].unique():
        subset = data[data[best_attribute] == value]
        root_node['branches'][value] = decision_tree(subset.drop(columns=[best_attribute]), target_attribute)
    
    return root_node

def print_decision_tree(decision_tree, indent=''):
    if 'attribute' in decision_tree:
        print(indent + decision_tree['attribute'])
        for value, subtree in decision_tree['branches'].items():
            print(indent + '  ' + value + ':')
            print_decision_tree(subtree, indent + '    ')
    else:
        print(indent + decision_tree)

In [17]:

# Print all information gains
print("Information Gain for each attribute:")
for column in data.columns[:-1]:
    ig = information_gain(data, column, target_attribute)
    print(f"{column}: {ig}")
# Iterate over each unique value of the root node attribute
for root_node_value in data[root_attribute].unique():
    # Reduce the dataset based on the root node attribute value
    reduced_data = data[data[root_attribute] == root_node_value]
    
    # Build the decision tree for the reduced dataset
    decision_tree_root = decision_tree(reduced_data, target_attribute)
    
    # Print the decision tree for the current node value of the root attribute
    print(f"Decision Tree for {root_attribute} = {root_node_value}:")
    print_decision_tree(decision_tree_root)
    print()
    
    

Information Gain for each attribute:
Conditional entropy of Outlook | PlayTennis = Sunny : 0.9709505944546686
Conditional entropy of Outlook | PlayTennis = Overcast : -0.0
Conditional entropy of Outlook | PlayTennis = Rainy : 0.9709505944546686
Outlook: 0.24674981977443933
Conditional entropy of Temperature | PlayTennis = Hot : 1.0
Conditional entropy of Temperature | PlayTennis = Mild : 0.9182958340544896
Conditional entropy of Temperature | PlayTennis = Cool : 0.8112781244591328
Temperature: 0.02922256565895487
Conditional entropy of Humidity | PlayTennis = High : 0.9852281360342515
Conditional entropy of Humidity | PlayTennis = Normal : 0.5916727785823275
Humidity: 0.15183550136234159
Conditional entropy of Wind | PlayTennis = Weak : 0.8112781244591328
Conditional entropy of Wind | PlayTennis = Strong : 1.0
Wind: 0.04812703040826949
Conditional entropy of Outlook | PlayTennis = Sunny : 0.9709505944546686
Conditional entropy of Temperature | PlayTennis = Hot : -0.0
Conditional entrop