In [20]:
import numpy as np
import pandas as pd
# Location of the Play Tennis CSV. This is an absolute path, so the cell only
# runs on a machine where /content/play_tennis.csv exists.
# NOTE(review): "/content" looks like the Google Colab working directory — confirm.
path="/content/play_tennis.csv"
# Load the table once; the later cells all read this module-level `df`.
df=pd.read_csv(path)
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [21]:
def entropy(target_col):
    """Return the Shannon entropy (in bits) of the labels in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Class labels; anything ``np.unique`` accepts.

    Returns
    -------
    number
        ``sum(-p * log2(p))`` over the distinct labels, where ``p`` is each
        label's relative frequency. An empty input yields ``0``.
    """
    _, class_counts = np.unique(target_col, return_counts=True)
    n_samples = np.sum(class_counts)
    # Fold the per-class terms left to right, starting from 0.
    return sum(
        (-(count / n_samples) * np.log2(count / n_samples) for count in class_counts),
        0,
    )
# Entropy of the 'play' column over the whole frame, printed unrounded.
target_entropy = entropy(df['play'])
print(f"Entropy of System: {target_entropy}")



Entropy of System: 0.9402859586706311


In [22]:
def info_gain(data, split_attribute, target_name="play"):
    """Return the information gain of splitting ``data`` on ``split_attribute``.

    The gain is the entropy of the target column minus the weighted average
    of the target entropies inside each branch of the split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both ``split_attribute`` and ``target_name``.
    split_attribute : str
        Column whose distinct values define the branches.
    target_name : str, default "play"
        Column holding the class labels.

    Returns
    -------
    float
        ``entropy(target) - sum(weight_v * entropy(target | attribute == v))``.
    """
    total_entropy = entropy(data[target_name])
    values, counts = np.unique(data[split_attribute], return_counts=True)
    # Share of rows falling into each branch; computed once for all branches.
    weights = counts / np.sum(counts)
    # Select each branch with a boolean mask on the split column only, so a
    # missing value in some unrelated column cannot remove rows from a branch
    # (which would make the branch entropy disagree with its weight).
    weighted_entropy = np.sum([
        weight * entropy(data.loc[data[split_attribute] == value, target_name])
        for value, weight in zip(values, weights)
    ])
    return total_entropy - weighted_entropy

# Information gain of the 'outlook' column against the default target ('play').
gain = info_gain(df, 'outlook')
print(f"Information Gain of splitting on Outlook: {gain}")

Information Gain of splitting on Outlook: 0.24674981977443933


In [23]:
def entropy(target_col, attribute_name=""):
    """Return the Shannon entropy (in bits) of ``target_col``, optionally logging it.

    Parameters
    ----------
    target_col : array-like
        Class labels; anything ``np.unique`` accepts.
    attribute_name : str, default ""
        Label used in the printed message. Nothing is printed when it is empty.

    Returns
    -------
    numpy.float64
        ``sum(-p * log2(p))`` over the distinct labels; ``0.0`` for an empty
        or single-class input.
    """
    _, counts = np.unique(target_col, return_counts=True)
    # Relative frequency of each class; the total is computed once.
    probabilities = counts / np.sum(counts)
    # A single-class column yields the lone term -1.0 * log2(1.0) == -0.0.
    # Depending on how the installed NumPy reduces signed zeros, the sum may
    # come back as -0.0 and print as "-0.0000"; adding 0.0 normalises it.
    entropy_val = np.sum(-probabilities * np.log2(probabilities)) + 0.0

    if attribute_name:  # Only print if attribute name is provided
        print(f"Entropy of '{attribute_name}': {entropy_val:.4f}")

    return entropy_val

# Function to calculate information gain
def info_gain(data, split_attribute, target_name="play"):
    """Return the information gain of splitting on ``split_attribute``, logging each step.

    Prints the total target entropy, the entropy of every branch (via
    ``entropy``), the weighted branch entropy and the resulting gain.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both ``split_attribute`` and ``target_name``.
    split_attribute : str
        Column whose distinct values define the branches.
    target_name : str, default "play"
        Column holding the class labels.

    Returns
    -------
    float
        Total target entropy minus the weighted entropy of the branches.
    """
    print(f"\nCalculating Information Gain for attribute: '{split_attribute}'")

    # Calculate the entropy of the entire dataset
    total_entropy = entropy(data[target_name], "System")
    print(f"Total entropy of the dataset for target '{target_name}': {total_entropy:.4f}")

    # Find unique values in the column (split_attribute)
    values, counts = np.unique(data[split_attribute], return_counts=True)
    # Share of rows falling into each branch; computed once for all branches.
    weights = counts / np.sum(counts)

    # Weighted entropy after splitting. Each branch is selected with a boolean
    # mask on the split column only, so a missing value in some unrelated
    # column cannot remove rows from a branch.
    weighted_entropy = np.sum([
        weight * entropy(
            data.loc[data[split_attribute] == value, target_name],
            f"{split_attribute} = {value}",
        )
        for value, weight in zip(values, weights)
    ])

    print(f"Weighted Entropy for '{split_attribute}': {weighted_entropy:.4f}")

    # Information Gain
    information_gain = total_entropy - weighted_entropy
    print(f"Information Gain for attribute '{split_attribute}': {information_gain:.4f}\n")

    return information_gain

# Function to find the attribute with maximum Information Gain
def find_best_attribute(data, features, target_name="play"):
    """Pick the feature with the largest information gain.

    Calls ``info_gain`` once per entry of ``features`` (in order) and prints
    the winner. On ties the feature that appears first wins.

    Parameters
    ----------
    data : table passed through unchanged to ``info_gain``.
    features : iterable of column names to evaluate.
    target_name : str, default "play"
        Target column name passed through to ``info_gain``.

    Returns
    -------
    tuple
        ``(best_feature, best_gain)``.
    """
    print("Calculating Information Gain for each attribute...\n")

    gain_by_feature = {}
    for candidate in features:
        gain_by_feature[candidate] = info_gain(data, candidate, target_name)

    # max() keeps the first maximal entry, i.e. the earliest feature on ties.
    best_feature, best_gain = max(gain_by_feature.items(), key=lambda entry: entry[1])
    print(f"Best attribute to split on is: '{best_feature}' with Information Gain: {best_gain:.4f}")
    return best_feature, best_gain
# Candidate features: every column of `df` except the last one.
# NOTE(review): this is positional — it assumes the target is the final column
# and that no ID/index column is present among the rest; confirm against the CSV.
features = df.columns[:-1]
# Evaluate every candidate on the full frame (prints a per-attribute log).
best_attribute, max_gain = find_best_attribute(df, features)


Calculating Information Gain for each attribute...


Calculating Information Gain for attribute: 'outlook'
Entropy of 'System': 0.9403
Total entropy of the dataset for target 'play': 0.9403
Entropy of 'outlook = Overcast': 0.0000
Entropy of 'outlook = Rain': 0.9710
Entropy of 'outlook = Sunny': 0.9710
Weighted Entropy for 'outlook': 0.6935
Information Gain for attribute 'outlook': 0.2467


Calculating Information Gain for attribute: 'temp'
Entropy of 'System': 0.9403
Total entropy of the dataset for target 'play': 0.9403
Entropy of 'temp = Cool': 0.8113
Entropy of 'temp = Hot': 1.0000
Entropy of 'temp = Mild': 0.9183
Weighted Entropy for 'temp': 0.9111
Information Gain for attribute 'temp': 0.0292


Calculating Information Gain for attribute: 'humidity'
Entropy of 'System': 0.9403
Total entropy of the dataset for target 'play': 0.9403
Entropy of 'humidity = High': 0.9852
Entropy of 'humidity = Normal': 0.5917
Weighted Entropy for 'humidity': 0.7885
Information Gain for attribute 'humid

In [24]:
class Node:
    """A node of the decision tree: either a decision node or a leaf.

    Attributes are set directly from the constructor arguments:
    ``name`` (attribute tested at a decision node), ``children`` (mapping
    filled by the tree builder), ``is_leaf`` and ``classification``.
    """

    def __init__(self, name=None, children=None, is_leaf=False, classification=None):
        # Give every node its own dict when none is supplied.
        if children is None:
            children = {}
        self.name = name
        self.children = children
        self.is_leaf = is_leaf
        self.classification = classification

    def __repr__(self):
        """Render a leaf as its classification, otherwise the attribute and its children."""
        if not self.is_leaf:
            return f"Decision Node: {self.name} -> {self.children}"
        return f"Leaf Node: {self.classification}"


In [25]:
def build_tree(data, features, target_name="play"):
    """Recursively build a decision tree of ``Node`` objects.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows reaching this point of the tree.
    features : sequence of column names still available for splitting.
    target_name : str, default "play"
        Column holding the class labels.

    Returns
    -------
    Node
        A leaf when every row shares one label (that label) or when no
        features remain (the most frequent label); otherwise a decision node
        on the attribute chosen by ``find_best_attribute``, with one child per
        distinct value of that attribute.
    """
    labels = np.unique(data[target_name])

    # Pure subset: nothing left to decide.
    if len(labels) == 1:
        return Node(is_leaf=True, classification=labels[0])

    # Out of attributes: fall back to the most frequent label.
    if len(features) == 0:
        return Node(is_leaf=True, classification=data[target_name].mode()[0])

    best_attribute, _ = find_best_attribute(data, features, target_name)

    # The chosen attribute is not offered again further down this branch.
    remaining = [feature for feature in features if feature != best_attribute]

    # One subtree per distinct value of the chosen attribute.
    branches = {
        value: build_tree(data[data[best_attribute] == value], remaining, target_name)
        for value in np.unique(data[best_attribute])
    }
    return Node(name=best_attribute, children=branches)


In [26]:
def predict(tree, instance):
    """Classify ``instance`` by walking ``tree`` from the root to a leaf.

    Parameters
    ----------
    tree : node object exposing ``is_leaf``, ``classification``, ``name`` and
        ``children`` (a mapping from attribute value to child node).
    instance : mapping-like
        Indexed with each visited node's ``name`` to obtain the attribute value.

    Returns
    -------
    The ``classification`` of the leaf that is reached.

    Raises
    ------
    KeyError
        If ``instance`` lacks an attribute the tree tests, or if its value for
        that attribute has no branch in the tree. In the latter case the
        message names the attribute, the value and the known branches.
    """
    node = tree
    # Iterative descent: same path as the recursive form.
    while not node.is_leaf:
        attribute_value = instance[node.name]
        try:
            node = node.children[attribute_value]
        except KeyError:
            # The tree only has branches for values seen when it was built.
            raise KeyError(
                f"no branch for value {attribute_value!r} of attribute {node.name!r}; "
                f"known values: {list(node.children)!r}"
            ) from None
    return node.classification
# Candidate features: every column of `df` except the last one.
# NOTE(review): positional selection — assumes the target is the final column; confirm.
features = df.columns[:-1]

# Build the decision tree on the full frame; each split prints the
# information-gain log emitted by find_best_attribute.
decision_tree = build_tree(df, features)

# Print the decision tree structure (uses Node.__repr__, nested on one line)
print("Decision Tree Structure:")
print(decision_tree)

# Smoke-test the tree on a row taken from the same frame it was built from,
# so this checks the traversal rather than measuring accuracy.
test_instance = df.iloc[0]  # First row of the dataset, as a Series
prediction = predict(decision_tree, test_instance)
print(f"Prediction for test instance {test_instance.to_dict()}: {prediction}")


Calculating Information Gain for each attribute...


Calculating Information Gain for attribute: 'outlook'
Entropy of 'System': 0.9403
Total entropy of the dataset for target 'play': 0.9403
Entropy of 'outlook = Overcast': 0.0000
Entropy of 'outlook = Rain': 0.9710
Entropy of 'outlook = Sunny': 0.9710
Weighted Entropy for 'outlook': 0.6935
Information Gain for attribute 'outlook': 0.2467


Calculating Information Gain for attribute: 'temp'
Entropy of 'System': 0.9403
Total entropy of the dataset for target 'play': 0.9403
Entropy of 'temp = Cool': 0.8113
Entropy of 'temp = Hot': 1.0000
Entropy of 'temp = Mild': 0.9183
Weighted Entropy for 'temp': 0.9111
Information Gain for attribute 'temp': 0.0292


Calculating Information Gain for attribute: 'humidity'
Entropy of 'System': 0.9403
Total entropy of the dataset for target 'play': 0.9403
Entropy of 'humidity = High': 0.9852
Entropy of 'humidity = Normal': 0.5917
Weighted Entropy for 'humidity': 0.7885
Information Gain for attribute 'humid