In [3]:
import pandas as pd

class Node:
    """One vertex of the decision tree.

    An internal node records the attribute it splits on and keeps its
    branches in ``children``; a leaf node carries its outcome in ``result``.
    """

    def __init__(self, attribute=None, value=None, result=None):
        # attribute: column this node splits on
        # value:     attribute value associated with this node
        # result:    outcome stored when the node is a leaf
        self.attribute, self.value, self.result = attribute, value, result
        # Child nodes, stored in a dictionary (a fresh one per instance)
        self.children = dict()

def calculate_gini_index(data, attribute, target):
    """Return the weighted Gini index of splitting ``data`` on ``attribute``.

    The rows are partitioned by each distinct value of ``attribute``; the
    Gini impurity of ``target`` inside every partition is weighted by the
    partition's share of the rows and the weighted impurities are summed.
    """
    column = data[attribute]
    total_rows = len(data)
    weighted_impurity = 0.0

    for level in column.unique():
        partition = data[column == level]
        partition_size = len(partition)

        # Relative frequency of every class inside this partition
        class_shares = partition[target].value_counts() / partition_size

        # Gini impurity of the partition: 1 - sum of squared class shares
        impurity = 1 - sum(class_shares ** 2)

        # Weight the impurity by the fraction of rows in the partition
        weighted_impurity += (partition_size / total_rows) * impurity

    return weighted_impurity

def build_tree(data, max_depth, depth=0, target='PlayTennis'):
    """Recursively grow a multiway decision tree using the Gini index.

    At every level the attribute with the lowest weighted Gini index is
    chosen, one child is created per observed value of that attribute, and
    the attribute's column is dropped before recursing.

    Args:
        data: DataFrame holding the attribute columns plus the ``target``
            column.
        max_depth: Depth at which growth stops and a leaf is emitted.
        depth: Depth of the node being built; top-level callers leave it
            at its default of 0.
        target: Name of the class-label column. Defaults to
            ``'PlayTennis'``.

    Returns:
        A ``Node``: either a leaf whose ``result`` is the most frequent
        label of ``data``, or an internal node whose ``children`` map each
        value of the chosen attribute to a subtree.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    labels = data[target]
    if labels.empty:
        raise ValueError("cannot build a decision tree from an empty dataset")

    # Every column except the label is a split candidate, wherever the
    # label column happens to sit in the frame.
    attributes = [column for column in data.columns if column != target]

    # Emit a leaf when the labels are pure, the depth limit is reached, or
    # no attribute is left to split on (rows that agree on every attribute
    # but disagree on the label would otherwise crash min() below).
    if labels.nunique(dropna=False) == 1 or depth == max_depth or not attributes:
        # Predict the majority class rather than whichever row comes first.
        # mode() returns the most frequent labels in sorted order, so ties
        # resolve deterministically.
        return Node(result=labels.mode(dropna=False).iloc[0])

    # Score each candidate and keep the one with the lowest Gini index
    gini_indices = {
        attribute: calculate_gini_index(data, attribute, target)
        for attribute in attributes
    }
    best_split_attribute = min(gini_indices, key=gini_indices.get)
    node = Node(attribute=best_split_attribute)

    # One branch per observed value; the used attribute is removed so it
    # cannot be selected again further down this path.
    for value in data[best_split_attribute].unique():
        subset = data[data[best_split_attribute] == value]
        node.children[value] = build_tree(
            subset.drop(columns=[best_split_attribute]),
            max_depth,
            depth + 1,
            target,
        )

    return node

def print_tree(node, depth=0):
    """Print the tree rooted at ``node`` as an indented outline.

    A node whose ``result`` is not None is printed as a single
    ``Result: ...`` line. Any other node prints its attribute name, then
    each branch value one level deeper, followed by that branch's subtree
    two levels deeper. Each level is indented by two spaces.
    """
    indent = '  ' * depth

    # Leaf: report the outcome and stop descending
    if node.result is not None:
        print(f"{indent}Result: {node.result}")
        return

    print(f"{indent}{node.attribute}:")
    branch_indent = '  ' * (depth + 1)
    for branch_value, subtree in node.children.items():
        print(f"{branch_indent}{branch_value}")
        print_tree(subtree, depth + 2)

# Read the training examples from the CSV file
data = pd.read_csv('exp5.csv')

# Grow a progressively deeper tree on each pass, narrowing the dataset
# to the rows of the first branch after every pass
max_depth = 4
for i in range(max_depth):
    print(f"Iteration {i+1}:")

    # Score every column except the last one (the target variable)
    attributes = data.columns[:-1]
    gini_indices = {
        attribute: calculate_gini_index(data, attribute, 'PlayTennis')
        for attribute in attributes
    }

    # Report the score of each attribute
    for attribute, gini_index in gini_indices.items():
        print(f"Gini index for {attribute}: {gini_index:.3f}")

    # Fit a tree whose depth limit equals the iteration number
    root_node = build_tree(data, max_depth=i+1)

    # Show the fitted tree
    print("Decision Tree:")
    print_tree(root_node)
    print()

    # Walk down the first branch at every level, keeping only the rows
    # that match the branch value taken
    current_node = root_node
    while current_node.children:
        attribute = current_node.attribute
        value, current_node = next(iter(current_node.children.items()))
        data = data[data[attribute] == value]

    # Nothing left to separate once the remaining rows share one label
    remaining_labels = data['PlayTennis'].unique()
    if len(remaining_labels) == 1:
        print("Reached pure leaf node. Stopping iterations.")
        break

    # Show the rows carried into the next iteration
    print("Reduced Dataset:")
    print(data)
    print()


Iteration 1:
Gini index for Outlook: 0.343
Gini index for Temperature: 0.440
Gini index for Humidity: 0.367
Gini index for Wind: 0.429
Decision Tree:
Outlook:
  Sunny
    Result: No
  Overcast
    Result: Yes
  Rainy
    Result: Yes

Reduced Dataset:
   Outlook Temperature Humidity    Wind PlayTennis
0    Sunny         Hot     High    Weak         No
1    Sunny         Hot     High  Strong         No
7    Sunny        Mild     High    Weak         No
8    Sunny        Cool   Normal    Weak        Yes
10   Sunny        Mild   Normal  Strong        Yes

Iteration 2:
Gini index for Outlook: 0.480
Gini index for Temperature: 0.200
Gini index for Humidity: 0.000
Gini index for Wind: 0.467
Decision Tree:
Humidity:
  High
    Result: No
  Normal
    Result: Yes

Reached pure leaf node. Stopping iterations.
