Write a program to demonstrate the working of the decision-tree-based ID3 algorithm.
Use an appropriate data set to build the decision tree, then apply the learned tree to
classify a new sample.

In [28]:
import pandas as pd
import numpy as np
import math

In [29]:
# Function to calculate the entropy of a dataset
def calculate_entropy(data):
    """Return the Shannon entropy (in bits) of the values in ``data``.

    Parameters
    ----------
    data : pandas.Series
        Column of class labels; anything exposing ``value_counts()`` and
        ``len()`` works.

    Returns
    -------
    int or float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. An input with no counted values yields ``0``.
    """
    size = len(data)
    # Relative frequency of every distinct label, in value_counts() order.
    shares = [occurrences / size for occurrences in data.value_counts()]

    entropy = 0
    for share in shares:
        entropy -= share * math.log2(share)
    return entropy

In [30]:
# Function to calculate information gain for a given feature
def calculate_information_gain(data, feature, target):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain the ``feature`` and ``target`` columns.
    feature : str
        Name of the column whose distinct values define the partitions.
    target : str
        Name of the class-label column.

    Returns
    -------
    int or float
        Entropy of the target over all rows minus the size-weighted entropy
        of the target within each partition.
    """
    parent_entropy = calculate_entropy(data[target])
    row_count = len(data)

    # Accumulate each partition's entropy, weighted by its share of the rows.
    conditional_entropy = 0
    for level in data[feature].unique():
        partition = data[data[feature] == level]
        weight = len(partition) / row_count
        conditional_entropy += weight * calculate_entropy(partition[target])

    return parent_entropy - conditional_entropy

In [31]:
# Function to choose the best feature based on the highest information gain
def choose_best_feature(data, target, features):
    """Pick the feature whose split yields the highest information gain.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate.
    target : str
        Name of the class-label column.
    features : iterable of str
        Candidate column names.

    Returns
    -------
    str or None
        The winning feature name. The earliest candidate wins a tie, and
        ``None`` is returned when ``features`` is empty.
    """
    # Score every candidate first, then scan for the top score.
    scored = [
        (calculate_information_gain(data, candidate, target), candidate)
        for candidate in features
    ]

    winner = None
    top_gain = float('-inf')
    for gain, candidate in scored:
        # Strict comparison keeps the first of several equally good features.
        if gain > top_gain:
            winner, top_gain = candidate, gain
    return winner

In [32]:
# Function to build the decision tree using ID3 algorithm
def id3(data, target, features):
    """Recursively build an ID3 decision tree.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows for the current node.
    target : str
        Name of the class-label column.
    features : sequence of str
        Columns still available for splitting.

    Returns
    -------
    dict or label
        A leaf is a bare class label. An internal node is a one-key dict
        ``{feature: {value: subtree, ...}}`` with one branch per distinct
        value of ``feature`` seen in ``data``.
    """
    labels = data[target]

    # Pure node: every row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Nothing left to split on: fall back to the most frequent label.
    if len(features) == 0:
        return labels.mode()[0]

    split_on = choose_best_feature(data, target, features)
    unused = [name for name in features if name != split_on]

    # One subtree per observed value of the chosen feature, in first-seen order.
    branches = {
        level: id3(data[data[split_on] == level], target, unused)
        for level in data[split_on].unique()
    }
    return {split_on: branches}

In [33]:
# Function to classify a new sample using the decision tree
def classify(tree, sample):
    """Predict the label of ``sample`` by walking ``tree`` from the root.

    Parameters
    ----------
    tree : dict or label
        A one-key dict ``{feature: {value: subtree, ...}}`` for an internal
        node, or a bare label for a leaf.
    sample : mapping
        Anything indexable by feature name (dict, pandas Series, ...).

    Returns
    -------
    label or None
        The label at the leaf that was reached, or ``None`` when the sample
        holds a feature value with no branch in the tree.
    """
    node = tree
    # Descend until a leaf (any non-dict value) is reached.
    while isinstance(node, dict):
        feature = list(node)[0]
        branches = node[feature]
        observed = sample[feature]
        if observed not in branches:
            return None  # Value never seen under this node
        node = branches[observed]
    return node


In [34]:
# Function to print the decision tree in an arrow and line format
def print_tree(tree, indent=""):
    """Print ``tree`` to stdout as an indented text diagram.

    Parameters
    ----------
    tree : dict or label
        A one-key dict ``{feature: {value: subtree, ...}}`` for an internal
        node, or a bare label for a leaf.
    indent : str, optional
        Prefix written before every line of this node; grows by a fixed
        string for each level of nesting.
    """
    # Leaf: a single line holding the label.
    if not isinstance(tree, dict):
        print(f"{indent}|-- {tree}")
        return

    feature = list(tree)[0]
    child_indent = indent + "       |"

    print(f"{indent}|")
    print(f"{indent}|-- {feature} ?")
    for value, subtree in tree[feature].items():
        print(f"{indent}|   If {feature} = {value}:")
        print_tree(subtree, child_indent)

In [35]:
# Load the weather dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("./Data/weather.csv")
# Preview the first five rows as a quick check of the loaded columns.
data.head(5)

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,Sunny,Hot,High,False,No
1,Sunny,Hot,High,True,No
2,Sunny,Mild,High,False,Yes
3,Sunny,Cool,High,False,Yes
4,Sunny,Mild,High,False,Yes


In [36]:
# Name the class-label column and the attributes the tree may split on
target = 'Play'
features = ['Outlook', 'Temperature', 'Humidity', 'Windy']

# Learn the decision tree from the loaded data with ID3
tree = id3(data, target, features)

# Show the learned tree
print("Decision Tree:")
print_tree(tree)

# Describe the unseen example as a pandas Series keyed by feature name
new_sample = pd.Series({
    'Outlook': 'Sunny',
    'Temperature': 'Mild',
    'Humidity': 'High',
    'Windy': True,
})

# Walk the tree with the example and report the predicted label
result = classify(tree, new_sample)
print("\nClassification result for the new sample:", result)


Decision Tree:
|
|-- Temperature ?
|   If Temperature = Hot:
       ||-- No
|   If Temperature = Mild:
       ||
       ||-- Outlook ?
       ||   If Outlook = Sunny:
       |       ||-- Yes
       ||   If Outlook = Overcast:
       |       ||-- No
|   If Temperature = Cool:
       ||
       ||-- Outlook ?
       ||   If Outlook = Sunny:
       |       ||
       |       ||-- Windy ?
       |       ||   If Windy = False:
       |       |       ||-- Yes
       |       ||   If Windy = True:
       |       |       ||-- No
       ||   If Outlook = Rain:
       |       ||-- No

Classification result for the new sample: Yes
