In [2]:
import pandas as pd
import numpy as np

# Read the data: load 'prof.csv' (path relative to the working directory) into a DataFrame.
# Later cells use its 'Personality', 'Difficulty', 'RMP Reviews' and 'Easy A' columns as
# attributes and 'Worth Taking' as the class label.
id3_dataset = pd.read_csv('prof.csv')
# Bare expression: it is only rendered when it is the last statement of its cell.
id3_dataset
def entropy(target, target_col='Worth Taking'):
    """Return the Shannon entropy, in bits, of the class labels in ``target``.

    Parameters
    ----------
    target : pandas.DataFrame
        Rows whose label distribution is measured.
    target_col : str, default 'Worth Taking'
        Name of the column holding the class label. The default keeps existing
        single-argument calls working unchanged.

    Returns
    -------
    numpy.float64
        ``sum(-p * log2(p))`` over the distinct labels, where ``p`` is the
        relative frequency of each label. The result is zero when every row
        carries the same label or when ``target`` has no rows.
    """
    _, counts = np.unique(target[target_col], return_counts=True)
    # Relative frequency of each distinct label; np.unique never reports a
    # zero count, so log2 is always applied to a strictly positive value.
    probabilities = counts / counts.sum()
    return np.sum(-probabilities * np.log2(probabilities))

def information_gain(target, col):
    """Return the information gain from splitting ``target`` on ``col``.

    The gain is the entropy of ``target`` minus the weighted entropy of the
    partitions obtained by grouping the rows on each distinct value of
    ``col``; each partition is weighted by its share of the rows.

    Parameters
    ----------
    target : pandas.DataFrame
        Rows to split; passed to ``entropy`` as-is.
    col : str
        Name of the attribute column to split on.
    """
    parent_entropy = entropy(target)
    n_rows = len(target)
    column = target[col]

    # One term per distinct attribute value: (partition share) * (partition entropy).
    weighted_children = []
    for value in np.unique(column):
        partition = target[column == value]
        share = len(partition) / n_rows
        weighted_children.append(share * entropy(partition))

    return parent_entropy - sum(weighted_children)

def is_done(df):
    """Print whether the node holding ``df`` is a leaf or needs another split.

    Prints exactly one line:

    * a class-value leaf message when every row has the same 'Worth Taking' value;
    * a majority-class leaf message when the labels differ but every attribute
      column holds a single value (or no attribute columns remain), so no
      further split can separate the rows;
    * ``okay`` otherwise, meaning the node can still be split.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows at the node; must contain a 'Worth Taking' column.

    Returns
    -------
    None
    """
    target_col = 'Worth Taking'
    labels = df[target_col]
    attributes = [col for col in df.columns if col != target_col]

    # Check if all data is in the same class
    if len(labels.unique()) == 1:
        print(f'make leaf node with class value as output: {labels.iloc[0]}')
    # Check if all rows share the same value in every attribute column.
    # The label column is excluded on purpose: on this path it already has
    # more than one value, so including it would make the branch unreachable.
    # The row-count guard keeps an empty frame on the 'okay' path.
    elif len(df) > 0 and all(df[col].nunique() == 1 for col in attributes):
        majority = labels.value_counts().idxmax()
        print(f'make leaf node with majority of class values in Y as output: {majority}')
    else:
        print('okay')

## First split

In [3]:
# Information gain of every candidate attribute on the full dataset,
# listed in the order: Personality, Difficulty, RMP Reviews, Easy A.
[information_gain(id3_dataset, attribute)
 for attribute in ('Personality', 'Difficulty', 'RMP Reviews', 'Easy A')]

[0.04812703040826949,
 0.24674981977443933,
 0.02922256565895487,
 0.15183550136234159]

Difficulty wins: it has the highest information gain (≈ 0.247), so the first split is on Difficulty.

In [4]:
# Partition the full dataset into one subset per Difficulty level.
d_difficulty_low = id3_dataset.loc[id3_dataset['Difficulty'].eq('Low')]
d_difficulty_med = id3_dataset.loc[id3_dataset['Difficulty'].eq('Medium')]
d_difficulty_high = id3_dataset.loc[id3_dataset['Difficulty'].eq('High')]

# Report, in Low / Medium / High order, whether each subset is already a leaf.
is_done(d_difficulty_low)
is_done(d_difficulty_med)
is_done(d_difficulty_high)

okay
okay
make leaf node with class value as output: -


In [5]:
# Information gain of the remaining attributes within the Difficulty == 'Low' subset,
# listed in the order: Personality, RMP Reviews, Easy A.
[information_gain(d_difficulty_low, attribute)
 for attribute in ('Personality', 'RMP Reviews', 'Easy A')]

[0.9709505944546686, 0.01997309402197489, 0.01997309402197489]

In [6]:
# Partition the Difficulty == 'Low' subset by Personality.
d_difficulty_low_personality_hilarious = d_difficulty_low.loc[d_difficulty_low['Personality'].eq('Hilarious')]
d_difficulty_low_personality_boring = d_difficulty_low.loc[d_difficulty_low['Personality'].eq('Boring')]

# Report, in Hilarious / Boring order, whether each subset is already a leaf.
is_done(d_difficulty_low_personality_hilarious)
is_done(d_difficulty_low_personality_boring)

make leaf node with class value as output: -
make leaf node with class value as output: +


In [7]:
# Information gain of the remaining attributes within the Difficulty == 'Medium' subset,
# listed in the order: Personality, RMP Reviews, Easy A.
[information_gain(d_difficulty_med, attribute)
 for attribute in ('Personality', 'RMP Reviews', 'Easy A')]

[0.01997309402197489, 0.5709505944546686, 0.9709505944546686]

In [8]:
# Partition the Difficulty == 'Medium' subset by Easy A.
d_difficulty_med_easya_yes = d_difficulty_med.loc[d_difficulty_med['Easy A'].eq('Yes')]
d_difficulty_med_easya_no = d_difficulty_med.loc[d_difficulty_med['Easy A'].eq('No')]

# Report, in Yes / No order, whether each subset is already a leaf.
is_done(d_difficulty_med_easya_yes)
is_done(d_difficulty_med_easya_no)

make leaf node with class value as output: -
make leaf node with class value as output: +


In [11]:
# Reuse the DataFrame loaded earlier under a shorter name. This is an alias,
# not a copy, and nothing is read from disk here.
df = id3_dataset

# Function to create ASCII representation of the decision tree
def ascii_tree(df, depth=0, prefix=""):
    """Recursively build an indented text rendering of the ID3 tree for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows at the current node. Must contain a 'Worth Taking' label column;
        every other column is treated as a candidate attribute.
    depth : int, default 0
        Recursion depth. It is passed down to child calls but does not affect
        the output; it is kept so existing callers remain valid.
    prefix : str, default ""
        Indentation prepended to every line emitted for this node. Each level
        of recursion appends three spaces.

    Returns
    -------
    str
        Newline-terminated lines: the chosen attribute name, a ``-- value --``
        line per branch, and ``Leaf: <class>`` lines at the leaves.
    """
    target_col = 'Worth Taking'
    labels = df[target_col]
    attributes = [col for col in df.columns if col != target_col]

    # Base case 1: every row has the same class, so the node is a pure leaf.
    if len(labels.unique()) == 1:
        return f"{prefix}Leaf: {labels.iloc[0]}\n"
    # Base case 2: the labels differ but no attribute can separate the rows,
    # either because every attribute column holds a single value or because
    # no attribute columns remain. The label column is excluded from the
    # check; including it would make this branch unreachable and the
    # recursion would end in max() over an empty dict.
    if all(df[col].nunique() == 1 for col in attributes):
        majority = labels.value_counts().idxmax()
        return f"{prefix}Leaf: {majority}\n"

    # Get the attribute with max information gain
    gains = {col: information_gain(df, col) for col in attributes}
    best_attr = max(gains, key=gains.get)

    result = f"{prefix}{best_attr}\n"

    # Split the dataset on each observed value and recurse; the chosen
    # attribute is dropped so it is not reused further down the branch.
    for value in df[best_attr].unique():
        sub_df = df[df[best_attr] == value].drop([best_attr], axis=1)
        result += f"{prefix}-- {value} --\n"
        result += ascii_tree(sub_df, depth + 1, prefix + "   ")

    return result

# Generate ASCII tree for the full dataset and print it. print() is used so the
# newlines embedded in the returned string are rendered as separate lines.
ascii_output = ascii_tree(df)
print(ascii_output)

Difficulty
-- Low --
   Personality
   -- Hilarious --
      Leaf: -
   -- Boring --
      Leaf: +
-- Medium --
   Easy A
   -- No --
      Leaf: +
   -- Yes --
      Leaf: -
-- High --
   Leaf: -

