In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import math
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
def entropy(labels):
    """Return the Shannon entropy, in bits, of a collection of class labels.

    Parameters
    ----------
    labels : sized iterable of hashable values
        The class label of every sample.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is the
        relative frequency of a label. An empty collection yields ``0.0``.
    """
    n_labels = len(labels)
    bits = 0.0
    # Accumulate one term per distinct class, in first-seen order.
    for class_count in Counter(labels).values():
        share = class_count / n_labels
        bits -= share * math.log2(share)
    return bits

In [3]:
def information_gain(X_col, y):
    """Return the information gain obtained by splitting ``y`` on ``X_col``.

    Parameters
    ----------
    X_col : sequence of hashable values
        The value of one attribute for every sample.
    y : sequence
        The class label of every sample, aligned with ``X_col``.

    Returns
    -------
    float
        ``entropy(y)`` minus the size-weighted entropy of the label subsets
        induced by each distinct attribute value. ``0.0`` for empty input.

    Raises
    ------
    ValueError
        If ``X_col`` and ``y`` have different lengths.
    """
    total = len(y)
    if len(X_col) != total:
        raise ValueError("X_col and y must have the same length")

    # Group the labels by attribute value in a single pass. A dict keeps the
    # values in first-seen order, so the floating-point accumulation below is
    # identical on every run (iterating a set of strings is hash-randomised).
    labels_by_value = {}
    for value, label in zip(X_col, y):
        labels_by_value.setdefault(value, []).append(label)

    weighted_entropy = 0.0
    for subset_y in labels_by_value.values():
        weighted_entropy += (len(subset_y) / total) * entropy(subset_y)
    return entropy(y) - weighted_entropy

In [4]:
def gain_ratio(X_col, y):
    """Return the gain ratio of splitting ``y`` on ``X_col``.

    The gain ratio is the information gain divided by the split information,
    i.e. the entropy of the attribute's own value distribution.

    Parameters
    ----------
    X_col : sequence of hashable values
        The value of one attribute for every sample.
    y : sequence
        The class label of every sample, aligned with ``X_col``.

    Returns
    -------
    float
        ``information_gain / split_info``, or ``0.0`` when the split
        information is zero (the attribute takes at most one distinct value,
        so the ratio is undefined).
    """
    ig = information_gain(X_col, y)
    # Split information is exactly the entropy of the attribute column, so the
    # shared helper replaces the repeated list()/count() scans per value.
    split_info = entropy(X_col)
    if split_info == 0:
        return 0.0
    return ig / split_info

In [5]:
def _majority_label(y):
    """Return the most frequent label in ``y`` (ties go to the first seen)."""
    return Counter(y).most_common(1)[0][0]


def _grow_tree(X, y, attributes, score):
    """Recursively build a decision tree, choosing splits with ``score``.

    Parameters
    ----------
    X : list of dict
        Training rows; each row maps an attribute name to its value.
    y : list
        Class label of each row, aligned with ``X``.
    attributes : list
        Attribute names still available for splitting.
    score : callable
        ``score(column_values, y) -> number``; the attribute with the highest
        score is used for the split (the first one wins ties).

    Returns
    -------
    A bare label for a leaf, otherwise ``{attribute: {value: subtree}}``.

    Raises
    ------
    ValueError
        If the training set is empty.
    """
    if len(y) == 0:
        raise ValueError("cannot build a decision tree from an empty training set")
    if len(set(y)) == 1:
        return y[0]  # pure class
    if not attributes:
        return _majority_label(y)  # nothing left to split on

    scores = [score([row[attr] for row in X], y) for attr in attributes]
    best_attr = attributes[int(np.argmax(scores))]
    remaining = [a for a in attributes if a != best_attr]

    # Partition rows and labels together in one pass. Keys are kept in
    # first-seen order so the tree layout does not depend on hash-randomised
    # set iteration. Every partition holds at least one row by construction,
    # so no empty-branch fallback is required.
    partitions = {}
    for row, label in zip(X, y):
        rows, labels = partitions.setdefault(row[best_attr], ([], []))
        rows.append(row)
        labels.append(label)

    return {
        best_attr: {
            value: _grow_tree(rows, labels, remaining, score)
            for value, (rows, labels) in partitions.items()
        }
    }


def id3(X, y, attributes):
    """Build a decision tree with ID3 (split on maximum information gain).

    Parameters
    ----------
    X : list of dict
        Training rows; each row maps an attribute name to its value.
    y : list
        Class label of each row, aligned with ``X``.
    attributes : list
        Attribute names that may be used for splitting.

    Returns
    -------
    A bare label when no split is made, otherwise a nested dict of the form
    ``{attribute: {value: subtree}}``.

    Raises
    ------
    ValueError
        If the training set is empty.
    """
    return _grow_tree(X, y, attributes, information_gain)

# =====================
# C4.5 Algorithm
# =====================

def c45(X, y, attributes):
    """Build a decision tree with C4.5-style splits (maximum gain ratio).

    Takes the same arguments and returns the same tree structure as ``id3``;
    only the split criterion differs.

    Raises
    ------
    ValueError
        If the training set is empty.
    """
    return _grow_tree(X, y, attributes, gain_ratio)


In [6]:


def _leaf_labels(subtree):
    """Yield every leaf label beneath ``subtree`` in depth-first order."""
    if isinstance(subtree, dict):
        for child in subtree.values():
            yield from _leaf_labels(child)
    else:
        yield subtree


def predict(tree, sample, default=None):
    """Predict the label for one sample (dict of features).

    Parameters
    ----------
    tree : label or dict
        A bare label (leaf) or a nested ``{attribute: {value: subtree}}`` dict.
    sample : dict
        Maps attribute names to the sample's values; a missing attribute is
        treated as the value ``None``.
    default : optional
        Label returned when the sample carries a value the tree has no branch
        for. When ``None`` (the default), the fallback is the most common
        label among the leaves beneath the current node. Each leaf counts
        once, not once per training sample, and ties go to the first leaf
        encountered.

    Returns
    -------
    The predicted label, or ``default`` when no prediction can be derived
    (empty node, or unseen value with no leaves to vote).
    """
    node = tree
    while isinstance(node, dict):
        if not node:
            return default  # malformed/empty node: nothing to test
        attr = next(iter(node))  # a node holds exactly one attribute key
        branches = node[attr]

        value = sample.get(attr)
        if value not in branches:
            # Unseen value: prefer the caller-supplied fallback when given.
            if default is not None:
                return default
            votes = Counter(_leaf_labels(branches))
            if not votes:
                return default
            return votes.most_common(1)[0][0]

        node = branches[value]
    return node  # leaf node


In [7]:
# Load the dataset from a CSV file given by a relative path and preview
# its first rows.
df = pd.read_csv('playCricket.csv')
display(df.head())

Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,PlayCricket
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes


In [8]:
# Every column except the target ("PlayCricket") and "Day" is used as a feature.
features = df.drop(columns=["PlayCricket", "Day"]).columns.tolist()
# One {feature: value} dict per row, in the original row order.
X = df[features].to_dict("records")
# Target labels, aligned with X.
y = df["PlayCricket"].tolist()

In [9]:
# Inspect the feature records (one dict per row) that the trees are trained on.
X

[{'Outlook': 'Sunny',
  'Temperature': 'Hot',
  'Humidity': 'High',
  'Wind': 'Weak'},
 {'Outlook': 'Sunny',
  'Temperature': 'Hot',
  'Humidity': 'High',
  'Wind': 'Strong'},
 {'Outlook': 'Overcast',
  'Temperature': 'Hot',
  'Humidity': 'High',
  'Wind': 'Weak'},
 {'Outlook': 'Rain',
  'Temperature': 'Mild',
  'Humidity': 'High',
  'Wind': 'Weak'},
 {'Outlook': 'Rain',
  'Temperature': 'Cool',
  'Humidity': 'Normal',
  'Wind': 'Weak'},
 {'Outlook': 'Rain',
  'Temperature': 'Cool',
  'Humidity': 'Normal',
  'Wind': 'Strong'},
 {'Outlook': 'Overcast',
  'Temperature': 'Cool',
  'Humidity': 'Normal',
  'Wind': 'Strong'},
 {'Outlook': 'Sunny',
  'Temperature': 'Mild',
  'Humidity': 'High',
  'Wind': 'Weak'},
 {'Outlook': 'Sunny',
  'Temperature': 'Cool',
  'Humidity': 'Normal',
  'Wind': 'Weak'},
 {'Outlook': 'Rain',
  'Temperature': 'Mild',
  'Humidity': 'Normal',
  'Wind': 'Weak'},
 {'Outlook': 'Sunny',
  'Temperature': 'Mild',
  'Humidity': 'Normal',
  'Wind': 'Strong'},
 {'Outlook': 

In [10]:
# Inspect the target labels taken from the "PlayCricket" column.
y

['No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No']

In [11]:
# =====================
# Cross Validation
# =====================

def evaluate(X, y, algorithm_func, n_splits=5, random_state=42):
    """Cross-validate a tree-building function with shuffled K-fold splits.

    Parameters
    ----------
    X : list of dict
        Feature rows; the keys of the first row define the attribute list.
    y : list
        Class label of each row, aligned with ``X``.
    algorithm_func : callable
        ``algorithm_func(X_train, y_train, attributes) -> tree``; the result
        is passed to ``predict``.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 42
        Seed for the fold shuffling, so repeated calls use the same folds.

    Returns
    -------
    dict
        Mean accuracy, macro F1, macro precision and macro recall across the
        folds, as plain floats rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``X`` is empty.
    """
    if len(X) == 0:
        raise ValueError("X must contain at least one sample")

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # The attribute list is the same for every fold, so compute it once.
    attributes = list(X[0].keys())
    macro = {"average": "macro", "zero_division": 0}
    metrics = {"accuracy": [], "f1": [], "precision": [], "recall": []}

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = [X[i] for i in train_idx], [X[i] for i in test_idx]
        y_train, y_test = [y[i] for i in train_idx], [y[i] for i in test_idx]

        tree = algorithm_func(X_train, y_train, attributes)
        y_pred = [predict(tree, sample) for sample in X_test]

        metrics["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["f1"].append(f1_score(y_test, y_pred, **macro))
        metrics["precision"].append(precision_score(y_test, y_pred, **macro))
        metrics["recall"].append(recall_score(y_test, y_pred, **macro))

    # float() keeps the report readable: np.float64 values would otherwise
    # print as "np.float64(0.9333)".
    return {k: round(float(np.mean(v)), 4) for k, v in metrics.items()}




In [12]:
# Fit ID3 on the full dataset and show the resulting nested-dict tree.
id3_tree = id3(X, y, features)
print("=== ID3 Tree ===", id3_tree, sep="\n")

=== ID3 Tree ===
{'Outlook': {'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}}, 'Overcast': 'Yes', 'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}


In [13]:
# Fit C4.5 (gain-ratio splits) on the full dataset and show the resulting tree.
c45_tree = c45(X,y, features)
# The banner previously said "ID3", mislabelling this cell's output.
print("=== C4.5 Tree ===")
print(c45_tree)

=== ID3 Tree ===
{'Outlook': {'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}}, 'Overcast': 'Yes', 'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}


In [14]:

# Cross-validated scores for both tree builders, ID3 first, then a rule line.
print(
    f"ID3 CV Results: {evaluate(X, y, id3)}",
    f"C4.5 CV Results: {evaluate(X, y, c45)}",
    "-------------------------",
    sep="\n",
)

ID3 CV Results: {'accuracy': np.float64(0.9333), 'f1': np.float64(0.9333), 'precision': np.float64(0.95), 'recall': np.float64(0.95)}
C4.5 CV Results: {'accuracy': np.float64(0.6), 'f1': np.float64(0.5467), 'precision': np.float64(0.5667), 'recall': np.float64(0.6)}
-------------------------


In [15]:
# Sanity check: score the ID3 tree on the same rows it was trained on.
y_pred = [predict(id3_tree, sample) for sample in X]
for predicted_label in y_pred:
    print(predicted_label)

# Each score is stored in a one-element list.
metrics = {
    "accuracy": [accuracy_score(y, y_pred)],
    "f1": [f1_score(y, y_pred, average="macro", zero_division=0)],
    "precision": [precision_score(y, y_pred, average="macro", zero_division=0)],
    "recall": [recall_score(y, y_pred, average="macro", zero_division=0)],
}

print(metrics)

No
No
Yes
Yes
Yes
No
Yes
No
Yes
Yes
Yes
Yes
Yes
No
{'accuracy': [1.0], 'f1': [1.0], 'precision': [1.0], 'recall': [1.0]}


In [16]:
# Sanity check: score the C4.5 tree on the same rows it was trained on.
y_pred = [predict(c45_tree, sample) for sample in X]
for predicted_label in y_pred:
    print(predicted_label)

# Each score is stored in a one-element list.
metrics = {
    "accuracy": [accuracy_score(y, y_pred)],
    "f1": [f1_score(y, y_pred, average="macro", zero_division=0)],
    "precision": [precision_score(y, y_pred, average="macro", zero_division=0)],
    "recall": [recall_score(y, y_pred, average="macro", zero_division=0)],
}

print(metrics)

No
No
Yes
Yes
Yes
No
Yes
No
Yes
Yes
Yes
Yes
Yes
No


{'accuracy': [1.0], 'f1': [1.0], 'precision': [1.0], 'recall': [1.0]}
