# Decision Trees Homework — Complete Solution Notebook

This notebook provides a complete, reproducible solution for the **Decision Trees** homework:
- Hand calculations: entropy, information gain, Gini
- Conceptual answers (in markdown)
- sklearn coding: training a tree, decision boundary, tree visualization, metrics
- ROC probability “chunkiness” + smoothing ideas
- Feature importance caveats
- Regression tree mini-demo (IC50 style)

> **Note:** Your numeric results may differ slightly if you change random seeds or train/test split.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_curve, auc, precision_recall_curve, average_precision_score,
    mean_squared_error, r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree


## Problems 1, 4, 5, 6, 8, 9, 10, 11, 12 — Conceptual Answer Key

See the written answer key provided alongside this notebook (same content).

## Problem 2 — Entropy and Information Gain (By hand + verified by code)

In [None]:
# Toy table shared by Problems 2 and 3: one (Smoking, Mutation, Disease) tuple per row
data = [
    ("Yes", "Yes", 1), ("Yes", "Yes", 1),
    ("Yes", "No", 1), ("Yes", "No", 0),
    ("No", "Yes", 1), ("No", "Yes", 0),
    ("No", "No", 0), ("No", "No", 0), ("No", "No", 0),
    ("Yes", "Yes", 1),
]
df = pd.DataFrame.from_records(data, columns=["Smoking", "Mutation", "Disease"])
df


In [None]:
def entropy(y):
    """Return the Shannon entropy (in bits) of the label collection ``y``.

    Parameters
    ----------
    y : array-like
        Class labels; converted with ``np.asarray``.

    Returns
    -------
    float
        ``-sum(p_k * log2(p_k))`` over the observed classes. An empty input
        returns 0.0, and a single-class input returns (positive) 0.0.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    # np.unique only reports labels that actually occur, so every proportion
    # is strictly positive and log2 never sees zero.
    probs = counts / counts.sum()
    bits = -np.sum(probs * np.log2(probs))
    # For a pure node the negation produces -0.0; adding 0.0 normalizes the sign
    # so the value displays as 0.0 rather than -0.0.
    return float(bits + 0.0)

def info_gain(df, feature, target="Disease"):
    """Information gain from splitting ``df`` on ``feature``.

    Computed as the entropy of ``df[target]`` minus the size-weighted mean
    entropy of ``target`` within each ``df.groupby(feature)`` group.
    """
    parent_entropy = entropy(df[target])
    total = len(df)
    children_entropy = sum(
        ((len(group) / total) * entropy(group[target]) for _, group in df.groupby(feature)),
        0.0,
    )
    return parent_entropy - children_entropy

# Root entropy, then the information gain of each candidate root split
H_all = entropy(df["Disease"])
IG_S, IG_M = (info_gain(df, column) for column in ("Smoking", "Mutation"))

(H_all, IG_S, IG_M)


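**Expected outputs (rounded), for the 10 rows defined above (5 with Disease = 1, 5 with Disease = 0):**
- Entropy of full dataset: 1.000 bits
- IG(Smoking): ~0.278 bits
- IG(Mutation): ~0.278 bits  
Each feature splits the table into one branch with a 4:1 class ratio and one with 1:4 (entropy ~0.722 bits each), so **Smoking and Mutation tie** on information gain for this table; neither is strictly the better root split. If your handout's key reports different values (e.g. entropy ~0.971 bits or IG(Smoking) ~0.322 bits), check that the rows above match the handout's table.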

## Problem 3 — Gini Impurity Split (By hand + verified by code)

In [None]:
def gini(y):
    """Return the Gini impurity ``1 - sum(p_k**2)`` of the labels in ``y``.

    An empty input yields 0.0.
    """
    labels = np.array(y)
    total = len(labels)
    if not total:
        return 0.0
    _, class_counts = np.unique(labels, return_counts=True)
    proportions = class_counts / total
    return float(1 - np.square(proportions).sum())

def weighted_gini_after_split(df, feature, target="Disease"):
    """Size-weighted mean Gini impurity of ``target`` across the ``feature`` groups.

    Each ``df.groupby(feature)`` group contributes its ``gini`` value weighted
    by the fraction of rows it holds.
    """
    total = len(df)
    return sum(
        ((len(group) / total) * gini(group[target]) for _, group in df.groupby(feature)),
        0.0,
    )

# Impurity before any split, then the weighted impurity after each candidate split
G_all = gini(df["Disease"])
WG_S, WG_M = (weighted_gini_after_split(df, column) for column in ("Smoking", "Mutation"))

(G_all, WG_S, WG_M)


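**Expected outputs (rounded), for the 10 rows defined above:**
- Gini(full): 0.500
- Weighted Gini after split on Smoking: 0.320
- Weighted Gini after split on Mutation: 0.320  
Lower is better ⇒ both splits reduce impurity by the same amount, so Smoking and Mutation **tie** under Gini. Information gain computed on the same rows is also equal for the two features (~0.278 bits each), so **Gini agrees with entropy here**: both criteria rank the two features equally. If your handout's key reports different values (e.g. Gini(full) = 0.420), check that the rows above match the handout's table.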

## Problem 7 — Coding: Train & Visualize a Tree (sklearn)

In [None]:
# Synthetic two-feature, single-cluster-per-class classification problem
X, y = make_classification(
    n_samples=300, n_features=2,
    n_informative=2, n_redundant=0, n_repeated=0,
    n_clusters_per_class=1, class_sep=1.2,
    random_state=0,
)

# Stratified 70/30 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=0
)

# Depth-limited tree; fit() returns the estimator itself
tree = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)

yhat_train, yhat_test = tree.predict(X_train), tree.predict(X_test)

train_acc = accuracy_score(y_train, yhat_train)
test_acc = accuracy_score(y_test, yhat_test)
cm = confusion_matrix(y_test, yhat_test)

(train_acc, test_acc, cm)


In [None]:
# Decision boundary plot (2D)
def plot_decision_boundary(clf, X, y, title="Decision boundary", padding=1.0, resolution=400):
    """Shade a classifier's predicted regions over two features and overlay the points.

    Parameters
    ----------
    clf : object with a ``predict`` method
        Called once on a ``(resolution**2, 2)`` grid of points.
    X : array-like
        Converted with ``np.asarray``; columns 0 and 1 set the plot extent
        and the scatter positions.
    y : array-like
        Passed as the scatter colour values (``c=y``).
    title : str
        Figure title.
    padding : float, default 1.0
        Margin added beyond the data range on every side.
    resolution : int, default 400
        Grid points per axis; lower it for a faster, coarser picture.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    X = np.asarray(X)  # lets DataFrame inputs use the positional column indexing below

    x_min, x_max = X[:, 0].min() - padding, X[:, 0].max() + padding
    y_min, y_max = X[:, 1].min() - padding, X[:, 1].max() + padding

    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, resolution),
        np.linspace(y_min, y_max, resolution),
    )
    # Flatten the mesh into (n_points, 2) rows for predict, then restore the mesh shape.
    grid = np.c_[xx.ravel(), yy.ravel()]
    Z = clf.predict(grid).reshape(xx.shape)

    fig, ax = plt.subplots(figsize=(7, 5))
    ax.contourf(xx, yy, Z, alpha=0.25)
    ax.scatter(X[:, 0], X[:, 1], c=y, s=25)
    ax.set_title(title)
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")
    plt.show()

# Overlay the training points on the fitted tree's predicted regions
plot_decision_boundary(
    tree,
    X_train,
    y_train,
    title="Decision boundary (train), max_depth=3",
)


In [None]:
# Draw the fitted classifier's node structure
plt.figure(figsize=(14, 6))
plot_tree(
    tree,
    feature_names=["Feature1", "Feature2"],
    class_names=["0", "1"],
    filled=True,
    fontsize=9,
)
plt.show()


In [None]:
# Print train/test accuracy, the test confusion matrix, and the per-class report
for label, value in (
    ("Training accuracy:", train_acc),
    ("Test accuracy:", test_acc),
    ("\nConfusion matrix (test):\n", cm),
    ("\nClassification report (test):\n", classification_report(y_test, yhat_test)),
):
    print(label, value)


## Problem 8 — ROC curve and probability 'chunkiness'

In [None]:
# Single-tree ROC, scored with the second predict_proba column
proba_tree = tree.predict_proba(X_test)[:, 1]
fpr, tpr, thresh = roc_curve(y_test, proba_tree)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle="--")  # diagonal reference line
plt.title(f"ROC (Decision Tree) AUC={roc_auc:.3f}")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()

# Number of distinct scores (after rounding to 6 decimals) and the first few of them
unique_probs = np.unique(proba_tree.round(6))
(len(unique_probs), unique_probs[:10])


In [None]:
# Smoother ROC idea 1: a 300-tree random forest averages many trees → more probability levels
rf = RandomForestClassifier(n_estimators=300, random_state=0).fit(X_train, y_train)

proba_rf = rf.predict_proba(X_test)[:, 1]
fpr2, tpr2, _ = roc_curve(y_test, proba_rf)
roc_auc2 = auc(fpr2, tpr2)

# Same axes for both models so the curves can be compared directly
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=f"Tree AUC={roc_auc:.3f}")
plt.plot(fpr2, tpr2, label=f"RF AUC={roc_auc2:.3f}")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.title("ROC: Tree vs Random Forest")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.legend()
plt.show()

# Distinct forest scores after rounding to 6 decimals
len(np.unique(proba_rf.round(6)))


## Problem 11 — Regression Trees (IC50-style) mini demo

In [None]:
# Create a synthetic regression dataset and fit a depth-limited regression tree
Xr, yr = make_regression(n_samples=400, n_features=5, n_informative=3, noise=25.0, random_state=0)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.3, random_state=0)

rt = DecisionTreeRegressor(max_depth=4, random_state=0)
rt.fit(Xr_train, yr_train)

pred = rt.predict(Xr_test)

# Quick sanity metrics on the held-out 30%
# (mean_squared_error and r2_score come from the notebook's top import cell)
mse = mean_squared_error(yr_test, pred)
r2 = r2_score(yr_test, pred)

mse, r2


In [None]:
# Draw the fitted regression tree; features are labelled X0..X{n-1} by column index
plt.figure(figsize=(16, 6))
plot_tree(
    rt,
    feature_names=[f"X{i}" for i in range(Xr.shape[1])],
    filled=True,
    fontsize=8,
)
plt.show()
