# **STAT 587 &mdash; Data Science I** &mdash; Homework 2
**Winter 2026**

## **Imports & Setup**

In [1]:
import sys
from pathlib import Path

# Add project root to Python path.
# The root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Project-local path helpers. This import is expected to resolve through the
# PROJECT_ROOT entry appended to sys.path above.
from src.paths import ensure_dirs, DATA_DIR, FIG_DIR, TAB_DIR

# Ensure output directories exist
# NOTE(review): presumably the directories behind DATA_DIR / FIG_DIR / TAB_DIR;
# confirm in src/paths.py.
ensure_dirs()

In [9]:
import numpy as np
import pandas as pd
import random

import matplotlib.pyplot as plt

In [3]:
# Seed both global random number generators (NumPy's legacy global state and
# the standard-library `random` module) with the same value.
SEED = 42
for seed_rng in (np.random.seed, random.seed):
    seed_rng(SEED)

## **Question 1:** College Data (ISLP)


### **(a)** Linear Regression

### **(b)** Regression Tree

### **(c)** Pruning via Cross-Validation

### **(d)** Bagging

### **(e)** Random Forest

### **(f)** Comparison

## **Question 2:** Business School Admissions

In [4]:
# Read the admissions CSV from the project data directory
q2_df = pd.read_csv(filepath_or_buffer=DATA_DIR / "admission.csv")

In [None]:
# Initial EDA summary, collected into a small two-column table
# (optional cell - can be commented out)
de_levels = sorted(q2_df["De"].astype(str).unique().tolist())
group_levels = sorted(q2_df["Group"].unique().tolist())

# Number of rows per numeric group code, ordered by code
class_counts = q2_df["Group"].value_counts().sort_index()

# One entry per reported fact; insertion order is the row order of the table
eda_facts = {
    "n_rows": q2_df.shape[0],
    "n_cols": q2_df.shape[1],
    "columns": ", ".join(q2_df.columns.astype(str).tolist()),
    "De_levels": ", ".join(de_levels),
    "Group_levels": ", ".join(map(str, group_levels)),
    "Group_counts": "; ".join([f"{k}:{v}" for k, v in class_counts.items()]),
}

eda_summary = pd.DataFrame(
    {
        "item": list(eda_facts.keys()),
        "value": list(eda_facts.values()),
    }
)

eda_summary

Unnamed: 0,GPA,GMAT,De,Group
0,2.96,596,admit,1
1,3.14,473,admit,1
2,3.22,482,admit,1
3,3.29,527,admit,1
4,3.69,505,admit,1
...,...,...,...,...
80,3.05,399,border,3
81,2.85,483,border,3
82,3.01,453,border,3
83,3.03,414,border,3


In [6]:
# Sanity check: the text label in "De" must agree with the numeric code in "Group"
category_map = {"admit": 1, "notadmit": 2, "border": 3}

# Translate each label to its expected code, then compare row by row.
# A label missing from category_map maps to NaN and therefore fails the check.
expected_codes = q2_df["De"].map(category_map)
categories_match = (expected_codes == q2_df["Group"]).all()
assert categories_match, "Mismatch between De labels and Group codes."

In [7]:
# Split:
#   - last N_TEST_PER_GROUP observations in each category -> test
#   - rest of the observations                            -> train
N_TEST_PER_GROUP = 4

# tail() keeps the original row labels, so the resulting index can be used
# directly to carve the test rows out of q2_df.
q2_test_idx = q2_df.groupby("Group", sort=False).tail(N_TEST_PER_GROUP).index

q2_train_df = q2_df.drop(index=q2_test_idx).copy()
q2_test_df = q2_df.loc[q2_test_idx].copy()

# Features/labels
q2_X_train = q2_train_df[["GPA", "GMAT"]].to_numpy()
q2_y_train = q2_train_df["Group"].to_numpy()

q2_X_test = q2_test_df[["GPA", "GMAT"]].to_numpy()
q2_y_test = q2_test_df["Group"].to_numpy()

# Checks
# Expected test size is derived from the data instead of being hard-coded.
assert len(q2_test_df) == N_TEST_PER_GROUP * q2_df["Group"].nunique(), (
    "Unexpected number of test observations."
)
# Every group must contribute exactly N_TEST_PER_GROUP test rows; this fails
# if a group has fewer rows than that.
assert (q2_test_df["Group"].value_counts() == N_TEST_PER_GROUP).all(), (
    "Each group must contribute exactly N_TEST_PER_GROUP test observations."
)
# Train and test together must partition the full data set.
assert q2_train_df.shape[0] + q2_test_df.shape[0] == q2_df.shape[0]
assert set(q2_train_df.index).isdisjoint(q2_test_df.index), (
    "Train and test sets overlap."
)

### **(a)** Exploratory Analysis

In [40]:
# Color/label mapping for groups
group_labels = {1: "Admit", 2: "Not Admit", 3: "Border"}
group_colors = {1: "tab:blue", 2: "tab:orange", 3: "tab:green"}
# Order in which the groups are drawn (and therefore listed in the legend)
order = [2, 3, 1]

# Marker styling shared by every group
marker_style = {"alpha": 0.7, "edgecolor": "k", "s": 60}

fig_scatter, ax_scatter = plt.subplots(figsize=(6, 5))

for g in order:
    # Training rows that belong to group g
    in_group = q2_train_df["Group"] == g
    subset = q2_train_df.loc[in_group]
    ax_scatter.scatter(
        subset["GPA"],
        subset["GMAT"],
        label=group_labels[g],
        color=group_colors[g],
        **marker_style,
    )

# Axis labels, title, legend and grid
axis_label_style = {"fontsize": 12, "labelpad": 10}
ax_scatter.set_xlabel("Undergraduate GPA", **axis_label_style)
ax_scatter.set_ylabel("GMAT Score", **axis_label_style)
ax_scatter.set_title(
    "Training Data: GPA vs GMAT by Admission Group",
    fontsize=14,
    fontweight="bold",
)
ax_scatter.legend()
ax_scatter.grid(True)
fig_scatter.tight_layout()

# Save figure, then close it so it is not rendered inline as well
scatter_path = FIG_DIR / "q2_scatter_gpa_gmat.png"
fig_scatter.savefig(scatter_path, bbox_inches="tight")
plt.close(fig_scatter)

In [None]:
# Left-to-right order of the boxes and the display name of each group code
order = [2, 3, 1]
labels = {2: "Not Admit", 3: "Border", 1: "Admit"}

# Work on a copy so the plotting-only column never reaches q2_train_df
q2_train_df_plot = q2_train_df.copy()
ordered_names = [labels[g] for g in order]
q2_train_df_plot["GroupOrdered"] = pd.Categorical(
    q2_train_df_plot["Group"].map(labels),
    categories=ordered_names,
    ordered=True,
)

fig_box, axes = plt.subplots(1, 2, figsize=(10, 4))

# One panel per feature: (column, panel title, y-axis label)
box_panels = [
    ("GPA", "GPA by Admission Group", "GPA"),
    ("GMAT", "GMAT by Admission Group", "GMAT Score"),
]

for ax_box, (column, panel_title, y_label) in zip(axes, box_panels):
    # Boxplot: <column> by Group
    q2_train_df_plot.boxplot(column=column, by="GroupOrdered", ax=ax_box, grid=False)
    # Titles and labels are set after boxplot() so they replace its defaults
    ax_box.set_title(panel_title, fontsize=13)
    ax_box.set_xlabel("Group", fontsize=12, labelpad=10)
    ax_box.set_ylabel(y_label, fontsize=12, labelpad=10)

fig_box.suptitle(
    "Training Data: Marginal Distributions by Group",
    fontsize=16,
    fontweight="bold",
)
fig_box.tight_layout()

# Save figure, then close it so it is not rendered inline as well
boxplot_path = FIG_DIR / "q2_boxplots_gpa_gmat.png"
fig_box.savefig(boxplot_path, bbox_inches="tight")
plt.close(fig_box)

### **(b)** LDA


### **(c)** QDA

### **(d)** KNN

### **(e)** Comparison