# **STAT 587 &mdash; Data Science I** &mdash; Homework 2
**Winter 2026**

## **Imports & Setup**

In [1]:
import sys
from pathlib import Path

# Add project root to Python path so the project-local `src` package can be imported.
# Assumes the kernel's working directory sits directly below the project root.
PROJECT_ROOT = Path.cwd().parent
# Guard the append: re-running this cell must not stack duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Project-local path constants and directory helper; DATA_DIR is used below to
# read the input CSV, FIG_DIR to save figures, and TAB_DIR to write LaTeX tables.
from src.paths import ensure_dirs, DATA_DIR, FIG_DIR, TAB_DIR

# Ensure output directories exist
# NOTE(review): presumably creates the figure/table directories if missing — confirm in src/paths.py
ensure_dirs()

In [43]:
import numpy as np
import pandas as pd
import random

import matplotlib.pyplot as plt

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix

In [3]:
# Reproducibility: one shared seed for the stdlib RNG and NumPy's legacy global RNG
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

## Helper Functions

In [63]:
# DataFrames to LaTeX tables

def df_to_tabular_tex(df, *, float_fmt="%.4f", index=True):
    """Render ``df`` as a LaTeX tabular (booktabs-style) via ``df.to_latex``.

    Parameters
    ----------
    df
        Object exposing ``to_latex`` (a pandas DataFrame in this notebook).
    float_fmt
        printf-style format applied to each float, e.g. ``"%.4f"``. A falsy
        value (``None`` or ``""``) passes ``float_format=None`` so that
        ``to_latex`` uses its own default formatting.
    index
        Whether the index is written as the leading column.

    Returns
    -------
    Whatever ``df.to_latex`` returns (the LaTeX source).

    Notes
    -----
    ``escape=False`` is passed, so cell and header text reaches LaTeX
    unescaped; special characters in labels must already be valid LaTeX.
    """
    formatter = None
    if float_fmt:

        def formatter(value):
            return float_fmt % value

    latex_options = {
        "index": index,
        "escape": False,
        "float_format": formatter,
        "bold_rows": False,
        "longtable": False,
    }
    return df.to_latex(**latex_options)


def wrap_table(tabular_tex, *, caption, label):
    """Embed a tabular snippet in a complete LaTeX ``table`` float.

    The tabular is stripped of surrounding whitespace and centered; a small
    negative vertical space, the caption, and the label follow it. The
    returned string ends with a single newline.

    Parameters
    ----------
    tabular_tex
        LaTeX source of the tabular environment.
    caption
        Text placed inside ``\\caption{...}``.
    label
        Key placed inside ``\\label{...}``.

    Returns
    -------
    str
        The full ``table`` environment. It uses the ``[H]`` placement
        specifier, which the including document has to support.
    """
    return (
        "\\begin{table}[H]\n"
        "\\begin{center}\n"
        f"{tabular_tex.strip()}\n"
        "\\end{center}\n"
        "\\vspace{-5pt}\n"
        f"\\caption{{{caption}}}\n"
        f"\\label{{{label}}}\n"
        "\\end{table}\n"
    )

## **Question 1:** College Data (ISLP)


### **(a)** Linear Regression

### **(b)** Regression Tree

### **(c)** Pruning via Cross-Validation

### **(d)** Bagging

### **(e)** Random Forest

### **(f)** Comparison

## **Question 2:** Business School Admissions

In [4]:
# Read the admissions dataset from the project data directory
q2_data_path = DATA_DIR / "admission.csv"
q2_df = pd.read_csv(q2_data_path)

In [8]:
# Initial EDA summary: dataset shape, column names, and the levels / counts of
# the two class-label columns (displayed as a small two-column table)
de_levels = sorted(q2_df["De"].astype(str).unique().tolist())
group_levels = sorted(q2_df["Group"].unique().tolist())
class_counts = q2_df["Group"].value_counts().sort_index()

n_obs, n_vars = q2_df.shape

# One entry per summary row: item name -> displayed value
eda_items = {
    "n_rows": n_obs,
    "n_cols": n_vars,
    "columns": ", ".join(q2_df.columns.astype(str).tolist()),
    "De_levels": ", ".join(de_levels),
    "Group_levels": ", ".join(str(level) for level in group_levels),
    "Group_counts": "; ".join(f"{k}:{v}" for k, v in class_counts.items()),
}

eda_summary = pd.DataFrame(
    {"item": list(eda_items.keys()), "value": list(eda_items.values())}
)

eda_summary

Unnamed: 0,GPA,GMAT,De,Group
0,2.96,596,admit,1
1,3.14,473,admit,1
2,3.22,482,admit,1
3,3.29,527,admit,1
4,3.69,505,admit,1
...,...,...,...,...
80,3.05,399,border,3
81,2.85,483,border,3
82,3.01,453,border,3
83,3.03,414,border,3


In [6]:
# Sanity check: the text label in "De" must agree with the numeric code in "Group"
category_map = {"admit": 1, "notadmit": 2, "border": 3}

# A label missing from category_map maps to NaN and therefore fails the comparison
expected_group = q2_df["De"].map(category_map)
categories_match = expected_group.eq(q2_df["Group"]).all()
assert categories_match, "Mismatch between De labels and Group codes."

In [7]:
# Train/test split:
##   test  -> last 4 observations of each Group (in file order)
##   train -> all remaining observations
q2_feature_cols = ["GPA", "GMAT"]
q2_target_col = "Group"

q2_test_idx = q2_df.groupby(q2_target_col, sort=False).tail(4).index

q2_test_df = q2_df.loc[q2_test_idx].copy()
q2_train_df = q2_df.drop(index=q2_test_idx).copy()

# Features/labels as NumPy arrays
q2_X_train = q2_train_df[q2_feature_cols].to_numpy()
q2_X_test = q2_test_df[q2_feature_cols].to_numpy()

q2_y_train = q2_train_df[q2_target_col].to_numpy()
q2_y_test = q2_test_df[q2_target_col].to_numpy()

# Checks: 3 groups x 4 held-out rows, and the two parts cover the full data set
assert len(q2_test_df) == 12
assert len(q2_train_df) + len(q2_test_df) == len(q2_df)

In [52]:
# Helper Function for plotting
def plot_q2_scatter(df, ax, order, labels, colors, title=None, alpha=0.7):
    """Draw GMAT against GPA on ``ax``, one scatter series per admission group.

    Parameters
    ----------
    df
        DataFrame with ``"GPA"``, ``"GMAT"`` and ``"Group"`` columns.
    ax
        Matplotlib axes to draw on.
    order
        Iterable of group codes; series are drawn (and listed in the legend)
        in this order.
    labels, colors
        Mappings from group code to legend label / marker color.
    title
        Optional axes title; skipped when falsy.
    alpha
        Marker opacity.

    Returns
    -------
    None
        The axes are modified in place (labels, legend and grid are set).
    """
    # Marker styling shared by every group
    marker_style = {"alpha": alpha, "edgecolor": "k", "s": 60}

    for group_code in order:
        members = df.loc[df["Group"] == group_code]
        ax.scatter(
            members["GPA"],
            members["GMAT"],
            label=labels[group_code],
            color=colors[group_code],
            **marker_style,
        )

    ax.set_xlabel("Undergraduate GPA", fontsize=12, labelpad=10)
    ax.set_ylabel("GMAT Score", fontsize=12, labelpad=10)
    if title:
        ax.set_title(title, fontsize=14, fontweight="bold")
    ax.legend()
    ax.grid(True)

### **2(a)** Exploratory Analysis

In [56]:
# Display metadata per Group code: (legend label, Matplotlib color)
q2_group_style = {
    1: ("Admit", "tab:blue"),
    2: ("Not Admit", "tab:orange"),
    3: ("Border", "tab:green"),
}
q2_labels = {code: label for code, (label, _) in q2_group_style.items()}
q2_colors = {code: color for code, (_, color) in q2_group_style.items()}

# Order in which groups are plotted and tabulated: Not Admit, Border, Admit
q2_order = [2, 3, 1]

In [57]:
# Training-set scatter of GPA against GMAT, one color per admission group
fig_scatter, ax_scatter = plt.subplots(figsize=(6, 5))

plot_q2_scatter(
    q2_train_df,
    ax_scatter,
    q2_order,
    q2_labels,
    q2_colors,
    title="Training Data: GPA vs GMAT by Admission Group",
    alpha=0.7,
)
fig_scatter.tight_layout()

# Write the figure to the figures directory, then close it
scatter_png = FIG_DIR / "q2_scatter_gpa_gmat.png"
fig_scatter.savefig(scatter_png, bbox_inches="tight")
plt.close(fig_scatter)

In [54]:
# Copy of the training data with an ordered categorical group label, so the
# boxes appear in q2_order (Not Admit, Border, Admit) rather than alphabetically
q2_train_df_plot = q2_train_df.copy()
group_names_ordered = [q2_labels[g] for g in q2_order]
q2_train_df_plot["GroupOrdered"] = pd.Categorical(
    q2_train_df_plot["Group"].map(q2_labels),
    categories=group_names_ordered,
    ordered=True,
)

fig_box, axes = plt.subplots(1, 2, figsize=(10, 4))

# One panel per predictor: (column to plot, y-axis label)
box_panels = [("GPA", "GPA"), ("GMAT", "GMAT Score")]
for box_ax, (box_col, box_ylabel) in zip(axes, box_panels):
    q2_train_df_plot.boxplot(column=box_col, by="GroupOrdered", ax=box_ax, grid=False)
    box_ax.set_title(f"{box_col} by Admission Group", fontsize=13)
    box_ax.set_xlabel("Group", fontsize=12, labelpad=10)
    box_ax.set_ylabel(box_ylabel, fontsize=12, labelpad=10)

# Set after the boxplot calls so this is the figure-level title that ends up in the file
fig_box.suptitle(
    "Training Data: Marginal Distributions by Group", fontsize=16, fontweight="bold"
)
fig_box.tight_layout()

# Save figure
boxplot_png = FIG_DIR / "q2_boxplots_gpa_gmat.png"
fig_box.savefig(boxplot_png, bbox_inches="tight")
plt.close(fig_box)

### **2(b)** LDA

In [44]:
# Fit LDA on the training features (GPA, GMAT); fit() returns the fitted estimator
q2_lda = LinearDiscriminantAnalysis().fit(q2_X_train, q2_y_train)

# Predicted class codes for both splits
q2_lda_train_pred = q2_lda.predict(q2_X_train)
q2_lda_test_pred = q2_lda.predict(q2_X_test)

# Sanity checks: one prediction per observation
for predicted, observed in (
    (q2_lda_train_pred, q2_y_train),
    (q2_lda_test_pred, q2_y_test),
):
    assert len(predicted) == len(observed)

In [None]:
# Build a mesh over predictor space (training range padded slightly on each side)
gpa_min, gpa_max = q2_train_df["GPA"].min() - 0.05, q2_train_df["GPA"].max() + 0.05
gmat_min, gmat_max = q2_train_df["GMAT"].min() - 10, q2_train_df["GMAT"].max() + 10

xx, yy = np.meshgrid(
    np.linspace(gpa_min, gpa_max, 300), np.linspace(gmat_min, gmat_max, 300)
)

# Column order must match the training features: GPA first, then GMAT
grid = np.c_[xx.ravel(), yy.ravel()]
Z = q2_lda.predict(grid).reshape(xx.shape)

# Plot decision regions + training points
fig_lda, ax_lda = plt.subplots(figsize=(6.5, 5.5))

# The class codes in Z are nominal labels, not heights. Contouring Z directly
# interpolates between codes: wherever classes 1 and 3 meet, both the 1.5 and
# 2.5 levels are crossed, which draws a doubled boundary and a spurious sliver
# of "class 2". Contouring a separate 0/1 indicator per class avoids this and
# lets each region be shaded with the same color as its scatter points.
for group_code in q2_order:
    indicator = (Z == group_code).astype(float)
    if not indicator.any():
        # Class never predicted inside the plotted window: nothing to draw
        continue

    # Decision region for this class
    ax_lda.contourf(
        xx,
        yy,
        indicator,
        levels=[0.5, 1.5],
        colors=[q2_colors[group_code]],
        alpha=0.18,
    )

    # Decision boundary of this class (absent if the class fills the whole window)
    if not indicator.all():
        ax_lda.contour(xx, yy, indicator, levels=[0.5], colors="k", linewidths=1)

plot_q2_scatter(
    df=q2_train_df,
    ax=ax_lda,
    order=q2_order,
    labels=q2_labels,
    colors=q2_colors,
    title="LDA Decision Regions (Training Data)",
    alpha=0.8,
)

fig_lda.tight_layout()

# Save plot artifact
fig_lda.savefig(FIG_DIR / "q2_lda_boundary.png", bbox_inches="tight")
plt.close(fig_lda)

In [59]:
# Confusion matrices (rows = true class, columns = predicted class),
# with classes ordered as in q2_order: Not Admit (2), Border (3), Admit (1)
q2_lda_cm_train = confusion_matrix(q2_y_train, q2_lda_train_pred, labels=q2_order)
q2_lda_cm_test = confusion_matrix(q2_y_test, q2_lda_test_pred, labels=q2_order)

# Overall misclassification rate = 1 - fraction of correct predictions
q2_lda_train_acc = np.mean(q2_lda_train_pred == q2_y_train)
q2_lda_test_acc = np.mean(q2_lda_test_pred == q2_y_test)
q2_lda_train_err = 1.0 - q2_lda_train_acc
q2_lda_test_err = 1.0 - q2_lda_test_acc

# Sanity checks: 3x3 matrices and error rates that are valid proportions
for cm in (q2_lda_cm_train, q2_lda_cm_test):
    assert cm.shape == (3, 3)
for err in (q2_lda_train_err, q2_lda_test_err):
    assert 0.0 <= err <= 1.0

In [64]:
# Names in order: Not Admit, Border, Admit
q2_group_names = [q2_labels[g] for g in q2_order]

# Confusion matrices as labeled DataFrames (index = true class, columns = predicted class)
q2_lda_cm_train_df = pd.DataFrame(
    q2_lda_cm_train, index=q2_group_names, columns=q2_group_names
)
q2_lda_cm_test_df = pd.DataFrame(
    q2_lda_cm_test, index=q2_group_names, columns=q2_group_names
)

# Error-rate summary
q2_lda_metrics_df = pd.DataFrame(
    {
        "Dataset": ["Training", "Test"],
        "Misclassification rate": [q2_lda_train_err, q2_lda_test_err],
    }
)

# Build tabular pieces (the count tables need no float formatting)
train_tabular = df_to_tabular_tex(q2_lda_cm_train_df, index=True, float_fmt=None)
test_tabular = df_to_tabular_tex(q2_lda_cm_test_df, index=True, float_fmt=None)
err_tabular = df_to_tabular_tex(q2_lda_metrics_df, index=False, float_fmt="%.4f")

# Wrap as complete tables
train_table = wrap_table(
    train_tabular,
    caption="LDA confusion matrix (training data). Rows are true classes and columns are predicted classes.",
    label="tab:q2_lda_cm_train",
)
test_table = wrap_table(
    test_tabular,
    caption="LDA confusion matrix (test data). Rows are true classes and columns are predicted classes.",
    label="tab:q2_lda_cm_test",
)
err_table = wrap_table(
    err_tabular,
    caption="LDA overall misclassification rates.",
    label="tab:q2_lda_error_rates",
)

# Combine into ONE file to input in Overleaf
q2_lda_all_tables_tex = "\n".join([train_table, test_table, err_table])

# Write output; the trailing semicolon keeps write_text's return value
# (the number of characters written) out of the cell output
(TAB_DIR / "q2_lda_tables.tex").write_text(q2_lda_all_tables_tex, encoding="utf-8");

1029


### **2(c)** QDA

### **2(d)** KNN

### **2(e)** Comparison