In [None]:
# ============================================================
# 05h_ipeds_synthetic_tstr_trts.ipynb
# TSTR / TRTS Evaluation for IPEDS Completers (Regression)
# Models: CTGAN, TVAE, LLM synthetic data
# Base model: HistGradientBoostingRegressor
# ============================================================

import os
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import spearmanr

# ------------------------------------------------------------
# Paths / project dirs
# ------------------------------------------------------------
# Mount Google Drive when running inside Colab. Outside Colab the
# `google.colab` package does not exist, so the mount is skipped instead of
# aborting the whole notebook with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount("/content/drive")

# Project root. Defaults to the Colab Drive location; set the IPEDS_PROJ_DIR
# environment variable to point the notebook at a local checkout instead.
PROJ = os.environ.get("IPEDS_PROJ_DIR", "/content/drive/MyDrive/dissertation")

DATA_DIR   = f"{PROJ}/data"
SYNTH_DIR  = f"{PROJ}/outputs/ipeds_5g_synth"   # where CTGAN/TVAE/LLM CSVs live
OUT_DIR    = f"{PROJ}/outputs/ipeds_5h_tstr_trts"
os.makedirs(OUT_DIR, exist_ok=True)

# Real train / validation splits read from DATA_DIR.
train_path = f"{DATA_DIR}/ipeds_train.csv"
val_path   = f"{DATA_DIR}/ipeds_val.csv"

ipeds_train = pd.read_csv(train_path)
ipeds_val   = pd.read_csv(val_path)

print("Real train shape:", ipeds_train.shape)
print("Real val shape:", ipeds_val.shape)
print("Columns:", ipeds_train.columns.tolist())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Real train shape: (6602, 21)
Real val shape: (2201, 21)
Columns: ['longitude', 'latitude', 'student_faculty_ratio', 'headcount', 'cbsa', 'enrolled_undergrad_fulltime', 'enrolled_undergrad_parttime', 'enrolled_graduate_fulltime', 'enrolled_graduate_parttime', 'region', 'sector', 'completers', 'unitid', 'year', 'cips', 'state_abbr', 'inst_control', 'urban_centric_locale', 'inst_size', 'cbsa_type', 'inst_affiliation']


In [None]:
# ------------------------------------------------------------
# Column definitions (aligned with 05f / 05g)
# ------------------------------------------------------------
# Regression target (raw counts; transformed with log1p before modelling).
target_col = "completers"

# Columns one-hot encoded by the model pipeline.
categorical_model_cols = ["region", "sector"]

# Full model input, in the column order used throughout the notebook.
feature_cols = [
    "longitude",
    "latitude",
    "student_faculty_ratio",
    "headcount",
    "cbsa",
    "enrolled_undergrad_fulltime",
    "enrolled_undergrad_parttime",
    "enrolled_graduate_fulltime",
    "enrolled_graduate_parttime",
    "region",
    "sector",
]

# Every feature that is not categorical is treated as numeric; deriving the
# list keeps it in step with feature_cols (same members, same order).
numeric_cols = [
    name for name in feature_cols if name not in categorical_model_cols
]

print("Feature cols:", feature_cols)
print("Numeric cols:", numeric_cols)
print("Categorical model cols:", categorical_model_cols)

# ------------------------------------------------------------
# Helper: evaluation metrics
# ------------------------------------------------------------
def eval_regression(view_name, y_true, y_pred):
    """Score one set of predictions and return the metrics as a flat record.

    Parameters
    ----------
    view_name : label stored under the "view" key of the result.
    y_true, y_pred : array-likes of observed and predicted targets; they are
        passed straight to the scikit-learn / scipy metric functions.

    Returns
    -------
    dict with keys "view", "rmse", "mae", "r2", "spearman".
    """
    # RMSE is taken as the square root of the MSE rather than through the
    # `squared` keyword of mean_squared_error.
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    rank_corr, _p_value = spearmanr(y_true, y_pred)

    record = {"view": view_name}
    record["rmse"] = root_mse
    record["mae"] = mean_absolute_error(y_true, y_pred)
    record["r2"] = r2_score(y_true, y_pred)
    record["spearman"] = rank_corr
    return record


Feature cols: ['longitude', 'latitude', 'student_faculty_ratio', 'headcount', 'cbsa', 'enrolled_undergrad_fulltime', 'enrolled_undergrad_parttime', 'enrolled_graduate_fulltime', 'enrolled_graduate_parttime', 'region', 'sector']
Numeric cols: ['longitude', 'latitude', 'student_faculty_ratio', 'headcount', 'cbsa', 'enrolled_undergrad_fulltime', 'enrolled_undergrad_parttime', 'enrolled_graduate_fulltime', 'enrolled_graduate_parttime']
Categorical model cols: ['region', 'sector']


In [None]:
# ------------------------------------------------------------
# Preprocessor + model builder
# ------------------------------------------------------------
def make_model():
    """Build a fresh, unfitted preprocessing + regression pipeline.

    The pipeline has two steps:

    * ``"preprocess"`` - a ColumnTransformer that passes ``numeric_cols``
      through unchanged and one-hot encodes ``categorical_model_cols``
      (categories unseen at fit time are ignored at predict time).
    * ``"regressor"`` - a HistGradientBoostingRegressor seeded with
      ``random_state=42``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new pipeline instance on every call, so views never share fitted state.
    """
    preproc = ColumnTransformer(
        transformers=[
            ("num", "passthrough", numeric_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_model_cols),
        ],
        # HistGradientBoostingRegressor only accepts dense input. With the
        # default threshold (0.3) the transformer switches to a sparse matrix
        # as soon as the one-hot block makes the stacked output sparse enough,
        # which would make fit/predict fail. 0 forces a dense result always.
        sparse_threshold=0.0,
    )

    model = HistGradientBoostingRegressor(random_state=42)
    pipe = Pipeline(
        steps=[
            ("preprocess", preproc),
            ("regressor", model),
        ]
    )
    return pipe

# ------------------------------------------------------------
# Prepare real X / y (log1p target)
# ------------------------------------------------------------
# Drop rows with missing target just in case
ipeds_train_clean = ipeds_train.dropna(subset=[target_col]).copy()
ipeds_val_clean = ipeds_val.dropna(subset=[target_col]).copy()

# Feature matrices are separate copies, so the type coercion below does not
# write back into the cleaned frames.
X_train_real = ipeds_train_clean[feature_cols].copy()
X_val_real = ipeds_val_clean[feature_cols].copy()

# Targets are modelled on the log1p scale.
y_train_real = np.log1p(ipeds_train_clean[target_col].values)
y_val_real = np.log1p(ipeds_val_clean[target_col].values)

print("Clean real train:", X_train_real.shape, "val:", X_val_real.shape)

# ------------------------------------------------------------
# Type sanity pass on the real features
# ------------------------------------------------------------
# Anything in a numeric column that cannot be parsed becomes NaN.
for real_features in (X_train_real, X_val_real):
    for col in numeric_cols:
        real_features[col] = pd.to_numeric(real_features[col], errors="coerce")

# Keep only rows in which every numeric feature is present.
mask_train = X_train_real[numeric_cols].notna().all(axis=1)
mask_val = X_val_real[numeric_cols].notna().all(axis=1)

# The same boolean mask filters the frame and its target array, keeping the
# two row-aligned.
X_train_real, y_train_real = X_train_real[mask_train], y_train_real[mask_train]
X_val_real, y_val_real = X_val_real[mask_val], y_val_real[mask_val]

print("After feature NaN drop: real train:", X_train_real.shape, "val:", X_val_real.shape)

Clean real train: (6602, 11) val: (2201, 11)
After feature NaN drop: real train: (6602, 11) val: (2201, 11)


In [None]:
# NOTE: this re-declares the eval_regression helper already defined earlier in
# the notebook; the metrics and result keys are the same, and this later
# definition is the one in effect for every cell that follows.
def eval_regression(view_name, y_true, y_pred):
    """Return RMSE, MAE, R^2 and Spearman correlation for one evaluation view.

    The result is a dict keyed "view", "rmse", "mae", "r2", "spearman", with
    `view_name` stored under "view".
    """
    # RMSE is computed by hand from the MSE; the `squared=False` shortcut is
    # not used because the installed scikit-learn does not accept it.
    squared_error = mean_squared_error(y_true, y_pred)
    abs_error = mean_absolute_error(y_true, y_pred)
    explained = r2_score(y_true, y_pred)
    rank_corr, _ = spearmanr(y_true, y_pred)

    return dict(
        view=view_name,
        rmse=np.sqrt(squared_error),
        mae=abs_error,
        r2=explained,
        spearman=rank_corr,
    )


In [None]:
# ------------------------------------------------------------
# Baseline: Real → Real
# ------------------------------------------------------------
# Accumulator for one metrics record per evaluation view.
results = []

# Reference point: fit on the real training split, score on the real
# validation split.
baseline_model = make_model()
baseline_model.fit(X_train_real, y_train_real)
y_val_pred_base = baseline_model.predict(X_val_real)

baseline_metrics = eval_regression(
    "Real→Real (baseline)", y_val_real, y_val_pred_base
)
results.append(baseline_metrics)

print("Baseline done.")

Baseline done.


In [None]:
# ============================================================
# PATCH: make synthetic region/sector compatible with real data
# ============================================================

# 1. Store the real train/val categorical columns (region, sector) as strings
for real_frame in (ipeds_train_clean, ipeds_val_clean):
    for cat_col in categorical_model_cols:
        real_frame[cat_col] = real_frame[cat_col].astype(str)

# 2. Re-define helper to load & align synthetic datasets
def _sample_real_column(real_df, col, n_rows, random_state):
    """Draw `n_rows` non-null values of `real_df[col]` with replacement.

    Returns a plain NumPy array so the caller assigns it by position; a Series
    would be aligned on index labels and silently misplace or drop values when
    the destination frame's index is not 0..n_rows-1.
    """
    return (
        real_df[col]
        .dropna()
        .sample(n=n_rows, replace=True, random_state=random_state)
        .to_numpy()
    )


def load_and_align_synth(path, name, ipeds_train_for_fill, random_state=42):
    """Load a synthetic CSV and align it with the real modelling schema.

    Processing steps, in order:

    1. Any column of ``feature_cols + [target_col]`` missing from the file is
       filled by sampling (with replacement) from the real training data.
    2. The frame is reduced to ``feature_cols + [target_col]`` in that order.
    3. Numeric features and the target are coerced to numeric; rows whose
       target is NaN afterwards are dropped.
    4. Remaining NaNs in numeric features are replaced by the real median.
    5. ``categorical_model_cols`` are overwritten with values sampled from the
       real training data (as strings) so the one-hot encoder sees the same
       category domain for real and synthetic frames.

    Parameters
    ----------
    path : str
        Location of the synthetic CSV.
    name : str
        Label used in the progress / warning messages.
    ipeds_train_for_fill : pandas.DataFrame
        Real training frame used as the donor for sampled values and medians.
    random_state : int, default 42
        Seed passed to every sampling call.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_cols + [target_col]``; rows with an unusable target
        removed (the surviving rows keep their original index labels).
    """
    df = pd.read_csv(path)
    print(f"\nLoaded {name} synthetic:", df.shape)

    # Ensure all expected columns exist; if missing, sample from real train
    for col in feature_cols + [target_col]:
        if col not in df.columns:
            print(f"  [WARN] {name}: missing column '{col}', sampling from real data.")
            df[col] = _sample_real_column(
                ipeds_train_for_fill, col, len(df), random_state
            )

    # Keep only the columns we care about, in the right order
    df = df[feature_cols + [target_col]].copy()

    # Coerce numeric columns + target to numeric
    for c in numeric_cols + [target_col]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # Drop rows with NaN target
    before = len(df)
    df = df.dropna(subset=[target_col])
    after = len(df)
    if after < before:
        print(f"  Dropped {before - after} rows with NaN {target_col} in {name}.")

    # Fill remaining NaNs in numeric features with real medians. The real
    # column is coerced first so a stray non-numeric entry cannot break median().
    for c in numeric_cols:
        med = pd.to_numeric(ipeds_train_for_fill[c], errors="coerce").median()
        df[c] = df[c].fillna(med)

    # Overwrite region & sector with *real* categories so OHE sees same domain.
    # After the dropna above the index may have gaps, so the sampled values
    # must be assigned positionally (array), not label-aligned (Series);
    # otherwise rows get the wrong category or NaN.
    for c in categorical_model_cols:  # ['region', 'sector']
        df[c] = _sample_real_column(
            ipeds_train_for_fill, c, len(df), random_state
        ).astype(str)

    return df


In [None]:
# ------------------------------------------------------------
# Load synthetic datasets
# ------------------------------------------------------------
# File locations of the three synthetic datasets inside SYNTH_DIR.
ctgan_path, tvae_path, llm_path = (
    f"{SYNTH_DIR}/synthetic_{tag}.csv" for tag in ("ctgan", "tvae", "llm")
)

# Load and align each dataset against the cleaned real training frame,
# in the order CTGAN, TVAE, LLM.
synth_ctgan, synth_tvae, synth_llm = (
    load_and_align_synth(csv_path, label, ipeds_train_clean)
    for csv_path, label in (
        (ctgan_path, "CTGAN"),
        (tvae_path, "TVAE"),
        (llm_path, "LLM"),
    )
)
# ------------------------------------------------------------
# Helper: run a single TSTR/TRTS view with safe target cleaning
# ------------------------------------------------------------
def run_view(view_name, train_df, test_df):
    """Fit a fresh model on `train_df`, score it on `test_df`, return metrics.

    Both frames are reduced to ``feature_cols + [target_col]`` on a copy (the
    inputs are not modified). Rows whose target is non-numeric, NaN or
    negative are discarded before the target is transformed with log1p.

    Parameters
    ----------
    view_name : label forwarded to eval_regression.
    train_df : frame the pipeline from make_model() is fitted on.
    test_df : frame the fitted pipeline is evaluated on.

    Returns
    -------
    The metrics record produced by eval_regression for this view.
    """
    wanted = feature_cols + [target_col]

    def _usable_rows(frame):
        # Copy first so the caller's frame is never mutated.
        subset = frame[wanted].copy()
        subset[target_col] = pd.to_numeric(subset[target_col], errors="coerce")
        subset = subset.dropna(subset=[target_col])
        # Negative targets are removed before log1p is applied.
        return subset[subset[target_col] >= 0]

    fit_rows = _usable_rows(train_df)
    score_rows = _usable_rows(test_df)

    y_fit = np.log1p(fit_rows[target_col].astype(float).to_numpy())
    y_score = np.log1p(score_rows[target_col].astype(float).to_numpy())

    pipeline = make_model()
    pipeline.fit(fit_rows[feature_cols], y_fit)
    predictions = pipeline.predict(score_rows[feature_cols])

    return eval_regression(view_name, y_score, predictions)


Loaded CTGAN synthetic: (6602, 12)

Loaded TVAE synthetic: (6602, 12)

Loaded LLM synthetic: (2957, 11)
  [WARN] LLM: missing column 'enrolled_graduate_parttime', sampling from real data.


In [None]:
# Every evaluation view as (label, frame to fit on, frame to score on).
# TSTR rows fit on a synthetic dataset and score on the real validation split;
# TRTS rows fit on the real training split and score on a synthetic dataset.
view_specs = [
    # TSTR
    ("CTGAN TSTR (train CTGAN, test real)", synth_ctgan, ipeds_val_clean),
    ("TVAE TSTR (train TVAE, test real)", synth_tvae, ipeds_val_clean),
    ("LLM TSTR (train LLM, test real)", synth_llm, ipeds_val_clean),
    # TRTS
    ("CTGAN TRTS (train real, test CTGAN)", ipeds_train_clean, synth_ctgan),
    ("TVAE TRTS (train real, test TVAE)", ipeds_train_clean, synth_tvae),
    ("LLM TRTS (train real, test LLM)", ipeds_train_clean, synth_llm),
]

for view_label, fit_frame, score_frame in view_specs:
    results.append(run_view(view_label, fit_frame, score_frame))

# One row per view (baseline included), persisted next to the other outputs.
results_df = pd.DataFrame(results)
metrics_path = os.path.join(OUT_DIR, "ipeds_05h_tstr_trts_metrics.csv")
results_df.to_csv(metrics_path, index=False)

print("\nSaved TSTR/TRTS metrics →", metrics_path)
results_df



Saved TSTR/TRTS metrics → /content/drive/MyDrive/dissertation/outputs/ipeds_5h_tstr_trts/ipeds_05h_tstr_trts_metrics.csv


Unnamed: 0,view,rmse,mae,r2,spearman
0,Real→Real (baseline),0.581262,0.35295,0.873117,0.932018
1,"CTGAN TSTR (train CTGAN, test real)",1.140395,0.840173,0.511606,0.763805
2,"TVAE TSTR (train TVAE, test real)",1.0033,0.711526,0.621974,0.834649
3,"LLM TSTR (train LLM, test real)",1.398505,1.072539,0.265507,0.537957
4,"CTGAN TRTS (train real, test CTGAN)",1.511696,1.151499,0.058879,0.575456
5,"TVAE TRTS (train real, test TVAE)",1.032613,0.745622,0.380654,0.760768
6,"LLM TRTS (train real, test LLM)",1.944708,1.519293,-0.414061,0.085118
