In [None]:
# ============================================================
# 05f_ipeds_cip_embedding_prediction.ipynb
# Numeric + Categorical Baselines + CIP Embedding Views
# Target: log1p(completers) (regression)
# ============================================================

import os
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import spearmanr

# If running in Colab, uncomment:
# from google.colab import drive
# drive.mount("/content/drive")

# Root project directory (adjust if needed)
PROJ = "/content/drive/MyDrive/dissertation"

# Inputs are read from DATA_DIR; everything this notebook writes goes to OUT_DIR.
DATA_DIR, OUT_DIR = f"{PROJ}/data", f"{PROJ}/outputs/ipeds_5f"
os.makedirs(OUT_DIR, exist_ok=True)  # no error if the folder is already there

# ------------------------------------------------------------
# 1. Load IPEDS train / val splits
#    (both CSV files must already exist under DATA_DIR)
# ------------------------------------------------------------
train_path, val_path = f"{DATA_DIR}/ipeds_train.csv", f"{DATA_DIR}/ipeds_val.csv"

ipeds_train = pd.read_csv(train_path)
ipeds_val = pd.read_csv(val_path)

# Quick look at what was loaded: row/column counts per split and the train schema.
print("Train shape:", ipeds_train.shape)
print("Val shape:", ipeds_val.shape)
print("Columns:", list(ipeds_train.columns))

# ------------------------------------------------------------
# 2. Define target, IDs, and column groups (aligned with 5a_b)
# ------------------------------------------------------------
id_cols = ["unitid", "year"]
target_col = "completers"
high_cardols = ["cips"]  # high-cardinality categorical; enters the model only via embeddings

# Metadata categoricals: excluded from the feature space below, never model inputs.
metadata_cats = [
    "state_abbr", "inst_control", "urban_centric_locale",
    "inst_size", "cbsa_type", "inst_affiliation",
]

# Candidate features = every train column that is not an ID, the target,
# the high-cardinality CIP column, or metadata. Train column order is kept.
all_cols = ipeds_train.columns.tolist()
base_feature_cols = [
    col for col in all_cols
    if col not in {*id_cols, target_col, *high_cardols, *metadata_cats}
]
print("Base feature columns:", base_feature_cols)

# From 5a_b: only 'region' and 'sector' were used as model categoricals;
# every other base feature is treated as numeric.
categorical_model_cols = [col for col in ("region", "sector") if col in base_feature_cols]
numeric_cols = [col for col in base_feature_cols if col not in categorical_model_cols]

print("Numeric cols:", numeric_cols)
print("Categorical model cols:", categorical_model_cols)

# ------------------------------------------------------------
# 3. Prepare X / y for train / val
#    Target: log1p(completers)
# ------------------------------------------------------------
# Regression target on the log1p scale for both splits.
y_train = np.log1p(ipeds_train[target_col]).values
y_val = np.log1p(ipeds_val[target_col]).values

# Independent copies of the base feature columns, so later edits to the
# design matrices cannot touch the loaded frames.
X_train_base = ipeds_train.loc[:, base_feature_cols].copy()
X_val_base = ipeds_val.loc[:, base_feature_cols].copy()

# Raw CIP strings, kept aside for the embedding aggregation step.
cip_col = "cips"
cip_train_str = ipeds_train[cip_col].astype(str)
cip_val_str = ipeds_val[cip_col].astype(str)

print("Example CIPs (train):")
print(cip_train_str.head())


Train shape: (6602, 21)
Val shape: (2201, 21)
Columns: ['longitude', 'latitude', 'student_faculty_ratio', 'headcount', 'cbsa', 'enrolled_undergrad_fulltime', 'enrolled_undergrad_parttime', 'enrolled_graduate_fulltime', 'enrolled_graduate_parttime', 'region', 'sector', 'completers', 'unitid', 'year', 'cips', 'state_abbr', 'inst_control', 'urban_centric_locale', 'inst_size', 'cbsa_type', 'inst_affiliation']
Base feature columns: ['longitude', 'latitude', 'student_faculty_ratio', 'headcount', 'cbsa', 'enrolled_undergrad_fulltime', 'enrolled_undergrad_parttime', 'enrolled_graduate_fulltime', 'enrolled_graduate_parttime', 'region', 'sector']
Numeric cols: ['longitude', 'latitude', 'student_faculty_ratio', 'headcount', 'cbsa', 'enrolled_undergrad_fulltime', 'enrolled_undergrad_parttime', 'enrolled_graduate_fulltime', 'enrolled_graduate_parttime']
Categorical model cols: ['region', 'sector']
Example CIPs (train):
0    04.0501,09.0101,09.0102,11.0101,11.0103,12.059...
1                        

In [None]:
# ------------------------------------------------------------
# 4. Evaluation helper
# ------------------------------------------------------------
from math import sqrt

def eval_regression(name, y_true, y_pred):
    """Score one set of predictions and return a single results-table row.

    Parameters
    ----------
    name : label stored under the ``"view"`` key.
    y_true, y_pred : observed and predicted targets; metrics are computed on
        whatever scale these are passed in.

    Returns
    -------
    dict with keys ``view``, ``rmse``, ``mae``, ``r2`` and ``spearman``.
    """
    return {
        "view": name,
        # RMSE is derived from the plain MSE instead of a `squared=` keyword.
        "rmse": sqrt(mean_squared_error(y_true, y_pred)),
        "mae": mean_absolute_error(y_true, y_pred),
        "r2": r2_score(y_true, y_pred),
        # spearmanr returns (correlation, p-value); only the correlation is kept.
        "spearman": spearmanr(y_true, y_pred)[0],
    }


In [None]:
# 5. Baseline model: numeric + OHE (region, sector)

# Numeric columns pass through unchanged; region / sector are one-hot encoded.
# OneHotEncoder produces a sparse matrix by default and HistGradientBoostingRegressor
# needs dense input, so sparse_threshold=0 forces the ColumnTransformer to always
# return a dense array instead of depending on the density of the stacked output.
ohe_preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_cols),
        # handle_unknown="ignore": categories unseen in training encode as all zeros.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_model_cols),
    ],
    sparse_threshold=0,
)

base_model = Pipeline(
    steps=[
        ("preprocess", ohe_preprocessor),
        ("regressor", HistGradientBoostingRegressor(random_state=42)),
    ]
)

# Fit on the training split and score on validation (log1p target scale).
base_model.fit(X_train_base, y_train)

y_val_pred_base = base_model.predict(X_val_base)

results = []
results.append(eval_regression("HGBR + Numeric + OHE(region, sector)", y_val, y_val_pred_base))

# Last expression of the cell: rendered as the cell's output table.
pd.DataFrame(results)


Unnamed: 0,view,rmse,mae,r2,spearman
0,"HGBR + Numeric + OHE(region, sector)",0.581262,0.35295,0.873117,0.932018


In [None]:
# ------------------------------------------------------------
# 6. Load CIP embeddings (Node2Vec + Poincaré)
#    Same files as in 05c / 05d / 05e
# ------------------------------------------------------------
# CSV with the Node2Vec ("graph") CIP embeddings, under PROJ/outputs/embeddings.
# NOTE(review): the `_64` suffix presumably denotes the embedding width — confirm.
NODE2VEC_FILE = f"{PROJ}/outputs/embeddings/cip_embeddings_graph_64.csv"
# CSV with the Poincaré CIP embeddings, stored in the same folder.
POINCARE_FILE = f"{PROJ}/outputs/embeddings/cip_embeddings_poincare_ipeds_64.csv"

def load_cip_embedding(path):
    """Load a CIP embedding table from CSV, keeping the CIP codes as text.

    The identifier column is the first of ``"cip"``, ``"cips"``, ``"code"``
    found in the header; if none is present, the first column is used.

    The identifier column is read with ``dtype=str``. Without that, pandas
    infers numeric-looking codes as floats, so ``"04.0501"`` becomes ``4.0501``
    and ``"09.0100"`` becomes ``9.01``; converting to string afterwards cannot
    restore the dropped zeros.

    Parameters
    ----------
    path : location of the CSV file (it is opened twice: header, then data).

    Returns
    -------
    (df, id_col) : the loaded DataFrame, with ``df[id_col]`` as stripped
        strings, and the name of the identifier column.
    """
    # Header-only read: the identifier column must be known before the full
    # read so that its dtype can be pinned.
    header_cols = pd.read_csv(path, nrows=0).columns
    id_col = next((cand for cand in ("cip", "cips", "code") if cand in header_cols), None)
    if id_col is None:
        # Fall back to treating the first column as the CIP code.
        id_col = header_cols[0]

    df = pd.read_csv(path, dtype={id_col: str})
    # astype(str) is kept so that missing codes still come out as the string "nan".
    df[id_col] = df[id_col].astype(str).str.strip()
    return df, id_col

# Load both embedding tables (Node2Vec first, then Poincaré) together with the
# name of the column that holds the CIP code in each.
(E_n2v_df, n2v_id), (E_poin_df, poin_id) = (
    load_cip_embedding(NODE2VEC_FILE),
    load_cip_embedding(POINCARE_FILE),
)

print("Node2Vec embedding shape:", E_n2v_df.shape, "CIP col:", n2v_id)
print("Poincaré embedding shape:", E_poin_df.shape, "CIP col:", poin_id)


Node2Vec embedding shape: (1585, 65) CIP col: cip
Poincaré embedding shape: (1574, 65) CIP col: cip


In [None]:
# ------------------------------------------------------------
# 7. Build CIP → embedding lookup dicts and
#    create row-level aggregated CIP embeddings (mean over CIPs)
# ------------------------------------------------------------
def build_cip_lookup(emb_df, id_col):
    """Turn an embedding table into a ``{CIP code: vector}`` dictionary.

    Parameters
    ----------
    emb_df : DataFrame with one identifier column and numeric embedding columns.
    id_col : name of the identifier column.

    Returns
    -------
    (lookup, emb_cols) : ``lookup`` maps each stripped, stringified code to a
        1-D float array; ``emb_cols`` lists the embedding column names in
        frame order. If a code occurs more than once, the last row wins.
    """
    emb_cols = [c for c in emb_df.columns if c != id_col]

    # Stringify the code column on its own. Going row by row with iterrows()
    # would upcast each row to a common dtype first, so an integer code such
    # as 11 would turn into "11.0".
    codes = emb_df[id_col].astype(str).str.strip()

    # One float matrix for all rows; copy=True keeps the vectors independent
    # of later changes to emb_df.
    matrix = emb_df[emb_cols].to_numpy(dtype=float, copy=True)

    lookup = dict(zip(codes, matrix))
    return lookup, emb_cols

# Code -> vector dictionaries for both embedding families, plus the names of
# their embedding columns.
(n2v_lookup, n2v_emb_cols), (poin_lookup, poin_emb_cols) = (
    build_cip_lookup(E_n2v_df, n2v_id),
    build_cip_lookup(E_poin_df, poin_id),
)

print("Node2Vec embedding dims:", len(n2v_emb_cols))
print("Poincaré embedding dims:", len(poin_emb_cols))

def aggregate_cip_embeddings(cip_series, lookup, prefix="cip_n2v"):
    """Average the embeddings of the CIP codes listed in each row.

    Parameters
    ----------
    cip_series : iterable of comma-separated CIP strings such as
        ``"04.0501,09.0101,11.0101"``. Each value is passed through ``str()``
        and spaces are removed before splitting.
    lookup : dict mapping CIP code -> 1-D embedding vector (all the same length).
    prefix : column name prefix; columns are named ``f"{prefix}_{i}"``.

    Returns
    -------
    DataFrame with one row per input value and one column per embedding
    dimension, indexed 0..n-1 (the index of ``cip_series`` is not carried
    over). A row holds the arithmetic mean of the vectors of its codes that
    are present in ``lookup``; rows with no known code are all zeros. Empty
    input yields a frame with zero rows and the full set of columns.

    Raises
    ------
    ValueError : if ``lookup`` is empty, since the embedding dimension cannot
        then be determined.
    """
    if not lookup:
        raise ValueError("lookup is empty; cannot infer the embedding dimension")

    # Dimension is taken once from an arbitrary vector instead of being
    # re-derived for every row that has no matching code.
    dim = len(next(iter(lookup.values())))

    rows = []
    for s in cip_series:
        s = str(s).replace(" ", "")
        codes = [c for c in s.split(",") if c]
        vecs = [lookup[c] for c in codes if c in lookup]
        if vecs:
            agg = np.stack(vecs, axis=0).mean(axis=0)
        else:
            # no CIP found in lookup → zeros
            agg = np.zeros(dim, dtype=float)
        rows.append(agg)

    # np.vstack rejects an empty list, so empty input gets an explicit (0, dim) array.
    arr = np.vstack(rows) if rows else np.empty((0, dim), dtype=float)
    cols = [f"{prefix}_{i}" for i in range(arr.shape[1])]
    return pd.DataFrame(arr, columns=cols)

# Build row-level CIP embeddings for train / val
# Row-level (mean-pooled) CIP embeddings for each split: Node2Vec first ...
cip_n2v_train, cip_n2v_val = (
    aggregate_cip_embeddings(cip_train_str, n2v_lookup, "cip_n2v"),
    aggregate_cip_embeddings(cip_val_str, n2v_lookup, "cip_n2v"),
)

# ... then Poincaré, under its own column prefix.
cip_poin_train, cip_poin_val = (
    aggregate_cip_embeddings(cip_train_str, poin_lookup, "cip_poin"),
    aggregate_cip_embeddings(cip_val_str, poin_lookup, "cip_poin"),
)

print("cip_n2v_train shape:", cip_n2v_train.shape)
print("cip_poin_train shape:", cip_poin_train.shape)


Node2Vec embedding dims: 64
Poincaré embedding dims: 64
cip_n2v_train shape: (6602, 64)
cip_poin_train shape: (6602, 64)


In [None]:
# ------------------------------------------------------------
# 8. Helper to fit & evaluate a view
#    (baseline, + Node2Vec, + Poincaré)
# ------------------------------------------------------------
def fit_view_with_embeddings(
    view_name,
    X_train_base,
    X_val_base,
    y_train,
    y_val,
    emb_train_df=None,
    emb_val_df=None,
):
    """Fit one feature "view" and return its validation metrics.

    A view is the base feature frame, optionally widened with row-level
    embedding columns that are treated as extra numeric features. The model is
    a ColumnTransformer (numeric passthrough + one-hot categoricals) followed
    by ``HistGradientBoostingRegressor(random_state=42)``.

    Reads ``numeric_cols`` and ``categorical_model_cols`` from notebook scope;
    any of those names missing from the view frame are dropped without error.

    Parameters
    ----------
    view_name : label stored in the returned metrics.
    X_train_base, X_val_base : base feature DataFrames.
    y_train, y_val : targets for the two splits.
    emb_train_df, emb_val_df : optional embedding DataFrames, aligned with the
        base frames by row position (indexes are reset before concatenation).
        Pass both or neither.

    Returns
    -------
    The dict produced by ``eval_regression(view_name, y_val, y_val_pred)``.

    Raises
    ------
    ValueError : if only one embedding frame is given, or an embedding frame
        has a different number of rows than its base frame.
    """
    if (emb_train_df is None) != (emb_val_df is None):
        raise ValueError("emb_train_df and emb_val_df must be provided together")

    if emb_train_df is not None:
        # Concatenation is positional. A row-count mismatch would be padded
        # with NaN by pd.concat, and the regressor accepts NaN, so the model
        # would train on misaligned rows without any error — check up front.
        for split, base, emb in (
            ("train", X_train_base, emb_train_df),
            ("val", X_val_base, emb_val_df),
        ):
            if len(base) != len(emb):
                raise ValueError(
                    f"{split} embedding frame has {len(emb)} rows, "
                    f"but the base frame has {len(base)}"
                )

        X_train_view = pd.concat([X_train_base.reset_index(drop=True),
                                  emb_train_df.reset_index(drop=True)], axis=1)
        X_val_view   = pd.concat([X_val_base.reset_index(drop=True),
                                  emb_val_df.reset_index(drop=True)], axis=1)
        emb_cols = emb_train_df.columns.tolist()
    else:
        X_train_view = X_train_base.copy()
        X_val_view   = X_val_base.copy()
        emb_cols = []

    # Numeric = original numeric + embedding cols, limited to columns that
    # actually exist in the view frame.
    numeric_view_cols = [c for c in numeric_cols + emb_cols if c in X_train_view.columns]
    cat_view_cols     = [c for c in categorical_model_cols if c in X_train_view.columns]

    # sparse_threshold=0: always return a dense array. OneHotEncoder emits
    # sparse output by default and the regressor below needs dense input.
    pre = ColumnTransformer(
        transformers=[
            ("num", "passthrough", numeric_view_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_view_cols),
        ],
        sparse_threshold=0,
    )

    model = Pipeline(
        steps=[
            ("preprocess", pre),
            ("regressor", HistGradientBoostingRegressor(random_state=42)),
        ]
    )

    model.fit(X_train_view, y_train)
    y_val_pred = model.predict(X_val_view)

    return eval_regression(view_name, y_val, y_val_pred)

# ------------------------------------------------------------
# 9. Run all three views: baseline, +Node2Vec, +Poincaré
# ------------------------------------------------------------
results = []

# View 1: baseline. No embedding frames are passed, so the helper's defaults
# (None) apply; this refits the same configuration as the baseline cell so that
# all three rows come from one code path.
baseline_res = fit_view_with_embeddings(
    "Baseline: Numeric + OHE(region, sector)",
    X_train_base, X_val_base, y_train, y_val,
)
results.append(baseline_res)

# View 2: baseline features + row-level Node2Vec CIP embeddings
n2v_res = fit_view_with_embeddings(
    "Numeric + OHE + CIP Node2Vec (64D)",
    X_train_base, X_val_base, y_train, y_val,
    cip_n2v_train, cip_n2v_val,
)
results.append(n2v_res)

# View 3: baseline features + row-level Poincaré CIP embeddings
poin_res = fit_view_with_embeddings(
    "Numeric + OHE + CIP Poincaré (64D)",
    X_train_base, X_val_base, y_train, y_val,
    cip_poin_train, cip_poin_val,
)
results.append(poin_res)

# One row per view; the bare name on the last line renders the table.
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,view,rmse,mae,r2,spearman
0,"Baseline: Numeric + OHE(region, sector)",0.581262,0.35295,0.873117,0.932018
1,Numeric + OHE + CIP Node2Vec (64D),0.428623,0.274823,0.931006,0.965032
2,Numeric + OHE + CIP Poincaré (64D),0.437416,0.282828,0.928146,0.962473


In [None]:
# ------------------------------------------------------------
# 10. Save results to CSV
# ------------------------------------------------------------
# Write the per-view metrics table into OUT_DIR (row index omitted), then echo
# the destination and the table itself.
metrics_path = os.path.join(OUT_DIR, "ipeds_5f_regression_views.csv")
results_df.to_csv(path_or_buf=metrics_path, index=False)

print("Saved metrics →", metrics_path)
print(results_df)


Saved metrics → /content/drive/MyDrive/dissertation/outputs/ipeds_5f/ipeds_5f_regression_views.csv
                                      view      rmse       mae        r2  \
0  Baseline: Numeric + OHE(region, sector)  0.581262  0.352950  0.873117   
1       Numeric + OHE + CIP Node2Vec (64D)  0.428623  0.274823  0.931006   
2       Numeric + OHE + CIP Poincaré (64D)  0.437416  0.282828  0.928146   

   spearman  
0  0.932018  
1  0.965032  
2  0.962473  
