In [None]:
# ============================================================
# 05g_ipeds_synthetic_eval.ipynb
# Synthetic Data Generation for IPEDS
# Models: CTGAN, TVAE, LLM fine-tuning (small PNNL-style)
# Task: TSTR / TRTS for log1p(completers) regression
# ============================================================

!pip install "ctgan==0.11.1" --upgrade --no-cache-dir
!pip install transformers datasets accelerate --quiet

import os
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from scipy.stats import spearmanr

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

import torch
from datasets import Dataset

# Synthetic models
from ctgan import CTGAN
from ctgan.synthesizers import TVAE

# ------------------------------------------------------------
# Mount Drive (if needed)
# ------------------------------------------------------------
# Mount Google Drive when running on Colab. Outside Colab the import fails and
# the notebook simply relies on the paths already existing locally.
try:
    from google.colab import drive
except ImportError:
    pass
else:
    if not os.path.exists("/content/drive/MyDrive"):
        try:
            drive.mount("/content/drive")
        except Exception as exc:
            # Stay best-effort, but surface the failure instead of hiding it:
            # a silent mount error otherwise shows up later as a confusing
            # FileNotFoundError when the CSVs are read.
            print(f"WARNING: could not mount Google Drive: {exc}")

# ------------------------------------------------------------
# Paths
# ------------------------------------------------------------
# Project root on the mounted Drive; every other location is derived from it.
PROJ = "/content/drive/MyDrive/dissertation"
DATA_DIR = f"{PROJ}/data"
EMB_DIR = f"{PROJ}/outputs/embeddings_cip"
OUT_DIR = f"{PROJ}/outputs/ipeds_5g_synth"

# All artifacts written by this notebook land in OUT_DIR.
os.makedirs(OUT_DIR, exist_ok=True)

# ------------------------------------------------------------
# Pre-split IPEDS tables (train / validation)
# ------------------------------------------------------------
ipeds_train, ipeds_val = (
    pd.read_csv(csv_path)
    for csv_path in (f"{DATA_DIR}/ipeds_train.csv", f"{DATA_DIR}/ipeds_val.csv")
)

print("Train:", ipeds_train.shape, "Val:", ipeds_val.shape)


Collecting ctgan==0.11.1
  Downloading ctgan-0.11.1-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.14.0 (from ctgan==0.11.1)
  Downloading rdt-1.18.2-py3-none-any.whl.metadata (10 kB)
Collecting Faker!=37.11.0,>=17 (from rdt>=1.14.0->ctgan==0.11.1)
  Downloading faker-38.2.0-py3-none-any.whl.metadata (16 kB)
Downloading ctgan-0.11.1-py3-none-any.whl (25 kB)
Downloading rdt-1.18.2-py3-none-any.whl (74 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m159.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faker-38.2.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m102.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker, rdt, ctgan
Successfully installed Faker-38.2.0 ctgan-0.11.1 rdt-1.18.2
Mounted at /content/drive
Train: (6602, 21) Val: (2201, 21)


In [None]:
# Column roles. Identifiers, the target, the high-cardinality CIP list and the
# metadata categoricals are all kept out of the base feature set.
id_cols = ["unitid", "year"]
target_col = "completers"
high_cardols = ["cips"]
metadata_cats = [
    "state_abbr",
    "inst_control",
    "urban_centric_locale",
    "inst_size",
    "cbsa_type",
    "inst_affiliation",
]

all_cols = ipeds_train.columns.tolist()

_non_features = {*id_cols, target_col, *high_cardols, *metadata_cats}
base_feature_cols = [name for name in all_cols if name not in _non_features]

# Only region/sector are modelled as categoricals; the rest are numeric.
categorical_model_cols = [
    name for name in ("region", "sector") if name in base_feature_cols
]
numeric_cols = [
    name for name in base_feature_cols if name not in categorical_model_cols
]

print("Numeric:", numeric_cols)
print("Categorical:", categorical_model_cols)


Numeric: ['longitude', 'latitude', 'student_faculty_ratio', 'headcount', 'cbsa', 'enrolled_undergrad_fulltime', 'enrolled_undergrad_parttime', 'enrolled_graduate_fulltime', 'enrolled_graduate_parttime']
Categorical: ['region', 'sector']


In [None]:
def load_embedding(path):
    """Read a CIP embedding table and index it by CIP code.

    Parameters
    ----------
    path : str or path-like
        CSV file with a ``cip`` column; the remaining columns are the
        embedding dimensions.

    Returns
    -------
    pandas.DataFrame
        The embedding columns, indexed by the CIP code as a string.
    """
    # Read the code column as text. Letting pandas infer the dtype turns a code
    # such as "01.0100" into the float 1.01, so the later string lookup against
    # the codes in the `cips` column would silently miss.
    df = pd.read_csv(path, dtype={"cip": str})
    # Keep the original behaviour of stringifying missing codes ("nan").
    df["cip"] = df["cip"].astype(str)
    return df.set_index("cip")

# Embedding tables produced upstream; adjust the file names to your own run.
# NOTE(review): these paths point at outputs/embeddings, not at EMB_DIR
# (outputs/embeddings_cip) defined in the paths cell — confirm which is intended.
NODE2VEC_FILE = f"{PROJ}/outputs/embeddings/cip_embeddings_graph_64.csv"
POINCARE_FILE = f"{PROJ}/outputs/embeddings/cip_embeddings_poincare_ipeds_64.csv"

E_n2v, E_poin = (
    load_embedding(emb_file) for emb_file in (NODE2VEC_FILE, POINCARE_FILE)
)

for emb_label, emb_table in (("Node2Vec:", E_n2v), ("Poincaré:", E_poin)):
    print(emb_label, emb_table.shape)


Node2Vec: (1585, 64)
Poincaré: (1574, 64)


In [None]:
def aggregate_cips(cip_series, embedding_df):
    """Mean-pool CIP embeddings for each comma-separated list of codes.

    Parameters
    ----------
    cip_series : iterable
        One entry per row; each entry is a comma-separated string of CIP codes.
        Non-string entries (e.g. NaN) are stringified and therefore match no code.
    embedding_df : pandas.DataFrame
        Embedding table indexed by CIP code, one column per dimension.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, n_dims)``. A row with no known code is all
        zeros; an empty input yields shape ``(0, n_dims)``.
    """
    dim = embedding_df.shape[1]
    vectors = embedding_df.to_numpy()
    # Map code -> row position once, so the per-row lookup is a dict hit
    # instead of a label-based .loc call for every code.
    row_of = {code: pos for pos, code in enumerate(embedding_df.index)}

    rows = []
    for entry in cip_series:
        codes = (part.strip() for part in str(entry).split(","))
        hits = [row_of[code] for code in codes if code in row_of]
        rows.append(vectors[hits].mean(axis=0) if hits else np.zeros(dim))

    if not rows:
        # np.array([]) would be 1-D; keep the column dimension for callers.
        return np.empty((0, dim))
    return np.array(rows)

# CIP code lists as plain strings, one per institution row.
cip_train, cip_val = (
    frame["cips"].astype(str) for frame in (ipeds_train, ipeds_val)
)

# Pooled Node2Vec embedding per row (train, then validation).
cip_n2v_train, cip_n2v_val = (
    aggregate_cips(codes, E_n2v) for codes in (cip_train, cip_val)
)

# Pooled Poincaré embedding per row (train, then validation).
cip_poin_train, cip_poin_val = (
    aggregate_cips(codes, E_poin) for codes in (cip_train, cip_val)
)


In [None]:
# Base feature frames (copies, so later edits never touch the raw tables).
X_train_base, X_val_base = (
    ipeds_train[base_feature_cols].copy(),
    ipeds_val[base_feature_cols].copy(),
)

# Regression target: log1p of the completer count.
y_train, y_val = (
    np.log1p(frame[target_col].values) for frame in (ipeds_train, ipeds_val)
)

# Numeric columns pass through unchanged; categoricals are one-hot encoded and
# categories unseen at fit time are ignored at transform time.
pre = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_model_cols),
    ]
)


In [None]:
def row_to_text(r):
    """Serialize one IPEDS row as a single ``label: value`` line.

    ``r`` is any mapping-like row (e.g. a pandas Series) holding the columns
    listed below. Fields are emitted in a fixed order, separated by ", ",
    with ``completers`` last.
    """
    # (label written to the text, column read from the row)
    fields = (
        ("longitude", "longitude"),
        ("latitude", "latitude"),
        ("student_faculty_ratio", "student_faculty_ratio"),
        ("headcount", "headcount"),
        ("cbsa", "cbsa"),
        ("undergrad_ft", "enrolled_undergrad_fulltime"),
        ("undergrad_pt", "enrolled_undergrad_parttime"),
        ("grad_ft", "enrolled_graduate_fulltime"),
        ("grad_pt", "enrolled_graduate_parttime"),
        ("region", "region"),
        ("sector", "sector"),
        ("cips", "cips"),
        ("completers", "completers"),
    )
    return ", ".join(f"{label}: {r[column]}" for label, column in fields)

# One serialized text line per training row, wrapped as a HF Dataset.
train_texts = list(map(row_to_text, (row for _, row in ipeds_train.iterrows())))
ds = Dataset.from_dict({"text": train_texts})


In [None]:
# ===== Disable W&B completely =====
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

# ----- Use a smaller model: distilgpt2 -----
model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 ships without a pad token; reuse EOS so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# ===== Use a smaller subset of the data (e.g., 2000 rows) =====
# 'ds' was built earlier from ipeds_train via Dataset.from_dict({"text": train_texts})
N_SUBSET = min(2000, len(ds))
ds_small = ds.shuffle(seed=42).select(range(N_SUBSET))


def tokenize(batch):
    """Tokenize a batch of serialized rows to fixed-length (128-token) inputs.

    NOTE(review): rows longer than 128 tokens are truncated, so the trailing
    fields of the serialized text (`cips`, `completers`) may be cut off for
    institutions with long CIP lists — verify token lengths on the real data.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )


ds_tok = ds_small.map(tokenize, batched=True, remove_columns=["text"])

# ----- Load smaller model -----
model = AutoModelForCausalLM.from_pretrained(model_name)
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM objective, not masked LM
)

# ===== Much lighter training config =====
# The training budget is set by max_steps alone. A positive max_steps overrides
# num_train_epochs, so the previous `num_train_epochs=1` had no effect:
# 200 steps x batch 16 = 3200 samples, i.e. ~1.6 passes over the 2000-row subset.
training_args = TrainingArguments(
    output_dir=f"{OUT_DIR}/llm_model_distilgpt2",
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    max_steps=200,        # hard cap on optimizer steps (the only budget)
    logging_steps=20,
    save_steps=200,       # single checkpoint, written at the final step
    report_to="none",     # no wandb / tb
    disable_tqdm=False,   # keep the progress bar
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=ds_tok,
)

trainer.train()



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Step,Training Loss
20,2.179
40,1.0227
60,0.8402
80,0.7878
100,0.7631
120,0.7365
140,0.7278
160,0.7289
180,0.7197
200,0.7234


TrainOutput(global_step=200, training_loss=0.9228998327255249, metrics={'train_runtime': 4461.2396, 'train_samples_per_second': 0.717, 'train_steps_per_second': 0.045, 'total_flos': 104518700236800.0, 'train_loss': 0.9228998327255249, 'epoch': 1.6})

In [None]:
# Columns used in 5f: identifiers, target, high-cardinality CIP list and the
# metadata categoricals are all excluded from the modelled features.
id_cols = ["unitid", "year"]
target_col = "completers"
high_cardols = ["cips"]
metadata_cats = [
    "state_abbr",
    "inst_control",
    "urban_centric_locale",
    "inst_size",
    "cbsa_type",
    "inst_affiliation",
]

all_cols = ipeds_train.columns.tolist()

_non_features = {*id_cols, target_col, *high_cardols, *metadata_cats}
base_feature_cols = [name for name in all_cols if name not in _non_features]

categorical_model_cols = [
    name for name in ("region", "sector") if name in base_feature_cols
]
numeric_cols = [
    name for name in base_feature_cols if name not in categorical_model_cols
]

print("Numeric cols:", numeric_cols)
print("Categorical cols:", categorical_model_cols)

# The synthesizers see features AND the target, so they learn the joint table.
synth_cols = [*numeric_cols, *categorical_model_cols, target_col]

train_df, val_df = ipeds_train[synth_cols].copy(), ipeds_val[synth_cols].copy()

print("Training for synthetic:", train_df.shape)


Numeric cols: ['longitude', 'latitude', 'student_faculty_ratio', 'headcount', 'cbsa', 'enrolled_undergrad_fulltime', 'enrolled_undergrad_parttime', 'enrolled_graduate_fulltime', 'enrolled_graduate_parttime']
Categorical cols: ['region', 'sector']
Training for synthetic: (6602, 12)


In [None]:
# --- Table handed to the tabular synthesizers (features + target) ---
synth_cols = [
    # continuous / count features
    "longitude", "latitude", "student_faculty_ratio", "headcount", "cbsa",
    "enrolled_undergrad_fulltime", "enrolled_undergrad_parttime",
    "enrolled_graduate_fulltime", "enrolled_graduate_parttime",
    # categoricals, replaced by integer codes below
    "region", "sector",
    # regression target
    "completers",
]

train_df = ipeds_train[synth_cols].copy()

# Columns that must end up with a numeric dtype.
# NOTE(review): this rebinds `numeric_cols` from the earlier feature-only list
# to one that also contains the target `completers` — confirm later cells do
# not reuse it as a feature list.
numeric_cols = [
    "longitude", "latitude", "student_faculty_ratio", "headcount", "cbsa",
    "enrolled_undergrad_fulltime", "enrolled_undergrad_parttime",
    "enrolled_graduate_fulltime", "enrolled_graduate_parttime",
    "completers",
]

for col in numeric_cols:
    # Anything unparseable becomes NaN instead of raising.
    train_df[col] = pd.to_numeric(train_df[col], errors="coerce")

# Replace region / sector labels by their categorical integer codes; the
# label order is printed so the codes can be mapped back by hand.
cat_cols = ["region", "sector"]
for col in cat_cols:
    if col not in train_df.columns:
        continue
    as_category = train_df[col].astype("category")
    print(f"{col} categories:", as_category.cat.categories)
    train_df[col] = as_category.cat.codes  # 0,1,2,...

# Columns CTGAN/TVAE must treat as discrete rather than continuous.
discrete_cols = [col for col in cat_cols if col in train_df.columns]

print("Final train_df dtypes:")
print(train_df.dtypes)
print("Discrete columns:", discrete_cols)
print("Training DF shape:", train_df.shape)


region categories: Index(['Far West: AK CA HI NV OR and WA', 'Great Lakes: IL IN MI OH and WI',
       'Mid East: DE DC MD NJ NY and PA', 'New England: CT ME MA NH RI and VT',
       'Outlying areas: AS FM GU MH MP PR PW and VI',
       'Plains: IA KS MN MO NE ND and SD',
       'Rocky Mountains: CO ID MT UT and WY',
       'Southeast: AL AR FL GA KY LA MS NC SC TN VA and WV',
       'Southwest: AZ NM OK and TX', 'US service schools'],
      dtype='object')
sector categories: Index(['Private not-for-profit four-year or above', 'Public four-year or above'], dtype='object')
Final train_df dtypes:
longitude                      float64
latitude                       float64
student_faculty_ratio          float64
headcount                      float64
cbsa                           float64
enrolled_undergrad_fulltime      int64
enrolled_undergrad_parttime      int64
enrolled_graduate_fulltime       int64
enrolled_graduate_parttime       int64
region                            int8
sector  

In [None]:
from ctgan import CTGAN

ctgan = CTGAN(
    epochs=50,
    batch_size=128,      # fine
    pac=1,               # <- IMPORTANT: disable packing so batch size is OK
    generator_lr=2e-4,
    discriminator_lr=2e-4,
    verbose=True
)

print("Fitting CTGAN...")
ctgan.fit(train_df, discrete_columns=discrete_cols)

print("Sampling synthetic CTGAN data...")
# Match the size of the real training table instead of hard-coding the count.
synthetic_ctgan = ctgan.sample(len(train_df))

# CTGAN can sample outside the observed range (e.g. negative headcount or
# completers). Negative counts are impossible and make log1p(completers)
# NaN/-inf downstream, so floor at 0 every continuous column whose real
# values are all non-negative. Columns with real negatives are left alone.
for col in train_df.columns:
    if col in discrete_cols:
        continue
    if train_df[col].min() >= 0:
        synthetic_ctgan[col] = synthetic_ctgan[col].clip(lower=0)

ctgan_path = f"{OUT_DIR}/synthetic_ctgan.csv"
synthetic_ctgan.to_csv(ctgan_path, index=False)

print("Saved CTGAN synthetic data →", ctgan_path)
synthetic_ctgan.head()


Fitting CTGAN...


Gen. (-0.14) | Discrim. (-0.12): 100%|██████████| 50/50 [02:19<00:00,  2.80s/it]


Sampling synthetic CTGAN data...
Saved CTGAN synthetic data → /content/drive/MyDrive/dissertation/outputs/ipeds_5g_synth/synthetic_ctgan.csv


Unnamed: 0,longitude,latitude,student_faculty_ratio,headcount,cbsa,enrolled_undergrad_fulltime,enrolled_undergrad_parttime,enrolled_graduate_fulltime,enrolled_graduate_parttime,region,sector,completers
0,-71.270021,43.347245,12.811206,-290.941198,41700.621019,0,0,0,0,3,0,-90
1,-88.407861,42.83371,12.926298,2401.556411,29016.036583,0,0,1,1,7,0,-32
2,-97.97299,40.688251,17.006085,30439.932277,28874.788655,1,1,1,1,7,1,6358
3,-74.849261,42.130434,6.299198,2424.000508,35086.295945,1,1,1,1,2,0,305
4,-84.940729,42.951316,12.896107,2188.803325,29983.309592,0,1,1,1,7,0,154


In [None]:
from ctgan.synthesizers import TVAE

tvae = TVAE(
    embedding_dim=128,
    compress_dims=(128, 128),
    decompress_dims=(128, 128),
    l2scale=1e-5,
    batch_size=128,
    epochs=50,       # keep small, consistent with CTGAN
    verbose=True
)

print("Fitting TVAE...")
tvae.fit(train_df, discrete_columns=discrete_cols)

print("Sampling synthetic TVAE data...")
# Match the size of the real training table instead of hard-coding the count.
synthetic_tvae = tvae.sample(len(train_df))

# Same post-processing as for the CTGAN sample: floor at 0 every continuous
# column whose real values are all non-negative, so impossible negative counts
# never reach the log1p(completers) target.
for col in train_df.columns:
    if col in discrete_cols:
        continue
    if train_df[col].min() >= 0:
        synthetic_tvae[col] = synthetic_tvae[col].clip(lower=0)

tvae_path = f"{OUT_DIR}/synthetic_tvae.csv"
synthetic_tvae.to_csv(tvae_path, index=False)

print("Saved TVAE synthetic data →", tvae_path)
synthetic_tvae.head()


Fitting TVAE...


Loss: -30.111: 100%|██████████| 50/50 [00:38<00:00,  1.31it/s]


Sampling synthetic TVAE data...
Saved TVAE synthetic data → /content/drive/MyDrive/dissertation/outputs/ipeds_5g_synth/synthetic_tvae.csv


Unnamed: 0,longitude,latitude,student_faculty_ratio,headcount,cbsa,enrolled_undergrad_fulltime,enrolled_undergrad_parttime,enrolled_graduate_fulltime,enrolled_graduate_parttime,region,sector,completers
0,-82.609564,34.989455,15.071031,11217.550885,19873.340468,1,1,1,1,7,0,2386
1,-79.261245,38.516612,13.02617,580.46742,36520.270578,0,0,1,0,2,0,62
2,-95.376342,33.542134,13.112273,6123.311678,34828.543631,1,1,1,1,8,1,1113
3,-83.888212,36.891188,23.351094,6565.419798,43226.73519,1,1,0,0,7,1,1686
4,-87.779942,36.355592,21.952474,14371.38783,17462.74901,1,1,1,1,7,1,3699


In [None]:
# Column order of the synthetic tables: location and size features, the four
# enrollment flags, the two coded categoricals, then the target.
synth_cols = (
    ["longitude", "latitude", "student_faculty_ratio", "headcount", "cbsa"]
    + ["enrolled_undergrad_fulltime", "enrolled_undergrad_parttime"]
    + ["enrolled_graduate_fulltime", "enrolled_graduate_parttime"]
    + ["region", "sector"]
    + ["completers"]
)


In [None]:
import os
import logging
from transformers import AutoTokenizer, AutoModelForCausalLM

# Keep generation quiet: only errors from the generation utilities are shown.
logging.getLogger("transformers.generation.utils").setLevel(logging.ERROR)

base_model_name = "distilgpt2"

# Checkpoint written by the fine-tuning run (step 200).
ft_model_dir = f"{OUT_DIR}/llm_model_distilgpt2/checkpoint-200"

# --- Tokenizer: taken from the base model ---
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# EOS doubles as PAD, and padding goes on the left so that, for a decoder-only
# model, newly generated tokens directly follow the prompt.
tokenizer.pad_token, tokenizer.padding_side = tokenizer.eos_token, "left"

# --- Model: fine-tuned weights, switched to inference mode ---
model = AutoModelForCausalLM.from_pretrained(ft_model_dir)
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
import torch
import pandas as pd

model.eval()

N_SYNTH = 3000
BATCH_SIZE = 32
# Generation budget per row. It was 64, which is shorter than one serialized
# row: every sample stopped right after `grad_ft`, so grad_pt / region /
# sector / cips / completers were never produced and the 12-field parse
# returned 0 rows. 128 matches the max_length used to tokenize training rows.
MAX_NEW_TOKENS = 128

synthetic_rows = []

# Safe prompt (at least 1 token)
prompt = tokenizer.bos_token or "."

while len(synthetic_rows) < N_SYNTH:
    # The last batch may be smaller so exactly N_SYNTH rows are produced.
    batch_n = min(BATCH_SIZE, N_SYNTH - len(synthetic_rows))
    prompts = [prompt] * batch_n

    enc = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,      # left-padding now
        truncation=False
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=enc["input_ids"],
            attention_mask=enc["attention_mask"],
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,    # stochastic sampling: each row is a fresh draw
            temperature=0.9,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Special tokens (including the BOS/EOS prompt) are dropped from the text.
    texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    synthetic_rows.extend(texts)

    print(f"Generated {len(synthetic_rows)}/{N_SYNTH} rows...", end="\r")

synthetic_rows = synthetic_rows[:N_SYNTH]
print("\nDone generating LLM synthetic rows!")


Generated 3000/3000 rows...
Done generating LLM synthetic rows!


In [None]:
import re
import numpy as np
import pandas as pd

# Expected field order of a fully parsed LLM row: location and size features,
# the four enrollment flags, the two categoricals, then the target.
synth_cols = (
    ["longitude", "latitude", "student_faculty_ratio", "headcount", "cbsa"]
    + ["enrolled_undergrad_fulltime", "enrolled_undergrad_parttime"]
    + ["enrolled_graduate_fulltime", "enrolled_graduate_parttime"]
    + ["region", "sector"]
    + ["completers"]
)

def parse_numeric_list(text, n_cols):
    """Return the first ``n_cols`` numbers found in ``text`` as floats.

    Numbers are taken purely by position (signed integers or decimals), not
    by field label. Returns ``None`` when ``text`` holds fewer than
    ``n_cols`` numbers.
    """
    # Every match ends in at least one digit, so no empty / sign-only tokens
    # can occur and each match is a valid float literal.
    matches = re.findall(r"[-+]?\d*\.?\d+", text)
    if len(matches) < n_cols:
        return None
    return list(map(float, matches[:n_cols]))

# Keep only the generations that contain a number for every expected column.
parsed_rows = (parse_numeric_list(text, len(synth_cols)) for text in synthetic_rows)
records = [row for row in parsed_rows if row is not None]

df_llm = pd.DataFrame(records, columns=synth_cols)
print("Parsed rows:", len(df_llm))
df_llm.head()


Parsed rows: 0


Unnamed: 0,longitude,latitude,student_faculty_ratio,headcount,cbsa,enrolled_undergrad_fulltime,enrolled_undergrad_parttime,enrolled_graduate_fulltime,enrolled_graduate_parttime,region,sector,completers


In [None]:
# Force every parsed field to a numeric dtype (unparseable values become NaN).
for c in df_llm.columns:
    df_llm[c] = pd.to_numeric(df_llm[c], errors="coerce")

# binary-ish columns: round & clip to {0,1}
bin_cols = [
    "enrolled_undergrad_fulltime",
    "enrolled_undergrad_parttime",
    "enrolled_graduate_fulltime",
    "enrolled_graduate_parttime",
]

for c in bin_cols:
    df_llm[c] = df_llm[c].round().clip(0, 1).astype(int)

# region / sector are integer category codes; clip to the codes that actually
# exist in the encoded training table. The training data has 10 region
# categories (codes 0-9) and only 2 sector categories (codes 0-1), so the
# previous upper bound of 3 for sector let non-existent codes through.
N_REGION_CODES = 10
N_SECTOR_CODES = 2
df_llm["region"] = df_llm["region"].round().clip(0, N_REGION_CODES - 1).astype(int)
df_llm["sector"] = df_llm["sector"].round().clip(0, N_SECTOR_CODES - 1).astype(int)

df_llm.head()


Unnamed: 0,longitude,latitude,student_faculty_ratio,headcount,cbsa,enrolled_undergrad_fulltime,enrolled_undergrad_parttime,enrolled_graduate_fulltime,enrolled_graduate_parttime,region,sector,completers


In [None]:
# Show the first 20 raw generations (as reprs) to see what the model emits.
for idx, raw_text in enumerate(synthetic_rows[:20]):
    print(f"[{idx}] {raw_text!r}\n")


[0] 'itude: -88.94217, latitude: 42.792472, student_faculty_ratio: 13.0, headcount: 2689.0, cbsa: 41080.0, undergrad_ft: 1, undergrad_pt: 1, grad_ft: 1,'

[1] 'itude: -81.804582, latitude: 38.168051, student_faculty_ratio: 11.0, headcount: 1128.0, cbsa: 33460.0, undergrad_ft: 1, undergrad_pt: 1, grad_ft: 1,'

[2] 'itude: -66.686875, latitude: 40.397467, student_faculty_ratio: 13.0, headcount: 2689.0, cbsa: 24140.0, undergrad_ft: 1, undergrad_pt: 1, grad_ft: 1,'

[3] 'itude: -82.246924, latitude: 37.620123, student_faculty_ratio: 11.0, headcount: 1546.0, cbsa: 10540.0, undergrad_ft: 1, undergrad_pt: 1, grad_ft: 1,'

[4] 'itude: -94.96746, latitude: 33.023522, student_faculty_ratio: 14.0, headcount: 889.0, cbsa: 36620.0, undergrad_ft: 1, undergrad_pt: 1, grad_ft: 0,'

[5] 'itude: -87.041679, latitude: 32.81525, student_faculty_ratio: 13.0, headcount: 1814.0, cbsa: 33140.0, undergrad_ft: 1, undergrad_pt: 1, grad_ft: 1,'

[6] 'itude: -85.8814092, latitude: 41.026897, student_faculty_ratio:

In [None]:
import re
import numpy as np
import pandas as pd

# These are the 8 fields the LLM is clearly generating
llm_core_cols = (
    ["longitude", "latitude", "student_faculty_ratio", "headcount", "cbsa"]
    + ["enrolled_undergrad_fulltime", "enrolled_undergrad_parttime"]
    + ["enrolled_graduate_fulltime"]
)


def parse_llm_core(text):
    """Extract the leading numeric fields of one generated row.

    Returns the first ``len(llm_core_cols)`` numbers found in ``text`` as
    floats, in order of appearance, or ``None`` when the text holds fewer
    numbers than that. Values are matched by position, not by field label.
    """
    wanted = len(llm_core_cols)
    # Signed integers / decimals; every match contains at least one digit.
    found = re.findall(r"[-+]?\d*\.?\d+", text)
    if len(found) < wanted:
        return None
    return [float(token) for token in found[:wanted]]

# Drop generations that do not contain all core numeric fields.
parsed_core = (parse_llm_core(text) for text in synthetic_rows)
records = [row for row in parsed_core if row is not None]

df_llm_core = pd.DataFrame(records, columns=llm_core_cols)
print("Parsed LLM rows:", df_llm_core.shape)
df_llm_core.head()


Parsed LLM rows: (2957, 8)


Unnamed: 0,longitude,latitude,student_faculty_ratio,headcount,cbsa,enrolled_undergrad_fulltime,enrolled_undergrad_parttime,enrolled_graduate_fulltime
0,-88.94217,42.792472,13.0,2689.0,41080.0,1.0,1.0,1.0
1,-81.804582,38.168051,11.0,1128.0,33460.0,1.0,1.0,1.0
2,-66.686875,40.397467,13.0,2689.0,24140.0,1.0,1.0,1.0
3,-82.246924,37.620123,11.0,1546.0,10540.0,1.0,1.0,1.0
4,-94.96746,33.023522,14.0,889.0,36620.0,1.0,1.0,0.0


In [None]:
# Make sure ipeds_train is already loaded:
# ipeds_train = pd.read_csv(f"{DATA_DIR}/ipeds_train.csv")

# Columns the LLM rows are missing; they are filled from the real table.
needed_cols = ["region", "sector", "completers"]
assert set(needed_cols).issubset(ipeds_train.columns)

# Draw one real (region, sector, completers) triple per parsed LLM row, with
# replacement and a fixed seed.
# NOTE(review): these triples come from real rows sampled independently of the
# generated features and are joined purely by position — confirm this is
# acceptable for the TSTR evaluation, since the target is then real data that
# is unrelated to the synthetic features it sits next to.
n_rows = len(df_llm_core)
sampled = (
    ipeds_train.loc[:, needed_cols]
    .sample(n=n_rows, replace=True, random_state=42)
    .reset_index(drop=True)
)

# Both sides carry a fresh 0..n-1 index, so the column-wise concat lines up.
df_llm_full = pd.concat(
    [df_llm_core.reset_index(drop=True), sampled],
    axis=1,
)

print("Final LLM synthetic df shape:", df_llm_full.shape)
df_llm_full.head()


Final LLM synthetic df shape: (2957, 11)


Unnamed: 0,longitude,latitude,student_faculty_ratio,headcount,cbsa,enrolled_undergrad_fulltime,enrolled_undergrad_parttime,enrolled_graduate_fulltime,region,sector,completers
0,-88.94217,42.792472,13.0,2689.0,41080.0,1.0,1.0,1.0,Mid East: DE DC MD NJ NY and PA,Private not-for-profit four-year or above,7
1,-81.804582,38.168051,11.0,1128.0,33460.0,1.0,1.0,1.0,Far West: AK CA HI NV OR and WA,Public four-year or above,5199
2,-66.686875,40.397467,13.0,2689.0,24140.0,1.0,1.0,1.0,Great Lakes: IL IN MI OH and WI,Private not-for-profit four-year or above,125
3,-82.246924,37.620123,11.0,1546.0,10540.0,1.0,1.0,1.0,Plains: IA KS MN MO NE ND and SD,Private not-for-profit four-year or above,103
4,-94.96746,33.023522,14.0,889.0,36620.0,1.0,1.0,0.0,New England: CT ME MA NH RI and VT,Private not-for-profit four-year or above,1919


In [None]:
# --- Clean the target: numeric dtype, gaps filled with the REAL median ---
median_comp = ipeds_train["completers"].median()
df_llm_full["completers"] = pd.to_numeric(
    df_llm_full["completers"], errors="coerce"
).fillna(median_comp)

# --- Persist the cleaned table next to the other synthetic outputs ---
llm_path = f"{OUT_DIR}/synthetic_llm.csv"
df_llm_full.to_csv(llm_path, index=False)

print("Saved cleaned LLM synthetic data →", llm_path)

df_llm_full.head()


Saved cleaned LLM synthetic data → /content/drive/MyDrive/dissertation/outputs/ipeds_5g_synth/synthetic_llm.csv


Unnamed: 0,longitude,latitude,student_faculty_ratio,headcount,cbsa,enrolled_undergrad_fulltime,enrolled_undergrad_parttime,enrolled_graduate_fulltime,region,sector,completers
0,-88.94217,42.792472,13.0,2689.0,41080.0,1.0,1.0,1.0,Mid East: DE DC MD NJ NY and PA,Private not-for-profit four-year or above,7
1,-81.804582,38.168051,11.0,1128.0,33460.0,1.0,1.0,1.0,Far West: AK CA HI NV OR and WA,Public four-year or above,5199
2,-66.686875,40.397467,13.0,2689.0,24140.0,1.0,1.0,1.0,Great Lakes: IL IN MI OH and WI,Private not-for-profit four-year or above,125
3,-82.246924,37.620123,11.0,1546.0,10540.0,1.0,1.0,1.0,Plains: IA KS MN MO NE ND and SD,Private not-for-profit four-year or above,103
4,-94.96746,33.023522,14.0,889.0,36620.0,1.0,1.0,0.0,New England: CT ME MA NH RI and VT,Private not-for-profit four-year or above,1919
