In [1]:
import pandas as pd
import yaml
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

Configuration & Data Loading

In [2]:
# Load the project configuration / data dictionary.
# The encoding is pinned to UTF-8 so the file parses the same way on every
# platform; without it, open() falls back to the locale's default encoding.
with open("../data/data.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [3]:
# Scorecard features (X).
# Maps a friendly column alias -> the dotted key that is looked up in
# config['dictionary'] (loaded from data.yaml) to find the raw CSV column.
scorecard_features_map = {
    # Join key shared with the IPEDS table -- must resolve, or the merge fails.
    # (Listed last to keep the original iteration order.)
    **{
        "school_name": "school.name",
        "state": "school.state",
        # Ownership code. NOTE(review): the College Scorecard dictionary lists
        # 1=Public, 2=Private nonprofit, 3=Private for-profit -- confirm.
        "control": "school.ownership",
        "tuition_in_state": "cost.tuition.in_state",
        "sat_avg": "admissions.sat_scores.average.overall",
        "pell_grant_rate": "aid.pell_grant_rate",
        "faculty_salary": "school.faculty_salary",
    },
    "unitid": "id",
}

In [4]:
# Resolve each feature alias to its raw CSV column name via the YAML dictionary.
# Result: {raw CSV column name -> friendly alias}; it is used both to select
# columns when reading the CSV and to rename them afterwards.
use_cols_scorecard = {}
missing_scorecard_keys = []  # (alias, yaml_key) pairs absent from the dictionary
for alias, yaml_key in scorecard_features_map.items():
    if yaml_key in config['dictionary']:
        raw_col = config['dictionary'][yaml_key]['source']
        use_cols_scorecard[raw_col] = alias
    else:
        missing_scorecard_keys.append((alias, yaml_key))

# Fail fast if the join key could not be resolved: without it the later merge
# on 'unitid' cannot work, and the error would surface far from its cause.
if 'unitid' not in use_cols_scorecard.values():
    raise KeyError(
        "Join key 'unitid' could not be resolved from config['dictionary']; "
        "check the 'id' entry in data.yaml."
    )

# Other features are still skipped (best effort), but no longer silently.
if missing_scorecard_keys:
    print(f"Skipped features missing from config['dictionary']: {missing_scorecard_keys}")

In [5]:
# Load College Scorecard: read only the mapped raw columns, then rename them
# to their friendly aliases. Missing-value markers come from the YAML config.
scorecard_raw_columns = list(use_cols_scorecard)
df_sc = pd.read_csv(
    "../data/MERGED2023_24_PP.csv",
    usecols=scorecard_raw_columns,
    na_values=config['null_value'],
).rename(columns=use_cols_scorecard)

In [6]:
# Load IPEDS Admissions Data (For Target)
# Columns: UNITID (Join Key), ADMSSN (Admitted), ENRLT (Enrolled)
# Only these three columns are parsed (usecols) instead of loading the whole
# file and discarding the rest. read_csv returns usecols in file order, so the
# frame is re-indexed with the list to keep the column order fixed.
ipeds_columns = ['UNITID', 'ADMSSN', 'ENRLT']
df_ipeds = pd.read_csv("../data/ADM2024.csv", usecols=ipeds_columns)[ipeds_columns]

In [7]:
# Keep only schools present in BOTH tables (inner join on the institution id).
# Both key columns ('unitid' and 'UNITID') are retained in the result.
df_final = df_sc.merge(
    df_ipeds,
    how='inner',
    left_on='unitid',
    right_on='UNITID',
)

In [8]:
# Target variable: yield = enrolled / admitted.
# An ADMSSN of 0 does not raise here; pandas produces inf/NaN for those rows,
# which the data-cleaning cell removes afterwards.
df_final['YIELD'] = df_final['ENRLT'].div(df_final['ADMSSN'])

In [9]:
# Data cleaning: keep a row only if it passes all three checks.
has_admits = df_final['ADMSSN'] > 0          # zero-admit rows gave inf/NaN yields
yield_in_range = df_final['YIELD'] <= 1.0    # yield above 100% is invalid data
has_target = df_final['YIELD'].notna()       # drop rows with a missing target
df_final = df_final[has_admits & yield_in_range & has_target]

In [None]:
# Feature column groups (aliases from scorecard_features_map).
# Presumably these feed separate numeric / categorical preprocessing steps in
# later cells -- confirm against the rest of the notebook.
numeric_features = [
    'tuition_in_state',
    'sat_avg',
    'pell_grant_rate',
    'faculty_salary',
]
# 'control' is an integer ownership code, but it is treated as a category here.
categorical_features = [
    'state',
    'control',
]