### EduSpend CP#30: Global Higher-Education Cost Analytics & Planning
Model Comparison Provided by Cursor

This code compares the performance of different models (Linear Reg, Random Forest, SVR, LightGBM, XGBoost)  
on a dataset related to global higher-education costs, using Mean Squared Error and R^2 Score.  
The dataset is assumed to be preprocessed and available as 'edu_cost_data'.

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [3]:
# Load the dataset, Total Cost of Attendance
edu_cost_data = pd.read_csv('../data/TCA_no_outliers.csv')

In [4]:
# Split the data into features and target variable
X = edu_cost_data.drop('TCA', axis=1)
y = edu_cost_data['TCA']

In [5]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

*Feature Engineering: Low, Medium, High TCA*

In [6]:
# Create TCA tiers based on training data quantiles
tca_quantiles = y_train.quantile([0.33, 0.67])

In [7]:
# Define tier boundaries based on training data quantiles
train_tca_tiers = pd.cut(y_train, bins=[0, tca_quantiles[0.33], tca_quantiles[0.67], float('inf')],
                     labels=['Low', 'Medium', 'High'])
test_tca_tiers = pd.cut(y_test, bins=[0, tca_quantiles[0.33], tca_quantiles[0.67], float('inf')],
                    labels=['Low', 'Medium', 'High'])

print(f"TCA Tier Boundaries (based on training data):")
print(f"Low, 33rd percentile: < ${tca_quantiles[0.33]:,.0f}")
print(f"Medium, 33rd - 67th percentile: ${tca_quantiles[0.33]:,.0f} - ${tca_quantiles[0.67]:,.0f}")
print(f"High, 67th percentile: > ${tca_quantiles[0.67]:,.0f}")


TCA Tier Boundaries (based on training data):
Low, 33rd percentile: < $27,008
Medium, 33rd - 67th percentile: $27,008 - $62,960
High, 67th percentile: > $62,960


In [8]:
# Add tiers to your feature sets
X_train['TCA_Tier'] = train_tca_tiers
X_test['TCA_Tier'] = test_tca_tiers

*Encoding*

In [9]:
# Convert TCA tiers to numerical order
# Define categories in the correct order: 0, 1, 2
categories = [['Low', 'Medium', 'High']]  # Note: nested list

oe = OrdinalEncoder(categories=categories)
X_train['TCA_Tier_Encoded'] = oe.fit_transform(X_train[['TCA_Tier']])
X_test['TCA_Tier_Encoded'] = oe.transform(X_test[['TCA_Tier']])


In [10]:
# One hot encode Level (degree) types
level_dummies = pd.get_dummies(X_train['Level'], prefix='Level')
X_train = pd.concat([X_train, level_dummies], axis=1)

level_dummies_test = pd.get_dummies(X_test['Level'], prefix='Level')
X_test = pd.concat([X_test, level_dummies_test], axis=1)

In [11]:
# Using Target Encoding due to high number of unique universities, cities, and countries
from category_encoders import TargetEncoder

categorical_cols = ['City', 'University', 'Country', 'Program']
encoder = TargetEncoder(cols=categorical_cols)

# Fit on training data only
X_train_encoded = encoder.fit_transform(X_train[categorical_cols], y_train)
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Add encoded columns
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# Drop original categorical columns
X_train = X_train.drop(categorical_cols, axis=1)
X_test = X_test.drop(categorical_cols, axis=1)

In [None]:
# Drop remaining categorical columns
X_train = X_train.drop(columns=['Level', 'TCA_Tier'])
X_test = X_test.drop(columns=['Level', 'TCA_Tier'])


*Scaling*

In [15]:
# Identify columns to scale (exclude one-hot and ordinal encoded)
cols_to_scale = X_train.select_dtypes(include=['float64', 'int64']).columns
cols_to_scale = cols_to_scale.drop(['TCA_Tier_Encoded'])

scaler = StandardScaler()
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

*Models*

In [25]:
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Regressor': SVR(C=100, epsilon=0.1, kernel='linear', gamma='auto'),
    'LightGBM Regressor ': LGBMRegressor(verbose=-1),
    'XGBoost Regressor': XGBRegressor()
}

In [29]:
# Train and evaluate each model
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    results[name] = {
        'Mean Squared Error': mse,
        'R^2 Score': r2
    }

# Display results
results_df = pd.DataFrame(results).T
print(results_df)


                          Mean Squared Error  R^2 Score
Linear Regression               3.361692e+07   0.965518
Random Forest                   6.157302e+06   0.993684
Support Vector Regressor        4.595696e+07   0.952861
LightGBM Regressor              7.967424e+06   0.991828
XGBoost Regressor               4.416544e+06   0.995470
