In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from scipy.stats import permutation_test, f_oneway
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt

In [2]:
# preprocessing from haley's code
train = pd.read_csv('DataCleaning/train.csv')
train = train.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime'])
train['race_condensed'] = train['race_condensed'].fillna('Missing')

In [3]:
train.shape[0]

368975

In [4]:
# word2vec training

train['tokenized_cp'] = train['chiefcomplaint'].str.lower().str.split()
word2vec_model = Word2Vec(sentences=train["tokenized_cp"], vector_size=100, window=5, min_count=1, workers=4)

In [5]:
# word2vec module

class Word2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
        self.vector_size = model.vector_size

    def fit(self, X, y=None):
        return self 

    def transform(self, X):
        return np.array([self._get_sentence_embedding(words) for words in X])

    def _get_sentence_embedding(self, words):
        vectors = [self.model.wv[word] for word in words if word in self.model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(self.vector_size)

In [6]:
# Define feature lists
cc_vars = [f"cc_{i}" for i in range(100)]  # These should be passed unchanged
triage_physical_features = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']
other_numeric_vars = ['admission_age', 'acuity', 'pain_cleaned_advanced']
categorical_vars = ['gender', 'arrival_transport', 'race_condensed']


## Standard numeric processing (excluding triage features)
numeric_pipeline = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=100, random_state=2025)),
    ('scaler', StandardScaler())  # Standardize only selected numeric variables
])

## Triage feature processing (imputation -> polynomial features -> scaling)
triage_pipeline = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=100, random_state=2025)),  # Impute first
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),  # Generate squared features
    ('scaler', StandardScaler())  # Standardize after polynomial expansion
])

## Categorical feature processing
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

## Pass through `cc_*` features unchanged
cc_pipeline = FunctionTransformer(lambda x: x, validate=False)  

# Define ColumnTransformer
impute_standardize = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, other_numeric_vars),  # Standardize selected numeric features
        ('triage', triage_pipeline, triage_physical_features),  # Apply polynomial expansion to triage features
        ('cc', cc_pipeline, cc_vars),  # Pass `cc_*` variables unchanged
        ('cat', categorical_pipeline, categorical_vars),
        ("text", TfidfVectorizer(), "chiefcomplaint"),
        ("word2vec", Word2VecTransformer(model=word2vec_model), "chiefcomplaint")
    ],
    remainder='drop'  # Drop any unassigned columns
)


# Define the model pipeline
model = Pipeline(steps=[
    ("pre", impute_standardize),
    ("model", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42))
])

In [7]:
# add LLM features to the training data
X = train.drop(columns=(['stay_length_minutes', 'tokenized_cp']))

In [8]:
y = train['stay_length_minutes']

In [9]:
model.fit(X, y)

In [10]:
y_pred = model.predict(X)

In [11]:
r2_score(y, y_pred)

0.18870925960339924

In [12]:
# training error measured by RMSE
np.sqrt(mean_squared_error(y, y_pred))

354.2131976259213

In [13]:
# prepare test data and y_test
test = pd.read_csv('DataCleaning/test.csv')
test = test.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime'])
test['race_condensed'] = test['race_condensed'].fillna('Missing')
X_test = test.drop(columns=(['stay_length_minutes']))
y_test  = test['stay_length_minutes']
y_pred_test = model.predict(X_test)

In [14]:
# test error measured by RMSE
np.sqrt(mean_squared_error(y_test, y_pred_test))

365.2970605661623

In [15]:
np.sqrt(((train['stay_length_minutes'].mean() - y_test)**2).mean())

399.0884562565663

In [16]:
r2_score(y_true=y_test, y_pred=y_pred_test)

0.16217219309822262