In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from scipy.stats import permutation_test, f_oneway
from sklearn.neural_network import MLPRegressor

In [2]:
train = pd.read_csv('../DataCleaning/train.csv')
train = train.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime'])

train['race_condensed'] = train['race_condensed'].fillna('Missing')

In [3]:
train['tokenized_cp'] = train['chiefcomplaint'].str.lower().str.split()

In [4]:
word2vec_model = Word2Vec(sentences=train["tokenized_cp"], vector_size=100, window=5, min_count=1, workers=4)

In [5]:
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
        self.vector_size = model.vector_size

    def fit(self, X, y=None):
        return self  # No fitting needed

    def transform(self, X):
        return np.array([self._get_sentence_embedding(words) for words in X])

    def _get_sentence_embedding(self, words):
        vectors = [self.model.wv[word] for word in words if word in self.model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(self.vector_size)

In [6]:
# text_pipeline = FeatureUnion([
#     ("tfidf", TfidfVectorizer()),  # TF-IDF feature extraction
#     ("word2vec", Word2VecTransformer(model=word2vec_model))  # Word2Vec feature extraction
# ])
# text_pipeline = FeatureUnion([  # TF-IDF feature extraction
#     ("word2vec", Word2VecTransformer(model=word2vec_model))  # Word2Vec feature extraction
# ])
# text_pipeline = FeatureUnion([
#     ("tfidf", TfidfVectorizer())  # Word2Vec feature extraction
# ])

In [7]:
# Define feature lists
cc_vars = [f"cc_{i}" for i in range(100)]  # These should be passed unchanged
numeric_vars = ['admission_age', 'temperature', 'heartrate', 'resprate', 'o2sat', 
                'sbp', 'dbp', 'acuity', 'pain_cleaned_advanced']
categorical_vars = ['gender', 'arrival_transport', 'race_condensed']

# Define preprocessing pipelines
numeric_pipeline = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=100, random_state=2025)),
    ('scaler', StandardScaler())  # Standardize only selected numeric variables
])

cc_pipeline = FunctionTransformer(lambda x: x, validate=False)  # Pass through unchanged

categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Define ColumnTransformer
# impute_standardize = ColumnTransformer(
#     transformers=[
#         ('num', numeric_pipeline, numeric_vars),  # Standardize selected numeric features
#         ('cc', cc_pipeline, cc_vars),  # Pass `cc_*` variables unchanged
#         ('cat', categorical_pipeline, categorical_vars),
#         ("text", TfidfVectorizer(), "chiefcomplaint"),
#         ("word2vec", Word2VecTransformer(model=word2vec_model), "chiefcomplaint")
#     ]
# )
impute_standardize = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_vars),
        ('cat', categorical_pipeline, categorical_vars),
        ("text", TfidfVectorizer(), "chiefcomplaint"),
        ("word2vec", Word2VecTransformer(model=word2vec_model), "chiefcomplaint")
    ]
)
model = Pipeline(steps=[("pre", impute_standardize), ("model", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42))])
# model = Pipeline(steps=[("pre", impute_standardize), ("model", LinearRegression())])
# model = Pipeline(steps=[("pre", impute_standardize), ("model", Ridge())])

In [8]:
train['race_condensed'] = train['race_condensed'].fillna('Missing')

# numeric_vars = ['admission_age', 'temperature', 'heartrate', 'resprate', 'o2sat', 
#                 'sbp', 'dbp', 'acuity', 'stay_length_minutes', 'pain_cleaned_advanced']
numeric_vars = ['admission_age', 'temperature', 'heartrate', 'resprate', 'o2sat', 
                'sbp', 'dbp', 'acuity', 'pain_cleaned_advanced'] + [f"cc_{i}" for i in range(100)]
categorical_vars = ['gender', 'arrival_transport', 'race_condensed']

numeric = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=100, random_state=2025)),
    ('scaler', StandardScaler())
])

categorical = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

impute_standardize = ColumnTransformer(
    transformers=[
        ('num', numeric, numeric_vars),
        ('cat', categorical, categorical_vars),
        ("text", TfidfVectorizer(), "chiefcomplaint"),
        ("word2vec", Word2VecTransformer(model=word2vec_model), "chiefcomplaint")

    ])
impute_standardize = ColumnTransformer(
    transformers=[
        ('num', numeric, numeric_vars),
        ('cat', categorical, categorical_vars)

    ])
# model = Pipeline(steps=[("pre", impute_standardize), ("text", FeatureUnion([
#             ("tfidf", TfidfVectorizer()),  # TF-IDF vectorization
#             ("word2vec", Word2VecTransformer(model=word2vec_model))  # Word2Vec embeddings
#         ]), "chiefcomplaint")])
# # model = Pipeline(steps=[("pre", impute_standardize), ("model", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42))])
# model = Pipeline(steps=[("pre", impute_standardize), ("model", LinearRegression())])
# # model = Pipeline(steps=[("pre", impute_standardize), ("model", Ridge())])

In [9]:
# X = train.drop(columns=(['stay_length_minutes', 'tokenized_cp'] + [f"cc_{i}" for i in range(100)]))
X = train.drop(columns=(['stay_length_minutes', 'tokenized_cp', 'chiefcomplaint']))

In [None]:
import torch
for col in X.select_dtypes(include=['float64', 'int64']).columns:
    X[col].fillna(X[col].mean(), inplace=True)

for col in X.select_dtypes(include='object').columns:
    X[col].fillna(X[col].mode()[0], inplace=True)

df_encoded = pd.get_dummies(X, columns=['gender', 'arrival_transport', 'race_condensed'])
# 
X_tensor = torch.tensor(df_encoded.astype(float).values, dtype=torch.float)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mode()[0], inplace=True)


In [11]:
print(X_tensor.size())

torch.Size([368975, 124])


In [12]:
y = train['stay_length_minutes']
y=y.to_numpy()
y_tensor = torch.from_numpy(y)
print(y_tensor.size())

torch.Size([368975])


In [13]:
test = pd.read_csv('../DataCleaning/test.csv')
test = test.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime'])

test['race_condensed'] = test['race_condensed'].fillna('Missing')

In [14]:
# X_test = test.drop(columns=(['stay_length_minutes'] + [f"cc_{i}" for i in range(100)]))
X_test = test.drop(columns=(['stay_length_minutes', 'chiefcomplaint']))
y_test  = test['stay_length_minutes']


In [None]:
for col in X_test.select_dtypes(include=['float64', 'int64']).columns:
    X_test[col].fillna(X_test[col].mean(), inplace=True)

for col in X_test.select_dtypes(include='object').columns:
    X_test[col].fillna(X_test[col].mode()[0], inplace=True)

df_encoded = pd.get_dummies(X_test, columns=['gender', 'arrival_transport', 'race_condensed'])
# 
X_test_tensor = torch.tensor(df_encoded.astype(float).values, dtype=torch.float)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(X_test[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(X_test[col].mode()[0], inplace=True)


In [16]:
print(X_test_tensor.size())

torch.Size([40998, 124])


In [17]:
y_test=y_test.to_numpy()
y_test_tensor = torch.from_numpy(y_test)
print(y_test_tensor.size())

torch.Size([40998])


In [None]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dims):
        super(MLP, self).__init__()
        layers = []

        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.ReLU())
            prev_dim = hidden_dim

        layers.append(nn.Linear(prev_dim, 1))  # Final regression output
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


In [None]:
def rmse_loss(pred, target):
    return torch.sqrt(torch.mean((pred - target) ** 2))

# Make sure y is of shape [n, 1]
if y_tensor.ndim == 1:
    y_tensor = y_tensor.unsqueeze(1)
if y_test_tensor.ndim == 1:
    y_test_tensor = y_test_tensor.unsqueeze(1)

from torch.utils.data import TensorDataset, DataLoader
batch_size = 512
train_dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

input_dim = X_tensor.shape[1]
hidden_dims = [512, 512, 512, 512, 512, 512]
model = MLP(input_dim=X_tensor.shape[1], hidden_dims=hidden_dims)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        pred = model(batch_X)
        loss = rmse_loss(pred, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * batch_X.size(0)
    
    avg_train_loss = total_loss / len(train_loader.dataset)

    model.eval()
    with torch.no_grad():
        test_preds = model(X_test_tensor)
        test_loss = rmse_loss(test_preds, y_test_tensor).item()

    if (epoch + 1) % 1 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}/{num_epochs} | Train RMSE: {avg_train_loss:.4f} | Test RMSE: {test_loss:.4f}")

Epoch 1/100 | Train RMSE: 375.7097 | Test RMSE: 378.8504
Epoch 2/100 | Train RMSE: 366.1204 | Test RMSE: 370.6698
Epoch 3/100 | Train RMSE: 364.5161 | Test RMSE: 377.2680
Epoch 4/100 | Train RMSE: 363.3368 | Test RMSE: 369.1485
Epoch 5/100 | Train RMSE: 362.4412 | Test RMSE: 369.0166
Epoch 6/100 | Train RMSE: 361.9067 | Test RMSE: 367.9326
Epoch 7/100 | Train RMSE: 361.0014 | Test RMSE: 368.1744
Epoch 8/100 | Train RMSE: 361.2767 | Test RMSE: 367.1302
Epoch 9/100 | Train RMSE: 360.6335 | Test RMSE: 369.0350
Epoch 10/100 | Train RMSE: 359.9389 | Test RMSE: 367.9157
Epoch 11/100 | Train RMSE: 360.2449 | Test RMSE: 368.7642
Epoch 12/100 | Train RMSE: 359.9060 | Test RMSE: 367.3842
Epoch 13/100 | Train RMSE: 359.8822 | Test RMSE: 367.7858
Epoch 14/100 | Train RMSE: 359.3370 | Test RMSE: 367.0089
Epoch 15/100 | Train RMSE: 359.2438 | Test RMSE: 366.0707
Epoch 16/100 | Train RMSE: 359.3835 | Test RMSE: 368.3567
Epoch 17/100 | Train RMSE: 358.8305 | Test RMSE: 367.0146
Epoch 18/100 | Train RM