In [150]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from scipy.stats import permutation_test, f_oneway
from sklearn.neural_network import MLPRegressor

In [151]:
train = pd.read_csv('DataCleaning/train.csv')
train = train.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime'])

train['race_condensed'] = train['race_condensed'].fillna('Missing')

In [152]:
train['tokenized_cp'] = train['chiefcomplaint'].str.lower().str.split()

In [153]:
word2vec_model = Word2Vec(sentences=train["tokenized_cp"], vector_size=100, window=5, min_count=1, workers=4)

In [154]:
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
        self.vector_size = model.vector_size

    def fit(self, X, y=None):
        return self  # No fitting needed

    def transform(self, X):
        return np.array([self._get_sentence_embedding(words) for words in X])

    def _get_sentence_embedding(self, words):
        vectors = [self.model.wv[word] for word in words if word in self.model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(self.vector_size)

In [77]:
# text_pipeline = FeatureUnion([
#     ("tfidf", TfidfVectorizer()),  # TF-IDF feature extraction
#     ("word2vec", Word2VecTransformer(model=word2vec_model))  # Word2Vec feature extraction
# ])
# text_pipeline = FeatureUnion([  # TF-IDF feature extraction
#     ("word2vec", Word2VecTransformer(model=word2vec_model))  # Word2Vec feature extraction
# ])
# text_pipeline = FeatureUnion([
#     ("tfidf", TfidfVectorizer())  # Word2Vec feature extraction
# ])

In [221]:
# Define feature lists
cc_vars = [f"cc_{i}" for i in range(100)]  # These should be passed unchanged
numeric_vars = ['admission_age', 'temperature', 'heartrate', 'resprate', 'o2sat', 
                'sbp', 'dbp', 'acuity', 'pain_cleaned_advanced']
categorical_vars = ['gender', 'arrival_transport', 'race_condensed']

# Define preprocessing pipelines
numeric_pipeline = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=100, random_state=2025)),
    ('scaler', StandardScaler())  # Standardize only selected numeric variables
])

cc_pipeline = FunctionTransformer(lambda x: x, validate=False)  # Pass through unchanged

categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Define ColumnTransformer
# impute_standardize = ColumnTransformer(
#     transformers=[
#         ('num', numeric_pipeline, numeric_vars),  # Standardize selected numeric features
#         ('cc', cc_pipeline, cc_vars),  # Pass `cc_*` variables unchanged
#         ('cat', categorical_pipeline, categorical_vars),
#         ("text", TfidfVectorizer(), "chiefcomplaint"),
#         ("word2vec", Word2VecTransformer(model=word2vec_model), "chiefcomplaint")
#     ]
# )
impute_standardize = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_vars),
        ('cat', categorical_pipeline, categorical_vars),
        ("text", TfidfVectorizer(), "chiefcomplaint"),
        ("word2vec", Word2VecTransformer(model=word2vec_model), "chiefcomplaint")
    ]
)
model = Pipeline(steps=[("pre", impute_standardize), ("model", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42))])
# model = Pipeline(steps=[("pre", impute_standardize), ("model", LinearRegression())])
# model = Pipeline(steps=[("pre", impute_standardize), ("model", Ridge())])

In [144]:
# train['race_condensed'] = train['race_condensed'].fillna('Missing')

# # numeric_vars = ['admission_age', 'temperature', 'heartrate', 'resprate', 'o2sat', 
# #                 'sbp', 'dbp', 'acuity', 'stay_length_minutes', 'pain_cleaned_advanced']
# numeric_vars = ['admission_age', 'temperature', 'heartrate', 'resprate', 'o2sat', 
#                 'sbp', 'dbp', 'acuity', 'pain_cleaned_advanced'] + [f"cc_{i}" for i in range(100)]
# categorical_vars = ['gender', 'arrival_transport', 'race_condensed']

# numeric = Pipeline(steps=[
#     ('imputer', IterativeImputer(max_iter=100, random_state=2025)),
#     ('scaler', StandardScaler())
# ])

# categorical = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# impute_standardize = ColumnTransformer(
#     transformers=[
#         ('num', numeric, numeric_vars),
#         ('cat', categorical, categorical_vars),
#         ("text", TfidfVectorizer(), "chiefcomplaint"),
#         ("word2vec", Word2VecTransformer(model=word2vec_model), "chiefcomplaint")

#     ])
# impute_standardize = ColumnTransformer(
#     transformers=[
#         ('num', numeric, numeric_vars),
#         ('cat', categorical, categorical_vars)

#     ])
# # model = Pipeline(steps=[("pre", impute_standardize), ("text", FeatureUnion([
# #             ("tfidf", TfidfVectorizer()),  # TF-IDF vectorization
# #             ("word2vec", Word2VecTransformer(model=word2vec_model))  # Word2Vec embeddings
# #         ]), "chiefcomplaint"), ("model", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42))])
# # model = Pipeline(steps=[("pre", impute_standardize), ("model", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42))])
# model = Pipeline(steps=[("pre", impute_standardize), ("model", LinearRegression())])
# # model = Pipeline(steps=[("pre", impute_standardize), ("model", Ridge())])

In [188]:
# X = train.drop(columns=(['stay_length_minutes', 'tokenized_cp'] + [f"cc_{i}" for i in range(100)]))
X = train.drop(columns=(['stay_length_minutes', 'tokenized_cp']))

In [189]:
y = train['stay_length_minutes']

In [222]:
model.fit(X, y)

In [223]:
y_pred = model.predict(X)

In [46]:
r2_score(y, y_pred)

0.15016780231584448

In [224]:
np.sqrt(mean_squared_error(y, y_pred))

356.70818844030697

In [None]:
pd.Series(np.abs(y - y_pred)).describe()

count    370197.000000
mean        224.565150
std         300.724702
min           0.000126
25%          74.263515
50%         154.349209
75%         261.008386
max        8300.554503
Name: stay_length_minutes, dtype: float64

In [192]:
test = pd.read_csv('DataCleaning/test.csv')
test = test.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime'])

test['race_condensed'] = test['race_condensed'].fillna('Missing')

In [228]:
# X_test = test.drop(columns=(['stay_length_minutes'] + [f"cc_{i}" for i in range(100)]))
X_test = test.drop(columns=(['stay_length_minutes']))
y_test  = test['stay_length_minutes']
y_pred_test = model.predict(X_test)

In [229]:
np.sqrt(mean_squared_error(y_test, y_pred_test))

368.27857666146093

In [20]:
r2_score(y_true=y_test, y_pred=y_pred_test)

0.14370312473917968

In [226]:
np.sqrt(mean_squared_error(y_test, np.array([y.mean()] * y_test.shape[0])))

399.0884562565663

In [None]:
# group1 = train[train['gender'] == "F"]['stay_length_minutes']
# group2 = train[train['gender'] == "M"]['stay_length_minutes']

In [None]:
group1 = train[train['gender'] == "F"]['stay_length_minutes']
group2 = train[train['gender'] == "M"]['stay_length_minutes']

In [None]:
def statistic(x, y):
    return np.mean(x) - np.mean(y)

# Perform permutation test
result = permutation_test(
    (group1, group2), 
    statistic,
    permutation_type='independent',  # For independent samples
    n_resamples=500,               # Number of permutations
    alternative='two-sided'          # Test direction
)

print(f"Observed Statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

Observed Statistic: 0.7621521364051205
P-value: 0.562874251497006


In [None]:
groups = [train[train['arrival_transport'] == cat]['stay_length_minutes'].values for cat in train['arrival_transport'].unique()]

# Define the F-statistic as the test statistic
def f_statistic(*groups):
    return f_oneway(*groups).statistic

# Perform permutation test
result = permutation_test(
    groups,
    f_statistic,
    permutation_type='independent', 
    n_resamples=500,              # Adjust based on computational limits
    alternative='greater'            # ANOVA is one-tailed (test for larger F)
)

In [None]:
print(f"Observed F-statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

Observed F-statistic: 2047.824460047073
P-value: 0.001996007984031936
