In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from scipy.stats import permutation_test, f_oneway
from sklearn.neural_network import MLPRegressor

In [2]:
train = pd.read_csv('DataCleaning/train.csv')
train = train.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime'])

train['race_condensed'] = train['race_condensed'].fillna('Missing')

In [3]:
train['tokenized_cp'] = train['chiefcomplaint'].str.lower().str.split()

In [4]:
word2vec_model = Word2Vec(sentences=train["tokenized_cp"], vector_size=100, window=5, min_count=1, workers=4)

In [5]:
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
        self.vector_size = model.vector_size

    def fit(self, X, y=None):
        return self  # No fitting needed

    def transform(self, X):
        return np.array([self._get_sentence_embedding(words) for words in X])

    def _get_sentence_embedding(self, words):
        vectors = [self.model.wv[word] for word in words if word in self.model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(self.vector_size)

In [6]:
text_pipeline = FeatureUnion([
    ("tfidf", TfidfVectorizer()),  # TF-IDF feature extraction
    ("word2vec", Word2VecTransformer(model=word2vec_model))  # Word2Vec feature extraction
])

In [20]:
train['race_condensed'] = train['race_condensed'].fillna('Missing')

# numeric_vars = ['admission_age', 'temperature', 'heartrate', 'resprate', 'o2sat', 
#                 'sbp', 'dbp', 'acuity', 'stay_length_minutes', 'pain_cleaned_advanced']
numeric_vars = ['admission_age', 'temperature', 'heartrate', 'resprate', 'o2sat', 
                'sbp', 'dbp', 'acuity', 'pain_cleaned_advanced']
categorical_vars = ['gender', 'arrival_transport', 'race_condensed']

numeric = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=100, random_state=2025)),
    ('scaler', StandardScaler())
])

categorical = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

impute_standardize = ColumnTransformer(
    transformers=[
        ('num', numeric, numeric_vars),
        ('cat', categorical, categorical_vars),
        ("text", TfidfVectorizer(), "chiefcomplaint"),
        ("word2vec", Word2VecTransformer(model=word2vec_model), "chiefcomplaint")

    ])
# model = Pipeline(steps=[("pre", impute_standardize), ("text", FeatureUnion([
#             ("tfidf", TfidfVectorizer()),  # TF-IDF vectorization
#             ("word2vec", Word2VecTransformer(model=word2vec_model))  # Word2Vec embeddings
#         ]), "chiefcomplaint"), ("model", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42))])
model = Pipeline(steps=[("pre", impute_standardize), ("model", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42))])

In [21]:
train.head(5)

Unnamed: 0,gender,arrival_transport,admission_age,temperature,heartrate,resprate,o2sat,sbp,dbp,acuity,...,cc_91,cc_92,cc_93,cc_94,cc_95,cc_96,cc_97,cc_98,cc_99,tokenized_cp
0,M,WALK IN,63.0,98.4,95.0,,99.0,146.0,73.0,3.0,...,0.080886,-0.002972,-0.030255,-0.025612,-0.047326,-0.055742,0.043237,-0.115148,0.012685,[headache]
1,M,AMBULANCE,72.0,97.8,84.0,20.0,99.0,138.0,91.0,2.0,...,0.03456,0.037256,0.639393,-0.359927,-0.313329,0.309991,0.262597,0.284354,0.280713,"[palps, resolved]"
2,F,AMBULANCE,75.0,98.0,72.0,,96.0,133.0,62.0,2.0,...,-0.104483,-0.034958,-0.203249,-0.079202,0.032316,0.127788,-0.032426,0.01115,0.192132,"[s/p, fall,, sdh,, r, thumb, injury]"
3,M,UNKNOWN,60.0,97.7,94.0,,100.0,149.0,100.0,3.0,...,-0.191968,-0.11196,-0.013973,0.037628,0.029144,-0.02442,0.042527,0.045887,-0.045725,"[r, elbow, pain]"
4,M,WALK IN,49.0,98.0,86.0,,97.0,151.0,74.0,3.0,...,-0.042773,0.016004,0.009657,-0.158851,0.046155,-0.19999,-0.006561,0.086824,-0.141093,[n/v]


In [22]:
X = train.drop(columns=(['stay_length_minutes', 'tokenized_cp'] + [f"cc_{i}" for i in range(100)]))

In [10]:
X

Unnamed: 0,gender,arrival_transport,admission_age,temperature,heartrate,resprate,o2sat,sbp,dbp,acuity,chiefcomplaint,pain_cleaned_advanced,race_condensed
0,M,WALK IN,63.0,98.4,95.0,,99.0,146.0,73.0,3.0,Headache,7.0,BLACK
1,M,AMBULANCE,72.0,97.8,84.0,20.0,99.0,138.0,91.0,2.0,PALPS RESOLVED,0.0,White
2,F,AMBULANCE,75.0,98.0,72.0,,96.0,133.0,62.0,2.0,"s/p Fall, SDH, R Thumb injury",2.0,White
3,M,UNKNOWN,60.0,97.7,94.0,,100.0,149.0,100.0,3.0,R Elbow pain,0.0,White
4,M,WALK IN,49.0,98.0,86.0,,97.0,151.0,74.0,3.0,N/V,0.0,BLACK
...,...,...,...,...,...,...,...,...,...,...,...,...,...
368970,M,WALK IN,74.0,98.2,95.0,,95.0,119.0,69.0,3.0,DYSPNEA/COUGH,3.0,OTHER
368971,M,AMBULANCE,77.0,103.0,100.0,20.0,100.0,174.0,125.0,1.0,MS CHANGES,10.0,White
368972,M,UNKNOWN,65.0,,107.0,20.0,99.0,124.0,57.0,1.0,"Transfer, SDH",,Missing
368973,F,AMBULANCE,76.0,98.6,93.0,,95.0,90.0,55.0,2.0,"Fever, Transfer",0.0,White


In [11]:
y = train['stay_length_minutes']

In [12]:
X

Unnamed: 0,gender,arrival_transport,admission_age,temperature,heartrate,resprate,o2sat,sbp,dbp,acuity,chiefcomplaint,pain_cleaned_advanced,race_condensed
0,M,WALK IN,63.0,98.4,95.0,,99.0,146.0,73.0,3.0,Headache,7.0,BLACK
1,M,AMBULANCE,72.0,97.8,84.0,20.0,99.0,138.0,91.0,2.0,PALPS RESOLVED,0.0,White
2,F,AMBULANCE,75.0,98.0,72.0,,96.0,133.0,62.0,2.0,"s/p Fall, SDH, R Thumb injury",2.0,White
3,M,UNKNOWN,60.0,97.7,94.0,,100.0,149.0,100.0,3.0,R Elbow pain,0.0,White
4,M,WALK IN,49.0,98.0,86.0,,97.0,151.0,74.0,3.0,N/V,0.0,BLACK
...,...,...,...,...,...,...,...,...,...,...,...,...,...
368970,M,WALK IN,74.0,98.2,95.0,,95.0,119.0,69.0,3.0,DYSPNEA/COUGH,3.0,OTHER
368971,M,AMBULANCE,77.0,103.0,100.0,20.0,100.0,174.0,125.0,1.0,MS CHANGES,10.0,White
368972,M,UNKNOWN,65.0,,107.0,20.0,99.0,124.0,57.0,1.0,"Transfer, SDH",,Missing
368973,F,AMBULANCE,76.0,98.6,93.0,,95.0,90.0,55.0,2.0,"Fever, Transfer",0.0,White


In [23]:
model.fit(X, y)

In [24]:
y_pred = model.predict(X)

In [25]:
r2_score(y, y_pred)

0.17679947936796003

In [26]:
np.sqrt(mean_squared_error(y, y_pred))

356.80365714344583

In [None]:
pd.Series(np.abs(y - y_pred)).describe()

count    370197.000000
mean        224.565150
std         300.724702
min           0.000126
25%          74.263515
50%         154.349209
75%         261.008386
max        8300.554503
Name: stay_length_minutes, dtype: float64

In [29]:
test = pd.read_csv('DataCleaning/test.csv')
test = test.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime'])

test['race_condensed'] = test['race_condensed'].fillna('Missing')

In [30]:
X_test = test.drop(columns=['stay_length_minutes'])
y_test  = test['stay_length_minutes']
y_pred_test = model.predict(X_test)

In [31]:
np.sqrt(mean_squared_error(y_test, y_pred_test))

368.1487395029899

In [32]:
r2_score(y_true=y_test, y_pred=y_pred_test)

0.14904018720791146

In [None]:
train

Unnamed: 0,gender,arrival_transport,admission_age,temperature,heartrate,resprate,o2sat,sbp,dbp,acuity,stay_length_hours,stay_length_minutes,pain_cleaned_advanced,race_condensed
0,F,WALK IN,66.0,97.2,67.0,18.0,100.0,192.0,93.0,3.0,15.333333,920.0,0.0,BLACK
1,F,AMBULANCE,77.0,98.0,60.0,16.0,100.0,142.0,48.0,2.0,3.650000,219.0,,BLACK
2,F,WALK IN,47.0,97.8,85.0,18.0,100.0,126.0,81.0,3.0,7.166667,430.0,9.0,BLACK
3,M,WALK IN,67.0,97.3,110.0,18.0,98.0,132.0,52.0,2.0,3.716667,223.0,10.0,White
4,F,WALK IN,25.0,99.0,77.0,20.0,100.0,132.0,78.0,3.0,3.266667,196.0,3.0,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370192,M,WALK IN,61.0,98.2,127.0,18.0,100.0,173.0,88.0,3.0,3.783333,227.0,8.0,White
370193,F,AMBULANCE,87.0,,80.0,18.0,96.0,154.0,69.0,3.0,8.800000,528.0,8.0,White
370194,F,AMBULANCE,28.0,98.5,80.0,18.0,99.0,113.0,61.0,4.0,20.016667,1201.0,0.0,White
370195,M,WALK IN,46.0,98.6,82.0,18.0,99.0,105.0,65.0,4.0,3.433333,206.0,8.0,BLACK


In [None]:
# group1 = train[train['gender'] == "F"]['stay_length_minutes']
# group2 = train[train['gender'] == "M"]['stay_length_minutes']

In [None]:
group1 = train[train['gender'] == "F"]['stay_length_minutes']
group2 = train[train['gender'] == "M"]['stay_length_minutes']

In [None]:
def statistic(x, y):
    return np.mean(x) - np.mean(y)

# Perform permutation test
result = permutation_test(
    (group1, group2), 
    statistic,
    permutation_type='independent',  # For independent samples
    n_resamples=500,               # Number of permutations
    alternative='two-sided'          # Test direction
)

print(f"Observed Statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

Observed Statistic: 0.7621521364051205
P-value: 0.562874251497006


In [None]:
groups = [train[train['arrival_transport'] == cat]['stay_length_minutes'].values for cat in train['arrival_transport'].unique()]

# Define the F-statistic as the test statistic
def f_statistic(*groups):
    return f_oneway(*groups).statistic

# Perform permutation test
result = permutation_test(
    groups,
    f_statistic,
    permutation_type='independent', 
    n_resamples=500,              # Adjust based on computational limits
    alternative='greater'            # ANOVA is one-tailed (test for larger F)
)

In [None]:
print(f"Observed F-statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

Observed F-statistic: 2047.824460047073
P-value: 0.001996007984031936
