In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from scipy.stats import permutation_test, f_oneway

In [25]:
train = pd.read_csv('DataCleaning/train.csv')
chief_complaint = train[['chiefcomplaint']]
train = train.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime', 'chiefcomplaint'])

train['race_condensed'] = train['race_condensed'].fillna('Missing')

In [3]:
# numeric_vars = ['admission_age', 'temperature', 'heartrate', 'resprate', 'o2sat', 
#                 'sbp', 'dbp', 'acuity', 'stay_length_minutes', 'pain_cleaned_advanced']
numeric_vars = ['admission_age', 'temperature', 'heartrate', 'resprate', 'o2sat', 
                'sbp', 'dbp', 'acuity', 'pain_cleaned_advanced']
# categorical_vars = ['gender', 'arrival_transport', 'race_condensed']
categorical_vars = ['arrival_transport', 'race_condensed']

numeric = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=100, random_state=2025)),
    ('scaler', StandardScaler())
])

impute_acuity = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=100, random_state=2025))
])

categorical = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

impute_standardize = ColumnTransformer(
    transformers=[
        ('num', numeric, numeric_vars),
        ('passthrough_cols', impute_acuity, ['acuity']),
        ('cat', categorical, categorical_vars)
    ])

In [4]:
model = Pipeline(steps=[("pre", impute_standardize), ("model", LinearRegression())])

In [5]:
train.head()

Unnamed: 0,gender,arrival_transport,admission_age,temperature,heartrate,resprate,o2sat,sbp,dbp,acuity,...,cc_90,cc_91,cc_92,cc_93,cc_94,cc_95,cc_96,cc_97,cc_98,cc_99
0,M,WALK IN,63.0,98.4,95.0,,99.0,146.0,73.0,3.0,...,-0.029966,0.080886,-0.002972,-0.030255,-0.025612,-0.047326,-0.055742,0.043237,-0.115148,0.012685
1,M,AMBULANCE,72.0,97.8,84.0,20.0,99.0,138.0,91.0,2.0,...,-0.019207,0.03456,0.037256,0.639393,-0.359927,-0.313329,0.309991,0.262597,0.284354,0.280713
2,F,AMBULANCE,75.0,98.0,72.0,,96.0,133.0,62.0,2.0,...,-0.118319,-0.104483,-0.034958,-0.203249,-0.079202,0.032316,0.127788,-0.032426,0.01115,0.192132
3,M,UNKNOWN,60.0,97.7,94.0,,100.0,149.0,100.0,3.0,...,-0.050925,-0.191968,-0.11196,-0.013973,0.037628,0.029144,-0.02442,0.042527,0.045887,-0.045725
4,M,WALK IN,49.0,98.0,86.0,,97.0,151.0,74.0,3.0,...,0.000102,-0.042773,0.016004,0.009657,-0.158851,0.046155,-0.19999,-0.006561,0.086824,-0.141093


In [6]:
X = train.drop(columns=['stay_length_minutes'])

In [7]:
y = train['stay_length_minutes']

In [19]:
y_avg = y.mean()
y_avg = np.array([y_avg] * y.shape[0])

In [20]:
root_mean_squared_error(y_true=y, y_pred=y_avg)

393.25701547079234

In [8]:
X

Unnamed: 0,gender,arrival_transport,admission_age,temperature,heartrate,resprate,o2sat,sbp,dbp,acuity,...,cc_90,cc_91,cc_92,cc_93,cc_94,cc_95,cc_96,cc_97,cc_98,cc_99
0,M,WALK IN,63.0,98.4,95.0,,99.0,146.0,73.0,3.0,...,-0.029966,0.080886,-0.002972,-0.030255,-0.025612,-0.047326,-0.055742,0.043237,-0.115148,0.012685
1,M,AMBULANCE,72.0,97.8,84.0,20.0,99.0,138.0,91.0,2.0,...,-0.019207,0.034560,0.037256,0.639393,-0.359927,-0.313329,0.309991,0.262597,0.284354,0.280713
2,F,AMBULANCE,75.0,98.0,72.0,,96.0,133.0,62.0,2.0,...,-0.118319,-0.104483,-0.034958,-0.203249,-0.079202,0.032316,0.127788,-0.032426,0.011150,0.192132
3,M,UNKNOWN,60.0,97.7,94.0,,100.0,149.0,100.0,3.0,...,-0.050925,-0.191968,-0.111960,-0.013973,0.037628,0.029144,-0.024420,0.042527,0.045887,-0.045725
4,M,WALK IN,49.0,98.0,86.0,,97.0,151.0,74.0,3.0,...,0.000102,-0.042773,0.016004,0.009657,-0.158851,0.046155,-0.199990,-0.006561,0.086824,-0.141093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368970,M,WALK IN,74.0,98.2,95.0,,95.0,119.0,69.0,3.0,...,0.049802,-0.115836,0.100021,0.118852,-0.018708,0.022687,0.020013,-0.104315,-0.210426,-0.035530
368971,M,AMBULANCE,77.0,103.0,100.0,20.0,100.0,174.0,125.0,1.0,...,0.190211,0.077263,0.051307,0.098574,-0.111898,0.095966,0.351170,0.233599,-0.132786,-0.170117
368972,M,UNKNOWN,65.0,,107.0,20.0,99.0,124.0,57.0,1.0,...,0.020769,0.119344,-0.099505,0.232883,0.139065,0.162301,0.129753,0.025093,0.058227,0.034400
368973,F,AMBULANCE,76.0,98.6,93.0,,95.0,90.0,55.0,2.0,...,-0.093027,0.020890,-0.082775,-0.154690,0.005136,-0.032273,-0.007780,0.099924,-0.055264,-0.086615


In [9]:
model.fit(X, y)

In [10]:
y_pred = model.predict(X)

In [11]:
r2_score(y, y_pred)

0.04110367874577803

In [16]:
root_mean_squared_error(y, y_pred)

385.09005686731

In [14]:
test = pd.read_csv('DataCleaning/test.csv')
test = test.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime', 'chiefcomplaint'])

test['race_condensed'] = test['race_condensed'].fillna('Missing')

In [None]:
X_test = test.drop(columns=['stay_length_minutes', 'stay_length_hours'])
y_test  = test['stay_length_minutes']
y_pred_test = model.predict(X_test)

In [None]:
mean_squared_error(y_test, y_pred_test)

162377.270705346

In [None]:
r2_score(y_true=y_test, y_pred=y_pred_test)

0.037449450916643534

In [None]:
train

Unnamed: 0,gender,arrival_transport,admission_age,temperature,heartrate,resprate,o2sat,sbp,dbp,acuity,stay_length_hours,stay_length_minutes,pain_cleaned_advanced,race_condensed
0,F,WALK IN,66.0,97.2,67.0,18.0,100.0,192.0,93.0,3.0,15.333333,920.0,0.0,BLACK
1,F,AMBULANCE,77.0,98.0,60.0,16.0,100.0,142.0,48.0,2.0,3.650000,219.0,,BLACK
2,F,WALK IN,47.0,97.8,85.0,18.0,100.0,126.0,81.0,3.0,7.166667,430.0,9.0,BLACK
3,M,WALK IN,67.0,97.3,110.0,18.0,98.0,132.0,52.0,2.0,3.716667,223.0,10.0,White
4,F,WALK IN,25.0,99.0,77.0,20.0,100.0,132.0,78.0,3.0,3.266667,196.0,3.0,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370192,M,WALK IN,61.0,98.2,127.0,18.0,100.0,173.0,88.0,3.0,3.783333,227.0,8.0,White
370193,F,AMBULANCE,87.0,,80.0,18.0,96.0,154.0,69.0,3.0,8.800000,528.0,8.0,White
370194,F,AMBULANCE,28.0,98.5,80.0,18.0,99.0,113.0,61.0,4.0,20.016667,1201.0,0.0,White
370195,M,WALK IN,46.0,98.6,82.0,18.0,99.0,105.0,65.0,4.0,3.433333,206.0,8.0,BLACK


In [None]:
group1 = train[train['gender'] == "F"]['stay_length_minutes']
group2 = train[train['gender'] == "M"]['stay_length_minutes']

In [None]:
def statistic(x, y):
    return np.mean(x) - np.mean(y)

# Perform permutation test
result = permutation_test(
    (group1, group2), 
    statistic,
    permutation_type='independent',  # For independent samples
    n_resamples=500,               # Number of permutations
    alternative='two-sided'          # Test direction
)

print(f"Observed Statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

Observed Statistic: 0.6745120940743732
P-value: 0.5988023952095808


In [None]:
groups = [train[train['arrival_transport'] == cat]['stay_length_minutes'].values for cat in train['arrival_transport'].unique()]

# Define the F-statistic as the test statistic
def f_statistic(*groups):
    return f_oneway(*groups).statistic

# Perform permutation test
result = permutation_test(
    groups,
    f_statistic,
    permutation_type='independent',  # Shuffle labels across all groups
    n_resamples=500,              # Adjust based on computational limits
    alternative='greater'            # ANOVA is one-tailed (test for larger F)
)

In [None]:
print(f"Observed F-statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

Observed F-statistic: 2107.4459099432165
P-value: 0.001996007984031936


In [21]:
# XGBoost Pipeline
xgb_pipeline = Pipeline([
    ('preprocessor', impute_standardize),
    ('model', XGBRegressor(n_estimators=100, learning_rate=0.2, random_state=42))
])

In [22]:
xgb_pipeline.fit(X, y)
y_pred_xgb = xgb_pipeline.predict(X)
rmse_xgb = root_mean_squared_error(y, y_pred_xgb)
r2_xgb = r2_score(y, y_pred_xgb)
print(rmse_xgb, r2_xgb)

369.33327108303735 0.11796888235439407


In [27]:
chief_complaint
chief_chest = chief_complaint[chief_complaint['chiefcomplaint'] == 'chest pain']
chief_chest

Unnamed: 0,chiefcomplaint
11093,chest pain


In [24]:
train.columns

Index(['gender', 'arrival_transport', 'admission_age', 'temperature',
       'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'acuity',
       ...
       'cc_90', 'cc_91', 'cc_92', 'cc_93', 'cc_94', 'cc_95', 'cc_96', 'cc_97',
       'cc_98', 'cc_99'],
      dtype='object', length=113)