In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from scipy.stats import permutation_test, f_oneway

In [2]:
train = pd.read_csv('train.csv')
train = train.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime', 'chiefcomplaint'])

train['race_condensed'] = train['race_condensed'].fillna('Missing')

In [3]:
numeric_vars = ['admission_age', 'temperature', 'heartrate', 'resprate', 'o2sat', 
                'sbp', 'dbp', 'pain_cleaned_advanced']
# categorical_vars = ['gender', 'arrival_transport', 'race_condensed']
categorical_vars = ['arrival_transport', 'race_condensed']

numeric = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=100, random_state=2025)),
    ('scaler', StandardScaler())
])

impute_acuity = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=100, random_state=2025))
])

categorical = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

impute_standardize = ColumnTransformer(
    transformers=[
        ('num', numeric, numeric_vars),
        ('passthrough_cols', impute_acuity, ['acuity']),
        ('cat', categorical, categorical_vars)
    ])

In [4]:
model = Pipeline(steps=[("pre", impute_standardize), ("model", LinearRegression())])

In [5]:
X = train.drop(columns=['stay_length_minutes', 'stay_length_hours'])

In [6]:
y = train['stay_length_minutes']

In [7]:
X

Unnamed: 0,gender,arrival_transport,admission_age,temperature,heartrate,resprate,o2sat,sbp,dbp,acuity,pain_cleaned_advanced,race_condensed
0,F,WALK IN,66.0,97.2,67.0,18.0,100.0,192.0,93.0,3.0,0.0,BLACK
1,F,AMBULANCE,77.0,98.0,60.0,16.0,100.0,142.0,48.0,2.0,,BLACK
2,F,WALK IN,47.0,97.8,85.0,18.0,100.0,126.0,81.0,3.0,9.0,BLACK
3,M,WALK IN,67.0,97.3,110.0,18.0,98.0,132.0,52.0,2.0,10.0,White
4,F,WALK IN,25.0,99.0,77.0,20.0,100.0,132.0,78.0,3.0,3.0,White
...,...,...,...,...,...,...,...,...,...,...,...,...
370192,M,WALK IN,61.0,98.2,127.0,18.0,100.0,173.0,88.0,3.0,8.0,White
370193,F,AMBULANCE,87.0,,80.0,18.0,96.0,154.0,69.0,3.0,8.0,White
370194,F,AMBULANCE,28.0,98.5,80.0,18.0,99.0,113.0,61.0,4.0,0.0,White
370195,M,WALK IN,46.0,98.6,82.0,18.0,99.0,105.0,65.0,4.0,8.0,BLACK


In [8]:
model.fit(X, y)

In [9]:
y_pred = model.predict(X)

In [10]:
r2_score(y, y_pred)

0.04077288397863976

In [11]:
mean_squared_error(y, y_pred)

147927.70285867294

In [12]:
test = pd.read_csv('test.csv')
test = test.drop(columns = ['subject_id', 'hadm_id', 'stay_id', 'race', 'pain', 'intime', 'outtime', 'chiefcomplaint'])

test['race_condensed'] = test['race_condensed'].fillna('Missing')

In [13]:
X_test = test.drop(columns=['stay_length_minutes', 'stay_length_hours'])
y_test  = test['stay_length_minutes']
y_pred_test = model.predict(X_test)

In [14]:
mean_squared_error(y_test, y_pred_test)

162377.270705346

In [15]:
r2_score(y_true=y_test, y_pred=y_pred_test)

0.037449450916643534

In [16]:
train

Unnamed: 0,gender,arrival_transport,admission_age,temperature,heartrate,resprate,o2sat,sbp,dbp,acuity,stay_length_hours,stay_length_minutes,pain_cleaned_advanced,race_condensed
0,F,WALK IN,66.0,97.2,67.0,18.0,100.0,192.0,93.0,3.0,15.333333,920.0,0.0,BLACK
1,F,AMBULANCE,77.0,98.0,60.0,16.0,100.0,142.0,48.0,2.0,3.650000,219.0,,BLACK
2,F,WALK IN,47.0,97.8,85.0,18.0,100.0,126.0,81.0,3.0,7.166667,430.0,9.0,BLACK
3,M,WALK IN,67.0,97.3,110.0,18.0,98.0,132.0,52.0,2.0,3.716667,223.0,10.0,White
4,F,WALK IN,25.0,99.0,77.0,20.0,100.0,132.0,78.0,3.0,3.266667,196.0,3.0,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370192,M,WALK IN,61.0,98.2,127.0,18.0,100.0,173.0,88.0,3.0,3.783333,227.0,8.0,White
370193,F,AMBULANCE,87.0,,80.0,18.0,96.0,154.0,69.0,3.0,8.800000,528.0,8.0,White
370194,F,AMBULANCE,28.0,98.5,80.0,18.0,99.0,113.0,61.0,4.0,20.016667,1201.0,0.0,White
370195,M,WALK IN,46.0,98.6,82.0,18.0,99.0,105.0,65.0,4.0,3.433333,206.0,8.0,BLACK


In [17]:
group1 = train[train['gender'] == "F"]['stay_length_minutes']
group2 = train[train['gender'] == "M"]['stay_length_minutes']

In [18]:
def statistic(x, y):
    return np.mean(x) - np.mean(y)

# Perform permutation test
result = permutation_test(
    (group1, group2), 
    statistic,
    permutation_type='independent',  # For independent samples
    n_resamples=500,               # Number of permutations
    alternative='two-sided'          # Test direction
)

print(f"Observed Statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

Observed Statistic: 0.6745120940743732
P-value: 0.5988023952095808


In [26]:
groups = [train[train['arrival_transport'] == cat]['stay_length_minutes'].values for cat in train['arrival_transport'].unique()]

# Define the F-statistic as the test statistic
def f_statistic(*groups):
    return f_oneway(*groups).statistic

# Perform permutation test
result = permutation_test(
    groups,
    f_statistic,
    permutation_type='independent',  # Shuffle labels across all groups
    n_resamples=500,              # Adjust based on computational limits
    alternative='greater'            # ANOVA is one-tailed (test for larger F)
)

In [27]:
print(f"Observed F-statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

Observed F-statistic: 2107.4459099432165
P-value: 0.001996007984031936
