In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import logging


# Setup logging
logging.basicConfig(filename='app.log', level=logging.INFO)

csv_path = 'C:\\Users\\Kevin\\Documents\\DAT158\\training_data.csv'
csv = pd.read_csv(csv_path)

#Drop vdate and id
csv.drop(columns=['vdate', 'id'], inplace=True)

# Drop rows with empty cells
csv.replace('', np.nan, inplace=True)
csv.dropna(axis=0, how='any', inplace=True)

# Normalize gender
csv['gender'] = csv['gender'].replace({'M': 0, 'F': 1})

# Normalize faculty
csv['facid'] = csv['facid'].replace({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4})

# Normalize flags (true,false)
for col in csv.columns[2:13]:
    csv[col] = csv[col].replace({True: 1, False: 0})


In [2]:
csv

Unnamed: 0,rcount,gender,dialysisrenalendstage,asthma,irondef,pneum,substancedependence,psychologicaldisordermajor,depress,psychother,...,sodium,glucose,bloodureanitro,creatinine,bmi,pulse,respiration,secondarydiagnosisnonicd9,facid,lengthofstay
0,4,0,0,0,0,0,0,1,0,0,...,137.150067,148.420532,23.0,0.955051,28.450268,63,6.7,1,4,7
1,1,0,0,0,0,0,0,0,0,0,...,140.643655,166.718526,9.0,1.385747,29.233048,81,6.5,0,1,3
2,1,1,0,0,0,0,0,0,0,0,...,138.459305,106.468142,12.0,0.816984,34.295535,75,6.5,1,1,5
3,0,1,0,0,0,0,0,0,0,0,...,138.667613,144.311236,12.0,1.082725,31.207914,75,6.5,1,1,2
4,3,1,0,1,0,0,0,0,0,0,...,133.463292,199.728635,12.0,1.081557,30.475797,89,6.5,7,3,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69994,0,0,0,0,0,0,1,1,0,0,...,133.917689,145.908390,34.0,1.236751,29.135903,71,6.5,1,4,5
69995,0,0,0,0,0,0,0,0,0,0,...,135.920323,166.056315,12.0,0.977975,29.716528,72,6.5,0,1,1
69996,2,1,0,0,1,0,0,0,0,0,...,135.945521,201.159436,13.0,1.375892,30.265735,78,6.3,1,2,5
69998,0,1,0,0,0,0,1,0,0,0,...,138.955792,181.016906,10.0,1.563906,30.261142,89,6.5,1,4,5


In [3]:
X = csv[csv.columns[0:-1]]

# only length of stay feature
y = csv['lengthofstay']

# Split data to training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001, random_state=42)

print(f'Training data: {X_train.shape}\nTest data: {X_test.shape}')

Training data: (69084, 24)
Test data: (70, 24)


In [4]:
# Scaler
scaler = StandardScaler()

# Fit the scaler to the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the fitted scaler
X_test_scaled = scaler.transform(X_test)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

In [5]:
model.score(X_test_scaled, y_test)

0.9086462831858407

In [6]:
# Import and fix test_data.csv
csv_test = pd.read_csv('C:\\Users\\Kevin\\Documents\\DAT158\\test_data.csv')
csv_index = csv_test['id']
csv_test.drop(columns=['id'], inplace=True)
csv_test.replace('', np.nan, inplace=True)
csv_test.dropna(axis=0, how='any', inplace=True)
csv_test['gender'] = csv_test['gender'].replace({'M': 0, 'F': 1}).astype(int)
csv_test['facid'] = csv_test['facid'].replace({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}).astype(int)
for col in csv_test.columns[2:14]:
    csv_test[col] = csv_test[col].replace({True: 1, False: 0})


In [7]:
# Scaler for test_data.csv
scaler_test = StandardScaler()

# Fit the scaler to the training data
test_scaled = scaler_test.fit_transform(csv_test)

# Predict test_data.csv (submission file submission.csv with patient id and length of stay)
csv_test_pred = pd.Series(model.predict(test_scaled), name="lengthofstay")
result = pd.concat([csv_index, csv_test_pred], axis=1)
result

Unnamed: 0,id,lengthofstay
0,75721,2.25
1,80184,2.98
2,19864,7.74
3,76699,2.56
4,92991,3.29
...,...,...
29995,42648,6.00
29996,86306,3.98
29997,45466,5.87
29998,63724,5.15


In [8]:
result.to_csv('submission.csv', index=False)

In [9]:
# Train model for app
all_diagnoses = csv[csv.columns[2:13]]

X_train_app = pd.concat([csv['gender'], all_diagnoses], axis=1)
y_train_app = csv.lengthofstay

model_app = RandomForestRegressor(n_estimators=100, random_state=42)
model_app.fit(X_train_app, y_train_app)

In [10]:
y_train_app

0        7
1        3
2        5
3        2
4        8
        ..
69994    5
69995    1
69996    5
69998    5
69999    1
Name: lengthofstay, Length: 69154, dtype: int64

In [11]:
def predict_length_of_stay(age, gender, diagnoses):
    try:
        gender_numeric = 0 if gender == "Male" else 1
        user_diagnoses = diagnoses.split(",")
        diag_series = np.zeros(shape=(1,12),dtype=int)
        list_diag = ['gender'] + all_diagnoses.columns.tolist()
        df = pd.DataFrame(data=diag_series, columns=list_diag, dtype=int)

        df.loc[0,'gender'] = gender_numeric
        for d in user_diagnoses:
            if d in df.columns:
                df.loc[0,d] = 1
                
        predicted_length_of_stay = model_app.predict(df)
        return f"Predicted length of stay: {np.round(predicted_length_of_stay, 2)} days"
    except Exception as e:
        logging.error(f'Error: {str(e)}, Input: {[age, gender, diagnoses]}')
        return f"Error: {str(e)}"

In [12]:
# Gradio
import gradio as gr

iface = gr.Interface(
    fn=predict_length_of_stay,
    inputs=[
        gr.Number(label="Age"),
        gr.Radio(["Male", "Female"], label="Gender"),
        gr.Textbox(label="Diagnoses (comma-separated)"),
    ],
    outputs=gr.Textbox(label="Predicted length of stay (days)"),
    title="Hospital Length of Stay Predictor",
    description="Enter patient data to predict their length of stay in the hospital."
)

iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


