In [1]:
# readmission_modeling.ipynb

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Tunable settings for this cell, named once so the split and the model share
# the same seed and the hold-out fraction is easy to find.
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Load both CSV inputs and left-join the patients table onto the visits table
# by patient_nbr. how="left" keeps every visits row; a visit with no matching
# patients row gets NaN in the columns that come from patients.
patients = pd.read_csv("../data/patients.csv")
visits = pd.read_csv("../data/hospital_visits.csv")
df = pd.merge(visits, patients, on="patient_nbr", how="left")

# Drop the identifier columns so they are not fed to the model as features.
# NOTE(review): once patient_nbr is gone the split below is purely row-wise; if
# a patient can have several visits, rows from the same patient may land in both
# train and test -- confirm whether a group-aware split is needed.
df = df.drop(columns=["encounter_id", "patient_nbr"])

# Binary target: 1 only when the raw label is exactly "<30", 0 for every other
# value (including missing). Presumably "<30" means readmitted within 30 days --
# confirm against the data dictionary.
# Vectorized comparison replaces the previous per-row lambda; same values.
df["readmitted"] = (df["readmitted"] == "<30").astype("int64")

# One-hot encode the non-numeric columns. drop_first=True drops the first level
# of each encoded column so the remaining indicators are not redundant.
df_encoded = pd.get_dummies(df, drop_first=True)

# Separate features from the target.
X = df_encoded.drop("readmitted", axis=1)
y = df_encoded["readmitted"]

# Hold out TEST_SIZE of the rows for evaluation; the fixed seed makes the split
# reproducible.
# NOTE(review): the positive class is a minority (about 11% of the held-out rows
# in the recorded run) and this split is not stratified. Consider stratify=y
# here and class_weight="balanced" on the model; either change will alter the
# reported metrics, so re-run and re-record the output if adopted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Random forest with 100 trees, seeded for reproducibility.
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Evaluate on the held-out rows. Read the per-class precision/recall rather than
# accuracy alone: with an imbalanced target, accuracy is dominated by class 0.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.89      1.00      0.94     19087
           1       0.99      0.05      0.10      2438

    accuracy                           0.89     21525
   macro avg       0.94      0.53      0.52     21525
weighted avg       0.90      0.89      0.85     21525

