# Readmission Risk Prediction with Random Forest

This notebook builds a Random Forest model to predict hospital readmission risk. It extends the Logistic Regression baseline by capturing non-linear relationships.

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the hospital dataset from its CSV file (path is relative to the
# notebook's working directory) and preview the first rows.
DATA_PATH = "../data/hospital_data.csv"
df = pd.read_csv(DATA_PATH)
df.head(5)

In [0]:
# Target: normalise the label text (surrounding whitespace, letter case) so
# variants such as " yes" or "NO" collapse onto "Yes"/"No", then encode
# No -> 0 and Yes -> 1.
df["Readmission"] = df["Readmission"].str.strip().str.capitalize()
y = df["Readmission"].map({"No": 0, "Yes": 1})

# Series.map yields NaN for every value that is not a key of the mapping
# (including missing labels). Fail fast here with the offending labels instead
# of letting NaN targets surface later as an obscure split/fit error.
unmapped_rows = y.isna()
if unmapped_rows.any():
    bad_labels = df.loc[unmapped_rows, "Readmission"].unique().tolist()
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a Readmission label other than "
        f"'Yes'/'No': {bad_labels}"
    )

# Features: the six predictor columns used by the model.
X = df[["Age", "Gender", "Diagnosis", "Length_of_Stay", "Treatment", "Outcome"]]

In [0]:
# Column groups: numeric columns are fed to the model unchanged, categorical
# columns are one-hot encoded.
numeric_features = ["Age", "Length_of_Stay"]
categorical_features = ["Gender", "Diagnosis", "Treatment", "Outcome"]

# handle_unknown="ignore" makes the encoder emit an all-zero group for a
# category it did not see during fit instead of raising.
one_hot = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer([
    ("cat", one_hot, categorical_features),
    ("num", "passthrough", numeric_features),
])

# Full pipeline: preprocessing followed by a 200-tree Random Forest with a
# fixed seed.
forest = RandomForestClassifier(random_state=42, n_estimators=200)
model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", forest),
])

In [0]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the
# readmission class ratio the same in both partitions; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Fit preprocessing and the forest together on the training partition.
model.fit(X_train, y=y_train)

In [0]:
# Score the held-out rows: hard class labels plus the probability in column
# index 1 of predict_proba (with 0/1 labels this is the "Yes" class).
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Per-class precision/recall/F1 on the labels, and ROC-AUC on the probabilities.
report_text = classification_report(y_test, y_pred)
auc_value = roc_auc_score(y_test, y_prob)
print(report_text)
print("ROC-AUC:", auc_value)

In [0]:
# Confusion matrix
# Pin the label order so the matrix is always 2x2 with rows/columns ordered
# [0, 1] (i.e. No, Yes under the notebook's target encoding), even if one class
# happens to be absent from y_test or y_pred.
class_names = ["No", "Yes"]
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Explicit figure/axes instead of the implicit pyplot state; tick labels show
# the class names rather than bare 0/1 so the figure stands on its own.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Greens",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - Random Forest")
plt.show()

In [0]:
# Feature importance visualization
TOP_N = 15  # maximum number of features to show

# Fetch the fitted one-hot encoder by its transformer name ("cat") rather than
# by position, so this keeps working if transformers are reordered.
fitted_preprocessor = model.named_steps["preprocessor"]
encoder = fitted_preprocessor.named_transformers_["cat"]
feature_names = encoder.get_feature_names_out(categorical_features)

# ColumnTransformer concatenates outputs in transformer order: the one-hot
# columns come first, followed by the passthrough numeric columns.
all_features = list(feature_names) + numeric_features

importances = model.named_steps["classifier"].feature_importances_
feat_imp = (
    pd.Series(importances, index=all_features)
    .sort_values(ascending=False)
    .head(TOP_N)
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=feat_imp.values, y=feat_imp.index, ax=ax)
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
# Report the number of bars actually drawn; it is below TOP_N when the model
# has fewer features than that.
ax.set_title(f"Top {len(feat_imp)} Feature Importances - Random Forest")
plt.show()

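Try it! Adjust the controls below to see the model's predicted readmission probability for a hypothetical patient.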

In [0]:
import ipywidgets as widgets
from IPython.display import display

# Widgets for user input
# Let each label take the width of its text; at the default description width
# longer labels such as "Length of Stay:" are clipped.
label_style = {"description_width": "initial"}


def category_options(column):
    """Return the distinct non-missing values of ``df[column]`` in first-seen order."""
    return df[column].dropna().unique().tolist()


age = widgets.IntSlider(value=50, min=0, max=100, step=1, description="Age:", style=label_style)
gender = widgets.Dropdown(options=category_options("Gender"), description="Gender:", style=label_style)
diagnosis = widgets.Dropdown(options=category_options("Diagnosis"), description="Diagnosis:", style=label_style)
treatment = widgets.Dropdown(options=category_options("Treatment"), description="Treatment:", style=label_style)
length = widgets.IntSlider(value=5, min=1, max=60, step=1, description="Length of Stay:", style=label_style)

# Outcome is one of the model's input features, so expose it instead of fixing
# it silently. It starts on "Recovered" (the previously hard-coded value) when
# that category exists in the data, otherwise on the first available option.
outcome_options = category_options("Outcome")
if "Recovered" in outcome_options:
    default_outcome = "Recovered"
else:
    default_outcome = outcome_options[0] if outcome_options else None
outcome = widgets.Dropdown(
    options=outcome_options,
    value=default_outcome,
    description="Outcome:",
    style=label_style,
)


# Prediction function
def predict_readmission(age, gender, diagnosis, treatment, length,
                        outcome="Recovered", threshold=0.5):
    """Print the predicted readmission class and probability for one patient.

    Parameters
    ----------
    age, length : int
        Values for the "Age" and "Length_of_Stay" columns.
    gender, diagnosis, treatment : str
        Values for the "Gender", "Diagnosis" and "Treatment" columns.
    outcome : str, default "Recovered"
        Value for the "Outcome" column.
    threshold : float, default 0.5
        Probability at or above which the prediction is reported as "Yes".
    """
    input_df = pd.DataFrame([{
        "Age": age,
        "Gender": gender,
        "Diagnosis": diagnosis,
        "Treatment": treatment,
        "Length_of_Stay": length,
        "Outcome": outcome,
    }])
    # Single input row; column index 1 of predict_proba is the probability of
    # class 1 ("Yes" under the notebook's target encoding).
    prob = model.predict_proba(input_df)[0, 1]
    pred = "Yes" if prob >= threshold else "No"
    print(f"Predicted Readmission: {pred} (probability = {prob:.2%})")


# Layout: controls stacked vertically; the output area re-runs the prediction
# whenever any control changes.
ui = widgets.VBox([age, gender, diagnosis, treatment, length, outcome])
out = widgets.interactive_output(predict_readmission, {
    "age": age,
    "gender": gender,
    "diagnosis": diagnosis,
    "treatment": treatment,
    "length": length,
    "outcome": outcome,
})

display(ui, out)
