In [11]:
# Predicting Hospitalization Risk Using Logistic Regression

In [12]:
# Imports and Setup

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

# Locations used by this notebook, all relative to the notebook's working directory:
# the input CSV, the markdown report that results are written to, and the figures folder.
data_path = '../data/processed_data.csv'
report_path = '../reports/covid19_analysis_report.md'
visuals_path = '../visuals/'

# Create the figures folder up front; exist_ok=True makes re-running this cell a no-op.
os.makedirs(name=visuals_path, exist_ok=True)

In [13]:
# Read the processed CSV at `data_path` into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=data_path)

In [26]:
# Inspect the target column before modelling. dropna=False keeps the NaN bucket in the
# tally so the amount of missing outcome data is visible alongside the 0/1 counts.
print(
    "Hospitalization (hosp_yn) unique values and counts:",
    df['hosp_yn'].value_counts(dropna=False),
    "\n",
    sep="\n",
)

Hospitalization (hosp_yn) unique values and counts:
hosp_yn
NaN    5936
0.0    2949
1.0     134
Name: count, dtype: int64




In [None]:
# Prepare data and target variable

In [15]:
# Build the modelling frame: keep only rows with a recorded hospitalization outcome,
# then cast that outcome from float (needed while NaN was present) to integer 0/1.
# Both steps return new objects, so the original `df` is left untouched.
df_hosp = (
    df
    .dropna(subset=['hosp_yn'])
    .astype({'hosp_yn': int})
)

In [16]:
# Feature selection and encoding

In [17]:
# Predictor columns; every one of them is treated as categorical.
feature_cols = ['sex', 'age_group', 'race_ethnicity_combined', 'medcond_yn']

# Design matrix: one-hot encode each predictor. drop_first=True drops one level per
# variable (the reference category) so the dummy columns are not perfectly collinear.
X = pd.get_dummies(df_hosp[feature_cols], columns=feature_cols, drop_first=True)

# Binary target (0 = not hospitalized, 1 = hospitalized), row-aligned with X.
y = df_hosp['hosp_yn']

In [18]:
# Train-test split

In [19]:
# Hold out 20% of the rows for evaluation.
# `train_test_split` is imported once in the imports cell at the top of the notebook,
# so it is not re-imported here.
# stratify=y keeps the hospitalized / not-hospitalized ratio the same in both partitions,
# which matters because the positive class is rare; random_state fixes the shuffle so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [20]:
# Model training and prediction

In [21]:
# Build and fit the classifier in one expression (`fit` returns the estimator itself).
# class_weight='balanced' compensates for the rarity of the hospitalized class, and
# max_iter=1000 gives the solver room to converge.
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Hard 0/1 predictions for the held-out rows.
y_pred = model.predict(X_test)

In [22]:
# Evaluation and reporting

In [23]:
# Score the held-out predictions.
# The class order is passed explicitly so 'No Hosp' always maps to 0 and 'Hosp' to 1,
# and the confusion matrix always has both classes in that order, regardless of which
# classes happen to occur in y_test / y_pred.
class_labels = [0, 1]
class_names = ['No Hosp', 'Hosp']

clf_report = classification_report(
    y_test, y_pred, labels=class_labels, target_names=class_names
)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)  # rows = actual, cols = predicted

# Show the results in the notebook.
# sep="\n" puts each table on its own line without the stray space that print's default
# separator would otherwise insert in front of the table's first row.
print("=== Hospitalization Prediction Results ===\n")
print("Classification Report:", clf_report, sep="\n")
print("Confusion Matrix:", conf_matrix, sep="\n")

=== Hospitalization Prediction Results ===

Classification Report:
               precision    recall  f1-score   support

     No Hosp       0.99      0.38      0.55       590
        Hosp       0.06      0.89      0.12        27

    accuracy                           0.41       617
   macro avg       0.52      0.64      0.33       617
weighted avg       0.95      0.41      0.53       617

Confusion Matrix:
 [[226 364]
 [  3  24]]


In [25]:
# Append the evaluation metrics to the markdown report.
# The destination comes from `report_path` (set in the setup cell) so the location is
# configured in exactly one place.

# open(..., 'a') creates a missing file but not a missing folder, so make sure the
# report's parent directory exists first.
report_dir = os.path.dirname(report_path)
if report_dir:
    os.makedirs(report_dir, exist_ok=True)

# NOTE: append mode means every re-run of this cell adds another copy of the section.
with open(report_path, 'a', encoding='utf-8') as f:
    f.write("\n## Hospitalization Prediction Model\n")
    f.write("### Classification Report\n```\n")
    f.write(clf_report)
    f.write("\n```\n")
    f.write(f"### Confusion Matrix\n```\n{conf_matrix}\n```\n")

print("Hospitalization model evaluation saved to report.")

Hospitalization model evaluation saved to report.
