# Predicting Infection Risk from CPT, Microbiology, and Lab Events
This notebook builds a machine learning and deep learning pipeline to predict infection risk in ICU patients using MIMIC-III data.

In [3]:
# Step 1: Load Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [4]:
# Step 2: Load Data (replace paths as needed)
cptevents = pd.read_csv('CPTEVENTS.csv')
microbio = pd.read_csv('MICROBIOLOGYEVENTS.csv')
labevents = pd.read_csv('LABEVENTS.csv')
admissions = pd.read_csv('ADMISSIONS.csv')

  cptevents = pd.read_csv('CPTEVENTS.csv')


In [5]:
# Step 3: Define Target Variable (Infection Risk)
infected_hadm_ids = microbio[microbio['ORG_NAME'].notnull()]['HADM_ID'].unique()
admissions['infection_label'] = admissions['HADM_ID'].isin(infected_hadm_ids).astype(int)

In [6]:
# Step 4: Feature Engineering
# CPT Code Features
cpt_features = cptevents.groupby(['HADM_ID', 'CPT_CD']).size().unstack(fill_value=0)

In [7]:
# Lab Event Features
important_items = [50811, 50912, 50931]  # WBC, Glucose, Lactate
lab_filtered = labevents[labevents['ITEMID'].isin(important_items)]
lab_agg = lab_filtered.groupby(['HADM_ID', 'ITEMID'])['VALUENUM'].agg(['mean', 'min', 'max']).unstack()
lab_agg.columns = ['{}_{}'.format(item, stat) for item, stat in lab_agg.columns]
lab_agg = lab_agg.fillna(0)

In [15]:
# Step 5: Combine Features and Labels
features = cpt_features.join(lab_agg, how='outer').fillna(0)
features = features.join(admissions.set_index('HADM_ID')['infection_label'], how='inner')
features.columns = features.columns.astype(str)

In [16]:
# Step 6: ML Modeling
X = features.drop(columns=['infection_label'])
y = features['infection_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [17]:
# Logistic Regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("Logistic Regression Report:\n", classification_report(y_test, y_pred))

Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.73      0.91      0.81      6270
           1       0.78      0.47      0.59      4065

    accuracy                           0.74     10335
   macro avg       0.75      0.69      0.70     10335
weighted avg       0.75      0.74      0.72     10335



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Report:
               precision    recall  f1-score   support

           0       0.75      0.87      0.81      6270
           1       0.73      0.56      0.64      4065

    accuracy                           0.75     10335
   macro avg       0.74      0.71      0.72     10335
weighted avg       0.75      0.75      0.74     10335



## LSTM Model Preparation

In [24]:
# Step 7: Prepare Lab Sequences for LSTM
def create_lab_sequence(hadm_id, item_ids, max_len=24):
    patient_data = lab_filtered[lab_filtered['HADM_ID'] == hadm_id]
    seq = []
    for item in item_ids:
        values = patient_data[patient_data['ITEMID'] == item].sort_values('CHARTTIME')['VALUENUM'].values
        padded = np.pad(values[:max_len], (0, max(0, max_len - len(values))), 'constant', constant_values=0)
        seq.append(padded)
    return np.stack(seq, axis=1)  # shape: (max_len, num_features)

hadm_ids = admissions['HADM_ID'].unique()
X_seq = []
y_seq = []
for hid in hadm_ids:
    if hid in admissions['HADM_ID'].values:
        try:
            seq = create_lab_sequence(hid, important_items)
            X_seq.append(seq)
            y_seq.append(int(hid in infected_hadm_ids))
        except:
            continue
X_seq = np.stack(X_seq)
y_seq = np.array(y_seq)

In [29]:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Normalize input features
X_seq = (X_seq - X_seq.mean(axis=0)) / (X_seq.std(axis=0) + 1e-8)
X_seq = np.nan_to_num(X_seq, nan=0.0)

# Convert to tensors
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y_seq, dtype=torch.float32).unsqueeze(1)

# Dataset and dataloaders
train_ds = TensorDataset(X_tensor[:int(0.8 * len(X_tensor))], y_tensor[:int(0.8 * len(X_tensor))])
test_ds = TensorDataset(X_tensor[int(0.8 * len(X_tensor)):], y_tensor[int(0.8 * len(X_tensor)):])
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=32)

# LSTM model definition
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])  # raw logits

# Initialize model
model = LSTMClassifier(input_dim=X_seq.shape[2], hidden_dim=32, num_layers=1)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    model.train()
    epoch_loss = 0
    for xb, yb in train_dl:
        pred = model(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)  # gradient clipping
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}: loss = {epoch_loss / len(train_dl):.4f}")


Epoch 1: loss = nan
Epoch 2: loss = nan
Epoch 3: loss = nan
Epoch 4: loss = nan
Epoch 5: loss = nan
Epoch 6: loss = nan
Epoch 7: loss = nan
Epoch 8: loss = nan
Epoch 9: loss = nan
Epoch 10: loss = nan
