## Importing Required Dependencies

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
from fastapi import FastAPI
import os

## Reading Data

Using minimal data since it'd take a lot of time.

In [7]:
df = pd.read_csv(r"\ehr_data.csv")
dataset = Dataset.from_pandas(df)
train_dataset = dataset.select(range(800))  # First 800 for training
eval_dataset = dataset.select(range(800, 1000))  # Last 200 for evaluation

## Loading ClinicalBERT and Tokenizer

In [8]:
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Using CPU 

In [9]:
device = torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Preprocessing

In [5]:
# Preprocessing Function
def preprocess_function(examples):
    return tokenizer(
        examples["note"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

In [6]:
# Applying preprocessing
encoded_train = train_dataset.map(preprocess_function, batched=True, remove_columns=["note"])
encoded_eval = eval_dataset.map(preprocess_function, batched=True, remove_columns=["note"])

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [7]:
# Renaming 'label' to 'labels'
encoded_train = encoded_train.rename_column("label", "labels")
encoded_eval = encoded_eval.rename_column("label", "labels")

# Defining Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    auc = roc_auc_score(labels, logits[:, 1])
    return {"accuracy": acc, "auc": auc}

## Training Arguments for CPU 

In [8]:
output_dir="C:\Users\"
logging_dir= "C:\Users\"

In [None]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,  
    per_device_train_batch_size=1,  
    per_device_eval_batch_size=1,
    warmup_steps=100,  
    weight_decay=0.01,
    logging_dir=logging_dir,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,  
    fp16=False,  # Disableed mixed precision (GPU-only feature)
)

## Training Model

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_eval,
    compute_metrics=compute_metrics,
)

In [11]:
print("Starting training...")
trainer.train()
print("Training completed successfully!")

Starting training...


  0%|          | 0/200 [00:00<?, ?it/s]

{'loss': 0.7067, 'grad_norm': 11.279709815979004, 'learning_rate': 5e-06, 'epoch': 0.05}
{'loss': 0.7008, 'grad_norm': 10.616595268249512, 'learning_rate': 1e-05, 'epoch': 0.1}
{'loss': 0.7174, 'grad_norm': 9.82299518585205, 'learning_rate': 1.5e-05, 'epoch': 0.15}
{'loss': 0.6754, 'grad_norm': 11.688441276550293, 'learning_rate': 2e-05, 'epoch': 0.2}
{'loss': 0.708, 'grad_norm': 8.219525337219238, 'learning_rate': 2.5e-05, 'epoch': 0.25}
{'loss': 0.7424, 'grad_norm': 4.435360908508301, 'learning_rate': 3e-05, 'epoch': 0.3}
{'loss': 0.6821, 'grad_norm': 8.464441299438477, 'learning_rate': 3.5e-05, 'epoch': 0.35}
{'loss': 0.6842, 'grad_norm': 7.59079647064209, 'learning_rate': 4e-05, 'epoch': 0.4}
{'loss': 0.6851, 'grad_norm': 8.762849807739258, 'learning_rate': 4.5e-05, 'epoch': 0.45}
{'loss': 0.6151, 'grad_norm': 3.411376476287842, 'learning_rate': 5e-05, 'epoch': 0.5}
{'loss': 0.5605, 'grad_norm': 7.474760055541992, 'learning_rate': 4.5e-05, 'epoch': 0.55}
{'loss': 0.6129, 'grad_norm

  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 0.5936597585678101, 'eval_accuracy': 0.75, 'eval_auc': 0.7407910628019323, 'eval_runtime': 320.4492, 'eval_samples_per_second': 0.624, 'eval_steps_per_second': 0.624, 'epoch': 1.0}
{'train_runtime': 5137.39, 'train_samples_per_second': 0.156, 'train_steps_per_second': 0.039, 'train_loss': 0.6519443368911744, 'epoch': 1.0}
Training completed successfully!


## Saving Fine-TUned Model

In [None]:
model.save_pretrained(r"\clinicalbert_readmission_model")
tokenizer.save_pretrained(r"\clinicalbert_readmission_model")

## Readmission Risk score

In [10]:
# 8. Inference Function
def predict_readmission(note, model, tokenizer):
    inputs = tokenizer(note, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Ensure inputs are on CPU
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)
    risk_score = probs[0][1].item()
    return risk_score

## Checking Confluence 

In [11]:
loaded_model = AutoModelForSequenceClassification.from_pretrained(r"\clinicalbert_readmission_model")
loaded_model.to(device)
loaded_tokenizer = AutoTokenizer.from_pretrained(r"\clinicalbert_readmission_model")

In [16]:
# Example prediction
clinical_note = "Asthmatic patient with recent lung tests, non-compliant with meds."
risk = predict_readmission(clinical_note, loaded_model, loaded_tokenizer)
print(f"Readmission Risk Score: {risk:.3f}")

Readmission Risk Score: 0.801


## Basic FastAPI 

In [19]:
app = FastAPI()

In [21]:
@app.post("/predict")
async def predict(note: str):
    risk_score = predict_readmission(note, loaded_model, loaded_tokenizer)
    return {
        "risk_score": risk_score,
        "note": note,
        "interpretation": "High risk" if risk_score > 0.7 else "Low risk"
    }