### 🧩 Simulate Example Clinical Data

* Structured: age, blood pressure, cholesterol, cognitive score
* Unstructured: short doctor’s note

In [1]:
import pandas as pd
import numpy as np

np.random.seed(42)

# Simulate structured + text data
n = 200
data = {
    "Age": np.random.randint(50, 90, n),
    "BloodPressure": np.random.randint(110, 180, n),
    "Cholesterol": np.random.randint(150, 300, n),
    "CognitiveScore": np.random.normal(25, 5, n),
    "DoctorNotes": np.random.choice([
        "Patient shows memory loss and confusion",
        "Normal cognitive response and recall",
        "Reports occasional disorientation",
        "Severe short-term memory issues observed",
        "Stable, no signs of cognitive decline"
    ], n),
}

df = pd.DataFrame(data)

# Create a target variable (simulated risk label)
df["AlzheimerRisk"] = np.where(
    (df["CognitiveScore"] < 22) | (df["DoctorNotes"].str.contains("memory|confusion|disorientation", case=False)), 
    1, 
    0
)

df.head()


Unnamed: 0,Age,BloodPressure,Cholesterol,CognitiveScore,DoctorNotes,AlzheimerRisk
0,88,168,250,26.712285,"Stable, no signs of cognitive decline",0
1,78,179,172,34.557648,Normal cognitive response and recall,0
2,64,112,159,26.597688,Reports occasional disorientation,1
3,57,129,218,25.96639,Reports occasional disorientation,1
4,70,168,249,32.980715,Reports occasional disorientation,1


### 🔤 Get Embeddings from BioGPT/BioBERT (a pretrained biomedical transformer)
* tokenizer: breaks sentences into tokens (words/subwords)
* model: a neural network that maps tokens to numerical embeddings

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load BioGPT or BioBERT (BioBERT loads faster)
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to get embedding for a text
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Average pooling of token embeddings
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embedding

# Apply to all notes
embeddings = np.vstack(df["DoctorNotes"].apply(get_embedding).values)


  torch.utils._pytree._register_pytree_node(
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


### 🧮 Combine Structured + Text Features
Combine the numeric features with the embeddings:

In [3]:
from sklearn.preprocessing import StandardScaler

# Structured features
structured_features = df[["Age", "BloodPressure", "Cholesterol", "CognitiveScore"]].values

# Normalize structured features
scaler = StandardScaler()
structured_scaled = scaler.fit_transform(structured_features)

# Concatenate with embeddings
X = np.hstack([structured_scaled, embeddings])
y = df["AlzheimerRisk"].values


### 🧠 Train a Simple ML Model

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        27

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

AUC: 1.0


### 🩸 Interpret Results (Risk Stratification)
group patients by predicted probability to simulate risk stratification

In [5]:
y_prob = model.predict_proba(X_test)[:, 1]
risk_levels = pd.cut(y_prob, bins=[0, 0.33, 0.66, 1], labels=["Low", "Medium", "High"])
risk_summary = pd.DataFrame({"PredictedRisk": y_prob, "RiskLevel": risk_levels})
risk_summary.head(10)


Unnamed: 0,PredictedRisk,RiskLevel
0,1.0,High
1,1.0,High
2,0.2,Low
3,1.0,High
4,1.0,High
5,1.0,High
6,1.0,High
7,1.0,High
8,1.0,High
9,1.0,High
