In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the "en_core_sci_sm" pipeline once at module level; clean_text() calls
# this `nlp` object to tokenize and lemmatize each description.
nlp = spacy.load("en_core_sci_sm")

  serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])  # type: ignore[misc]


In [None]:
# Read the dataset and expose the label column ("site") under the name "target"
df = pd.read_csv("your_data.csv").rename(columns={"site": "target"})  # replace with your file path

# Missing descriptions become empty strings so every row holds text
description_col = df["ha_procedure_description"]
df["ha_procedure_description"] = description_col.fillna("")

# Preprocessing function using scispaCy (tokenization + lemmatization)
def clean_text(text):
    """Normalize a free-text description with the module-level ``nlp`` pipeline.

    The text is tokenized by ``nlp``; stop words, punctuation and
    whitespace-only tokens are dropped, and the lemmas of the remaining
    tokens are joined with single spaces.

    Args:
        text: Raw description. ``None``, float NaN and blank strings are
            accepted and yield ``""``; any other non-string value is converted
            with ``str()`` before processing.

    Returns:
        str: Space-separated lemmas, or ``""`` when there is nothing to process.
    """
    # Missing values (None, or a float NaN as pandas uses for empty cells)
    # carry no text, so short-circuit instead of handing them to the pipeline.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).strip()
    if not text:
        # Blank input: skip the pipeline call entirely.
        return ""

    doc = nlp(text)
    lemmas = [
        token.lemma_
        for token in doc
        # is_space filters tokens made only of whitespace (e.g. runs of spaces
        # or newlines), which are neither stop words nor punctuation and would
        # otherwise leak stray whitespace into the joined result.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(lemmas)

# Normalize every procedure description before it is embedded
df["clean_text"] = df["ha_procedure_description"].apply(clean_text)

# Bio_ClinicalBERT checkpoint from Hugging Face, placed on the CPU
model_name = "emilyalsentzer/Bio_ClinicalBERT"
device = torch.device("cpu")  # CPU only setup
tokenizer = AutoTokenizer.from_pretrained(model_name)
# .to() and .eval() both return the module, so the calls can be chained;
# eval() switches the model to evaluation mode for inference.
bert_model = AutoModel.from_pretrained(model_name).to(device).eval()

# Function to get BERT [CLS] embedding for a sentence
def get_bert_embedding(text):
    """Return the first-token ([CLS]) embedding of ``text`` as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 128 tokens), moved to ``device`` and passed through ``bert_model``
    with gradient tracking disabled. The hidden state at position 0 of the
    first (only) sequence in the batch is returned.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        model_output = bert_model(**encoded)

    # Batch index 0, token index 0 -> the [CLS] position
    return model_output.last_hidden_state[0, 0].cpu().numpy()

# Embed each cleaned description (one model call per row)
df["embedding"] = df["clean_text"].apply(get_bert_embedding)

# Feature vectors and labels for the classifier
X = df["embedding"].tolist()
y = df["target"]

# Hold out 20% of the rows for testing, stratified by the target labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit a logistic-regression classifier on the training embeddings
# (fit() returns the estimator itself, so it can be chained)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Per-class metrics on the held-out split
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))