In [None]:
# this is a test

In [1]:
# Step 1: Install necessary libraries (Run this in your terminal or notebook cell)
# !pip install transformers torch pandas

import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import os

In [2]:
# 1. The Dataset
DATA_PATH = "/home/ruima/code/delaunan/clintrialpredict/data"
df = pd.read_csv(os.path.join(DATA_PATH, 'project_data.csv'))
df.columns
#df.shape
#df.head

Index(['nct_id', 'start_date_type', 'start_date', 'study_type',
       'overall_status', 'phase', 'number_of_arms', 'why_stopped', 'target',
       'start_year', 'phase_ordinal', 'covid_exposure', 'includes_us',
       'is_international', 'agency_class', 'allocation', 'intervention_model',
       'primary_purpose', 'masking', 'gender', 'healthy_volunteers', 'adult',
       'child', 'older_adult', 'num_primary_endpoints', 'best_pathology',
       'therapeutic_area', 'therapeutic_subgroup_name', 'competition_broad',
       'competition_niche', 'txt_tags', 'txt_criteria'],
      dtype='object')

In [3]:
# ---------------------------------------------------------
# 1. SETUP & MODEL LOADING
# ---------------------------------------------------------
# Detect hardware
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load BioBERT (fine-tuned on biomedical text)
model_name = "dmis-lab/biobert-v1.1"
print(f"Loading {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

Using device: cpu
Loading dmis-lab/biobert-v1.1...


2025-12-04 09:45:04.258459: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-04 09:45:04.283968: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-04 09:45:04.477417: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-04 09:45:04.715885: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-04 09:45:04.914985: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registe

In [4]:
# ---------------------------------------------------------
# 2. EMBEDDING FUNCTION
# ---------------------------------------------------------
def get_biobert_embeddings(text_list, batch_size=32):
    """
    Generates embeddings for a list of texts using the [CLS] token of BioBERT.
    """
    model.eval() # Set model to evaluation mode
    all_embeddings = []

    # Calculate total batches for progress tracking
    total_batches = (len(text_list) // batch_size) + 1

    print(f"Processing {len(text_list)} items in batches of {batch_size}...")

    for i in range(0, len(text_list), batch_size):
        batch_texts = text_list[i : i + batch_size]

        # Tokenize
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        # Inference
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract [CLS] token (first token of last hidden state)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]

        # Move to CPU and convert to numpy
        all_embeddings.append(cls_embeddings.cpu().numpy())

        # Optional: Print progress every 10 batches
        if (i // batch_size) % 10 == 0:
            print(f"Batch {i // batch_size}/{total_batches} done.")

    return np.vstack(all_embeddings)

In [5]:
# ---------------------------------------------------------
# 3. EXECUTION WITH FIX
# ---------------------------------------------------------
# Assuming 'df' is your main dataframe loaded previously
df_sample = df.copy() # Work on a copy to be safe

# --- CRITICAL FIX START ---
print("Sanitizing text data...")
# 1. Fill NaNs with empty string
# 2. Force type to string (prevents floats/objects from breaking tokenizer)
df_sample['txt_criteria'] = df_sample['txt_criteria'].fillna("").astype(str)
# --- CRITICAL FIX END ---

print("Generating embeddings...")
# Running with batch_size=16 (Safe for T4 GPU).
# If on CPU, reduce to 2 or 4.
embeddings = get_biobert_embeddings(df_sample['txt_criteria'].tolist(), batch_size=16)

# Assign embeddings back to DataFrame
df_sample['criteria_embedding'] = list(embeddings)

print("Success! Embeddings generated.")
print(f"Embedding shape: {df_sample['criteria_embedding'].iloc[0].shape}")
print(df_sample[['nct_id', 'criteria_embedding']].head(2))

Sanitizing text data...
Generating embeddings...
Processing 105336 items in batches of 16...
Batch 0/6584 done.
Batch 10/6584 done.
Batch 20/6584 done.
Batch 30/6584 done.
Batch 40/6584 done.
Batch 50/6584 done.
Batch 60/6584 done.
Batch 70/6584 done.
Batch 80/6584 done.
Batch 90/6584 done.
Batch 100/6584 done.
Batch 110/6584 done.
Batch 120/6584 done.
Batch 130/6584 done.
Batch 140/6584 done.
Batch 150/6584 done.
Batch 160/6584 done.
Batch 170/6584 done.
Batch 180/6584 done.
Batch 190/6584 done.
Batch 200/6584 done.
Batch 210/6584 done.
Batch 220/6584 done.
Batch 230/6584 done.
Batch 240/6584 done.
Batch 250/6584 done.


KeyboardInterrupt: 