## Libraries

In [9]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
from tqdm import tqdm
import os
import joblib
import re
import pandas as pd

In [10]:
# ==========================================
# STEP 1: LOAD DATA
# ==========================================

# Directory and file name are kept apart so either one can be changed alone.
DATA_PATH = "/home/ruima/code/delaunan/clintrialpredict/data"
FILENAME = "project_data.csv"

# Full location of the CSV, assembled from the two parts above.
FULL_PATH = os.path.join(DATA_PATH, FILENAME)

# low_memory=False asks pandas to infer each column's dtype from the whole
# file instead of chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(filepath_or_buffer=FULL_PATH, low_memory=False)

In [11]:
# ==========================================
# Clean the txt_criteria column:
#   1. drop a leading "... criteria" heading,
#   2. replace NaN and blank values with "No criteria provided".
# ==========================================

placeholder = "No criteria provided"

# Pattern to match various inclusion criteria prefixes at start of string
# (case-insensitive), plus any trailing punctuation/digits and whitespace.
pattern = r"(?i)^(inclusion|key inclusion|eligibility) criteria[:~\d\*.\-]*\s*"

# Strip the heading BEFORE filling the placeholder: a row that contains
# nothing but the heading (e.g. "Inclusion Criteria:") becomes an empty string
# here, and must still be caught by the blank check below.
df['txt_criteria'] = df['txt_criteria'].str.replace(pattern, "", regex=True).str.strip()

# NaN survives the .str operations above, so it is handled here.
df['txt_criteria'] = df['txt_criteria'].fillna(placeholder)  # replace NaN

# Values are already stripped, so a plain comparison finds every blank entry.
df.loc[df['txt_criteria'] == "", 'txt_criteria'] = placeholder  # replace empty strings

In [None]:
# Install RAPIDS 23.12 (Python 3.10)
# The leading "!" is an IPython/Jupyter shell escape: both lines run in a
# shell, not in Python, so this cell only works inside a notebook.
# The script path (colab/rapids-colab.sh) suggests a Google Colab runtime
# is expected -- TODO confirm.
# NOTE(review): none of the visible cells import a RAPIDS package (PCA comes
# from sklearn) -- confirm this install is still needed.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh 23.12

In [None]:
# ==========================================
# FULL PIPELINE: BioBERT Embeddings + PCA
# ==========================================

# ================================
# 1️⃣ Load BioBERT
# ================================
MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"

# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and encoder are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the weights, move them to the chosen device, and switch the module to
# evaluation mode (the model is only used for inference in this notebook).
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
model = model.eval()

# ================================
# 2️⃣ Embedding Function with Batching
# ================================
def get_biobert_embeddings(text_list, batch_size=32):
    """
    Tokenize texts and return their BioBERT [CLS] embeddings.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text_list: Sequence of strings (list, tuple, pandas Series, ...).
            A single string is treated as one document.
        batch_size: Number of texts sent through the model per forward pass;
            must be at least 1.

    Returns:
        numpy.ndarray with one row per input text, in input order, and one
        column per hidden unit of the model. Empty input yields an array of
        shape ``(0, hidden_size)`` instead of raising.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Normalise the input to a plain list: the tokenizer expects a list of
    # strings, and list slicing below must be positional.
    if isinstance(text_list, str):
        texts = [text_list]
    else:
        texts = list(text_list)

    # np.vstack([]) raises, so answer the empty case explicitly.
    # float32 is assumed to match the model's default output dtype.
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[start : start + batch_size]

        # Tokenize; inputs longer than 512 tokens are truncated, shorter ones
        # are padded to the longest sequence in the batch.
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        # Forward pass without gradient tracking (inference only).
        with torch.no_grad():
            outputs = model(**inputs)

        # Position 0 of every sequence is the [CLS] token.
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        # Move to CPU right away so device memory is not held across batches.
        all_embeddings.append(cls_embeddings.cpu().numpy())

    # Combine all batches into one (num_texts, hidden_size) array.
    return np.vstack(all_embeddings)

# ================================
# 3️⃣ Run embeddings on all rows
# ================================
# One text per row of df, in row order; df must already have 'txt_criteria'.
text_data = df["txt_criteria"].tolist()

embeddings = get_biobert_embeddings(text_data, batch_size=32)


# ================================
# 4️⃣ PCA for dimensionality reduction
# ================================
REDUCED_DIM = 100  # target number of PCA dimensions

# PCA cannot produce more components than min(n_samples, n_features), so cap
# the request; with at least REDUCED_DIM rows this is exactly REDUCED_DIM.
n_components = min(REDUCED_DIM, *embeddings.shape)

# A fixed random_state makes the projection reproducible when scikit-learn
# picks its randomized SVD solver.
pca = PCA(n_components=n_components, random_state=42)
embeddings_reduced = pca.fit_transform(embeddings)

# ================================
# 5️⃣ (Optional) Add reduced embeddings as columns to df
# ================================
# Column names follow the width PCA actually returned.
embedding_cols = [f"emb_{i}" for i in range(embeddings_reduced.shape[1])]
embeddings_df = pd.DataFrame(embeddings_reduced, columns=embedding_cols)

# Reset df's index so both frames share a 0..n-1 index and concat lines the
# rows up by position.
df_with_embeddings = pd.concat([df.reset_index(drop=True), embeddings_df], axis=1)

In [None]:
# ================================================
# Creating a dataset with nct_id and embeddings only
# ================================================

# The width of the PCA output decides how many embedding columns exist.
REDUCED_DIM = embeddings_reduced.shape[1]

# emb_0, emb_1, ... -- one name per PCA component.
embedding_cols = [f"emb_{i}" for i in range(REDUCED_DIM)]

# Wrap the reduced matrix in a frame with those column names.
embeddings_df = pd.DataFrame(data=embeddings_reduced, columns=embedding_cols)

# Put the trial identifier in front; its index is reset so it lines up with
# the 0..n-1 index of the new frame.
embeddings_df.insert(
    loc=0,
    column="nct_id",
    value=df["nct_id"].reset_index(drop=True),
)

# Write the result next to the notebook, without the pandas index column.
output_path = "embeddings_with_nctid.csv"
embeddings_df.to_csv(path_or_buf=output_path, index=False)