In [11]:
!pip install datasets
!pip install transformers

#pip install numpy
#!ls /content/

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2025.11.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m19.9 MB/s[0m  [33m0:00:00[0meta [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
[2K   [90m━

The dataset

In [12]:
import pandas as pd
import numpy as np
import os
import csv


DATA_PATH = "/home/ruima/code/delaunan/clintrialpredict/data"
df = pd.read_csv(os.path.join(DATA_PATH, 'project_data.csv'))
df.columns
#df.shape
#df.head

Index(['nct_id', 'study_type', 'overall_status', 'phase', 'number_of_arms',
       'why_stopped', 'target', 'start_year', 'best_pathology',
       'therapeutic_area', 'therapeutic_subgroup', 'therapeutic_subgroup_name',
       'competition_broad', 'competition_niche', 'gender',
       'healthy_volunteers', 'adult', 'child', 'older_adult',
       'num_primary_endpoints', 'min_p_value', 'phase_ordinal',
       'covid_exposure', 'is_international', 'agency_class', 'allocation',
       'intervention_model', 'primary_purpose', 'masking', 'txt_tags',
       'txt_criteria'],
      dtype='object')

1. Python code structure for loading the data, initializing the BioBERT components, and tokenizing the text features using the Hugging Face transformers and datasets libraries.

   This structure assumes that the the dataframe is named df.

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Assume 'df' is your DataFrame loaded from the SQL query

# --- 1. Create the Combined Text Feature (X) ---
# Fill any NaNs in text columns with an empty string so concatenation works
df['conditions_text'] = df['conditions_text'].fillna('')
df['interventions_text'] = df['interventions_text'].fillna('')
df['brief_summary_text'] = df['brief_summary_text'].fillna('')

# Concatenate all relevant text features into one column
df['text_input'] = (
    "Title: " + df['title'] +
    " | Summary: " + df['brief_summary_text'] +
    " | Conditions: " + df['conditions_text'] +
    " | Interventions: " + df['interventions_text'])

# --- 2. Create the Numerical Target (Y) ---
# Create the binary target (0 or 1) from the categorical 'overall_status'
# This is a simplified example; refine based on your EDA/definition
df['label'] = df['overall_status'].apply(
    lambda x: 1 if x in ['Completed', 'Active, not recruiting'] else 0
)

# Filter out rows where the target is undefined or incomplete
df = df[df['text_input'].str.len() > 100].reset_index(drop=True)

# Convert the Pandas DataFrame to a Hugging Face Dataset object
hg_dataset = Dataset.from_pandas(df[['text_input', 'label']])

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


KeyError: 'conditions_text'

2. Load Tokenizer and Define Tokenization Function

   BioBERT tokenizer will be loaded and create a function to apply it to the dataset.

In [None]:
# --- 3. Load BioBERT Tokenizer ---
MODEL_NAME = "dmis-lab/biobert-base-cased-v1.2"
MAX_LENGTH = 512  # Standard for BERT models

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- 4. Define Tokenization Function ---
def tokenize_function(examples):
    """
    Tokenizes the text input, truncates to max length, and adds necessary
    BERT-specific tokens ([CLS], [SEP]).
    """
    return tokenizer(
        examples["text_input"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )

# Apply the tokenization across the entire dataset
tokenized_datasets = hg_dataset.map(tokenize_function, batched=True)

3. Final Dataset Preparation

   The final step is to prepare the dataset for training, including splitting it and selecting the columns the model expects.

In [None]:
# --- 5. Format and Split Data ---
# Rename the 'label' column to 'labels' as required by the Trainer class
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Remove the original text column and unnecessary columns
tokenized_datasets = tokenized_datasets.remove_columns(["text_input", "__index_level_0__"])

# Set the format to PyTorch tensors
tokenized_datasets.set_format("torch")

# Split into training and testing sets
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)

train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

print(f"Training set size: {len(train_dataset)}")
print(f"Evaluation set size: {len(eval_dataset)}")
print("\nExample Tokenized Data Structure:")
print(train_dataset[0].keys())

1. Fine-Tuning

Defines the Evaluation Metrics (compute_metrics function)
The Hugging Face Trainer requires a function, compute_metrics, that accepts an EvalPrediction object (containing raw model predictions/logits and true labels) and returns a dictionary of metric scores.

For this project, the key metrics should be F1 Score and AUC-ROC (Area Under the Receiver Operating Characteristic Curve).

F1 Score: The harmonic mean of Precision and Recall. It penalizes models that favor one metric over the other (e.g., high Recall but low Precision), making it robust for imbalanced datasets.

AUC-ROC: Measures the model's ability to discriminate between positive and negative classes across all possible classification thresholds. A score of $1.0$ is perfect; $0.5$ is random. This is highly recommended for evaluating risk-modeling.

2. Evaluation Setup

Configure Training Arguments (TrainingArguments)

This object specifies all the hyperparameters, saving, and logging strategies for the Trainer. These values are good starting points for a Transformer model fine-tuning.

In [None]:
from transformers import TrainingArguments, Trainer

# --- 3. Define Training Arguments ---
training_args = TrainingArguments(
    output_dir="./biobert_clinical_trials_output",  # Required: Directory to save checkpoints/results
    num_train_epochs=3,                            # Start with 3 epochs (common for fine-tuning)
    per_device_train_batch_size=16,                # Adjust based on your GPU memory
    per_device_eval_batch_size=16,
    warmup_steps=500,                              # A few hundred steps for learning rate warm-up
    weight_decay=0.01,
    learning_rate=2e-5,                            # Standard low learning rate for fine-tuning
    evaluation_strategy="epoch",                   # Evaluate metrics after each full epoch
    logging_strategy="steps",
    logging_steps=100,                             # Log training loss every 100 steps
    save_strategy="epoch",
    load_best_model_at_end=True,                   # Load the model with the best validation metric
    metric_for_best_model="f1_weighted",           # Specify which metric to track for "best model"
    report_to="tensorboard",                       # Optional: Visualize training progress
)

# --- 4. Initialize the Trainer ---
# Assuming 'train_dataset' and 'eval_dataset' are the tokenized datasets from the previous step
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer, # Pass the tokenizer for padding/saving
    compute_metrics=compute_metrics, # Pass the custom metrics function
)

# --- 5. Start Fine-Tuning ---
# trainer.train() # This is the command that initiates Week 1's model training!