In [None]:
import pandas as pd
import numpy as np

In [None]:
# This notebook trains a smaller Transformer model to predict whether Llama would
# have declared a given patient and trial cohort to be a reasonable combination.
dataset = pd.read_csv(filepath_or_buffer='cohort_specific_eligibility_checks.csv')

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts) before any processing.
dataset.info()

In [None]:
# Cast the eligibility flag to plain integers; every other column is left as loaded.
dataset = dataset.astype({'eligibility_result': int})

In [None]:
from transformers import AutoTokenizer

In [None]:
# Model input text: the patient summary, a fixed separator sentence, then the cohort text.
cohort_intro = "\nNow here is the trial cohort:"
patient_text = dataset['patient_summary']
cohort_text = dataset['this_cohort']
dataset['pt_trial_pair'] = patient_text + cohort_intro + cohort_text

In [None]:
# Drop the test split entirely, then keep only the text, label and split columns,
# renaming the first two to 'text' and 'label'.
not_test = dataset['split'] != 'test'
dataset = (
    dataset.loc[not_test, ['pt_trial_pair', 'eligibility_result', 'split']]
    .rename(columns={'pt_trial_pair': 'text', 'eligibility_result': 'label'})
)

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
# Build the train / validation Hugging Face datasets from the corresponding rows.
# Row filtering can leave a non-default pandas index; preserve_index=False stops
# from_pandas from storing that index as an extra `__index_level_0__` column.
train_ds = Dataset.from_pandas(dataset[dataset.split=='train'], preserve_index=False)
valid_ds = Dataset.from_pandas(dataset[dataset.split=='validation'], preserve_index=False)

In [None]:
# Group both splits under the keys "train" and "valid".
data_dict = DatasetDict(train=train_ds, valid=valid_ds)

In [None]:
# Display the DatasetDict holding the train and validation datasets.
data_dict

In [None]:
# Load the tokenizer published with the ClinicalBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="medicalai/ClinicalBERT"
)
#tokenizer.pad_token = tokenizer.eos_token

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples``.

    The texts are passed to the notebook-level ``tokenizer`` with truncation
    enabled and ``max_length=512``.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch when used with
            ``map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

In [None]:
# Tokenize every split, handing the examples to preprocess_function in batches.
tokenized_data = data_dict.map(function=preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding

In [None]:
# Collator that pads each batch using the tokenizer loaded above.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
import evaluate
# ROC-AUC metric object from the `evaluate` library.
auroc = evaluate.load(path='roc_auc')

In [None]:
def compute_metrics(eval_pred):
    """Compute ROC-AUC for a two-class classifier from Trainer evaluation output.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is indexed as
            ``[:, 0]`` / ``[:, 1]`` (negative / positive class); ``labels``
            holds the reference class ids.

    Returns:
        The result of ``auroc.compute`` for the positive-class scores.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits, dtype=float)
    # The roc_auc metric wants one score per example. The logit margin
    # (positive minus negative) is a monotone function of the softmax
    # probability of the positive class, so it yields the same AUROC.
    positive_scores = logits[:, 1] - logits[:, 0]
    # The metric's keyword is `prediction_scores`, not `predictions`.
    return auroc.compute(prediction_scores=positive_scores, references=labels)

In [None]:
# Class-id -> name mapping and its inverse for the two-way classifier.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Load ClinicalBERT as a sequence classifier with two labels and the label maps above.
model = AutoModelForSequenceClassification.from_pretrained(
    "medicalai/ClinicalBERT",
    id2label=id2label,
    label2id=label2id,
    num_labels=2,
)
#model.config.pad_token_id = model.config.eos_token_id


In [None]:
# Fine-tuning hyper-parameters, collected in one mapping and unpacked below.
training_config = dict(
    output_dir="bert_checker",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=False,
    # Left disabled:
    # evaluation_strategy="epoch",
    # load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_config)

In [None]:
# Wire the model, tokenized training split, tokenizer and padding collator into a
# Trainer. The eval dataset and metric callback are left disabled.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_data["train"],
    # eval_dataset=tokenized_data["valid"],
    # compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured above (save_strategy="epoch",
# output_dir="bert_checker").
trainer.train()

In [None]:
# Evaluate the bert-checker model on the task of predicting Llama's judgement of
# whether a given patient-cohort combination is reasonable: run the fine-tuned
# model over the validation split.
predictions = trainer.predict(test_dataset=tokenized_data['valid'])

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# AUROC of the fine-tuned checker against the validation labels.
# Each example is scored by its logit margin (positive minus negative), which ranks
# examples exactly as the softmax probability of the positive class does. The raw
# positive-class logit on its own can order examples differently.
valid_logits = predictions.predictions
positive_margin = valid_logits[:, 1] - valid_logits[:, 0]
roc_auc_score(predictions.label_ids, positive_margin)

In [None]:
# Persist the fine-tuned weights and config. The tokenizer files are written to the
# same directory so the saved checkpoint can be loaded on its own later.
model.save_pretrained('bert-checker')
tokenizer.save_pretrained('bert-checker')