In [None]:
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch
from sklearn.metrics import roc_auc_score

In [None]:
# This notebook fine-tunes an open, general-purpose text embedding model so that patients and the
# trial cohorts on which they might enroll are embedded closer together.

In [None]:
# Hard-coded absolute directory path.
# NOTE(review): none of the cells visible here reference `prefix` (the CSV is read from a
# relative path) -- confirm it is still needed.
prefix = '/data/clin_notes_outcomes/meta/2024/v2/'

In [None]:
# Load the table of patient-cohort eligibility checks. Later cells read its `split`,
# `eligibility_result`, `patient_summary` and `this_cohort` columns.
# NOTE(review): the file name is relative to the working directory, not to `prefix` -- confirm
# that is intended.
cohort_checks = pd.read_csv('cohort_specific_eligibility_checks.csv')

In [None]:
# Frequency distribution of the prior Llama checks of candidate patient-cohort combinations.
cohort_checks['eligibility_result'].value_counts()

In [None]:
# Training pairs: rows of the train split whose Llama eligibility check came back as 1.
is_train = cohort_checks['split'] == 'train'
is_match = cohort_checks['eligibility_result'] == 1
train_summaries = cohort_checks[is_train & is_match]
# Sanity check: only the train split should remain.
train_summaries['split'].value_counts()

In [None]:
# One InputExample per training row, pairing the patient summary with that row's cohort text.
example_list = [
    InputExample(texts=[patient_text, cohort_text])
    for patient_text, cohort_text in zip(
        train_summaries['patient_summary'], train_summaries['this_cohort']
    )
]

In [None]:
# Place a throwaway tensor on the GPU before any model is loaded (torch is already imported
# in the first cell).
# NOTE(review): presumably a fail-fast check that CUDA is usable -- confirm; `temp` is not
# read by any of the visible cells.
temp = torch.tensor([0], device='cuda')

In [None]:
# Load the pretrained base embedding model onto the GPU; this is the model fine-tuned below.
# NOTE(review): trust_remote_code=True lets code shipped with the model repository run
# locally -- confirm it is required for this checkpoint.
model = SentenceTransformer(
    "Snowflake/snowflake-arctic-embed-l",
    trust_remote_code=True,
    device='cuda',
)


In [None]:
# Ranking loss bound to the base model, fed by shuffled batches of 16 (patient, cohort) pairs.
# NOTE(review): per the sentence-transformers docs, MultipleNegativesRankingLoss uses the other
# pairs in a batch as negatives; if one cohort text can appear several times in a batch those
# copies would count as negatives too -- confirm whether batches need de-duplicating.
train_loss = losses.MultipleNegativesRankingLoss(model=model)
train_dataloader = DataLoader(example_list, batch_size=16, shuffle=True)

In [None]:
%%capture
# Fine-tune on the (dataloader, loss) objective for 10 epochs with warmup_steps=100,
# then write the resulting model to disk.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=10,
    warmup_steps=100,
)
model.save('pt_trial_summary_percohort_finetuned_2024_deid.model')

In [None]:
# Reload the fine-tuned model from the directory written by model.save() in the training cell,
# so the evaluation below runs on the weights that were just trained. The path previously
# lacked the `_deid` suffix and therefore pointed at a different (or missing) checkpoint.
model = SentenceTransformer('pt_trial_summary_percohort_finetuned_2024_deid.model', trust_remote_code=True, device='cuda')

In [None]:
# Evaluate the fine-tuned model on the validation/tune split.
is_validation = cohort_checks['split'] == 'validation'
validation_set = cohort_checks[is_validation]
validation_set.info()


In [None]:
# Embed both sides of every validation row; row i of each result belongs to validation row i.
patient_texts = validation_set['patient_summary'].tolist()
cohort_texts = validation_set['this_cohort'].tolist()
patient_summary_embeddings = model.encode(patient_texts)
trial_summary_embeddings = model.encode(cohort_texts)

In [None]:
# Evaluate the ability of cosine similarity to predict whether a given patient could have
# enrolled on a given trial cohort: each validation row is scored either against its own cohort
# (label 1) or against the cohort of a randomly drawn *different* row (label 0).
# NOTE(review): validation_set is filtered on split only, so row i is labelled 1 regardless of
# its eligibility_result value -- confirm that is the intended definition of a positive pair.
# NOTE(review): a different row can still carry the same cohort text; such draws are labelled 0
# as well -- confirm whether they should be excluded.
import random

# Dedicated, seeded RNG so the sampled pairs (and the AUC computed from them) are identical on
# every run, without touching the global `random` state.
pair_rng = random.Random(0)
n_pairs = trial_summary_embeddings.shape[0]

labels = []
similarities = []
for i in range(n_pairs):
    patient_vec = torch.tensor(patient_summary_embeddings[i, :]).unsqueeze(0)
    # Fair coin: keep the row's own cohort (positive) or swap in another row's cohort (negative).
    # With a single row there is no other row to draw from, so the pair stays positive.
    if pair_rng.choice([0, 1]) == 1 or n_pairs < 2:
        cohort_index, label = i, 1.
    else:
        # A non-zero offset taken modulo n_pairs is uniform over every row except i, so the
        # true pair can never be drawn and mislabelled as a negative.
        random_index = (i + pair_rng.randrange(1, n_pairs)) % n_pairs
        cohort_index, label = random_index, 0.
    cohort_vec = torch.tensor(trial_summary_embeddings[cohort_index, :]).unsqueeze(0)
    # Each entry stays a one-element tensor, the form the scoring cell converts with .numpy().
    similarities.append(F.cosine_similarity(patient_vec, cohort_vec))
    labels.append(label)

In [None]:
# AUROC of cosine similarity as a score for the sampled positive/negative pairs.
scores = np.array([sim.numpy() for sim in similarities])
roc_auc_score(labels, scores)