In [None]:
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

In [4]:
df = pd.read_csv("../data/final_data_cleaned.csv")
print(df.shape)
df.tail()

(18766, 2)


Unnamed: 0,disease,drug
18761,vasomotor symptom,fezolinetant
18762,vasomotor symptom,gabapentin
18763,vasomotor symptom,paroxetine
18764,vasomotor symptom,progesterone
18765,vasomotor symptom,venlafaxine


In [5]:
def count_tokens(text, tokenizer):
    tokens = tokenizer.tokenize(text)
    return len(tokens)

df['disease_token_count'] = df['disease'].apply(lambda x: count_tokens(x, tokenizer))
df['drug_token_count'] = df['drug'].apply(lambda x: count_tokens(x, tokenizer))

# Get the maximum token count in each column
max_disease_tokens = df['disease_token_count'].max()
max_drug_tokens = df['drug_token_count'].max()

print(f"Maximum token count for 'disease' column: {max_disease_tokens}")
print(f"Maximum token count for 'drug' column: {max_drug_tokens}")

Maximum token count for 'disease' column: 26
Maximum token count for 'drug' column: 13


In [6]:
def compute_embeddings(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=32).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the embeddings (use the [CLS] token), then move them back to CPU and convert to numpy
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().squeeze().numpy()
    return embeddings

In [7]:
tqdm.pandas(desc="Computing disease embeddings")
df['disease_embedding'] = df['disease'].progress_apply(lambda x: compute_embeddings(x, tokenizer, model, device))

tqdm.pandas(desc="Computing drug embeddings")
df['drug_embedding'] = df['drug'].progress_apply(lambda x: compute_embeddings(x, tokenizer, model, device))

Computing disease embeddings: 100%|██████████| 18766/18766 [02:37<00:00, 119.11it/s]
Computing drug embeddings: 100%|██████████| 18766/18766 [02:35<00:00, 121.00it/s]


In [8]:
df.tail(10)

Unnamed: 0,disease,drug,disease_token_count,drug_token_count,disease_embedding,drug_embedding
18756,vaccinia virus,cidofovir,2,2,"[-0.32823208, -0.16160358, -0.15520449, 0.0810...","[0.036254346, 0.012836881, 0.095738396, -0.363..."
18757,vaginalis,metronidazole,2,1,"[-0.22511786, 0.21200362, -0.05131736, -0.2241...","[-0.05780682, 0.05072802, 0.16187474, 0.279515..."
18758,vasomotor symptom,Elinzanetant,4,5,"[0.022215698, 0.35601804, 0.041473925, -0.2789...","[0.1310547, 0.02591556, -0.06795354, 0.2680747..."
18759,vasomotor symptom,estradiol,4,1,"[0.022215698, 0.35601804, 0.041473925, -0.2789...","[-0.42348137, 0.19635782, -0.13909128, 0.03339..."
18760,vasomotor symptom,estrogens,4,1,"[0.022215698, 0.35601804, 0.041473925, -0.2789...","[-0.07127568, 0.7490191, 0.03212757, 0.0303196..."
18761,vasomotor symptom,fezolinetant,4,5,"[0.022215698, 0.35601804, 0.041473925, -0.2789...","[-0.035705805, 0.26630142, -0.019500958, 0.047..."
18762,vasomotor symptom,gabapentin,4,1,"[0.022215698, 0.35601804, 0.041473925, -0.2789...","[-0.08150145, -0.107544206, 0.111592196, -0.01..."
18763,vasomotor symptom,paroxetine,4,2,"[0.022215698, 0.35601804, 0.041473925, -0.2789...","[-0.04558692, 0.5095056, 0.086280674, 0.170817..."
18764,vasomotor symptom,progesterone,4,1,"[0.022215698, 0.35601804, 0.041473925, -0.2789...","[-0.26757017, 0.12531605, -0.12637241, -0.0235..."
18765,vasomotor symptom,venlafaxine,4,4,"[0.022215698, 0.35601804, 0.041473925, -0.2789...","[0.16102375, 0.5103733, 0.013180696, 0.1248195..."


In [9]:
df.drop(["disease_token_count" ,"drug_token_count"], axis=1, inplace=True)

In [11]:
df.to_parquet('../data/final_data_cleaned.parquet', engine='pyarrow')