# Embeddings


## Load Data


In [1]:
import sqlite3

import numpy as np
import pandas as pd
import torch
from transformers import BertModel, BertTokenizer

from litreview import ClinicalTrials

In [2]:
# Open the local SQLite database; this connection is reused by later cells.
db_connection = sqlite3.connect("../clinical_trials.db")
# Project wrapper around the connection, configured from a schema file.
trials = ClinicalTrials(
    connection=db_connection, schema_directory="../files/schema.json"
)

# The three text fields pulled for every study; the same list names the
# DataFrame columns, so query order and column order stay in sync.
columns = ["BriefTitle", "OfficialTitle", "BriefSummary"]
data = pd.DataFrame(trials.query(*columns), columns=columns)
data

Unnamed: 0,BriefTitle,OfficialTitle,BriefSummary
0,Sargramostim in Treating Patients With Chronic...,Phase II Study of GM-CSF in Patients With Chro...,"RATIONALE: Colony-stimulating factors, such as..."
1,Effect of Vardenafil on Blood Pressure in Pati...,Effect of Vardenafil on Blood Pressure in Pati...,The purpose of this study is to investigate th...
2,Investigate the Exposure to Selected Smoke Con...,"A Controlled, Randomised, Open-label, 3-arm Pa...",The overall purpose of this clinical study con...
3,Brain-imaging and Adolescent Neuroscience Cons...,Brain-imaging and Adolescent Neuroscience Cons...,This is a multi-site study of adolescents 12-2...
4,Leverage Noninvasive Transcutaneous Vagus Nerv...,Leveraging Noninvasive Transcutaneous Vagus Ne...,"Suicidal thoughts, suicide attempts, and suici..."
...,...,...,...
995,Studying DNA in Blood and Bone Marrow Samples ...,Genome-Wide Interrogations in Childhood Acute ...,RATIONALE: Studying samples of blood and bone ...
996,Comparing Conservative to Surgical Treatment o...,Comparing Effectiveness of a Conservative Poli...,RESEARCH QUESTION Is the effectiveness of cons...
997,Valuation of Efficacy and Safety of Vitamin D3...,"Randomized, Double-blind, Placebo-controlled T...",Fibromyalgia is a disease that significantly d...
998,Physical Workload Identify in Chest Compressio...,,The chest compression depth decreases over tim...


## Apply BERT Encoder

In [3]:
# Load the pretrained "bert-base-uncased" tokenizer and encoder. Both are bound
# as notebook globals and are read by `encode_bert` below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")


def encode_bert(text, max_length=512):
    """Embed ``text`` as the mean of BERT's final-layer token vectors.

    Uses the notebook-level ``tokenizer`` and ``model`` globals.

    Args:
        text: A single string to encode.
        max_length: Upper bound on the number of tokens fed to the model.
            Longer inputs are truncated instead of failing in the forward
            pass; the default of 512 matches the size of the position
            embedding table of the BERT model used in this notebook.

    Returns:
        A 1-D NumPy array with one value per hidden unit.
    """
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=max_length
    )
    # Inference only, so no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # Remove only the batch axis (dim 0). A bare squeeze() would also drop the
    # token axis for a one-token input and make the mean collapse to a scalar.
    token_vectors = outputs.last_hidden_state.squeeze(0)
    return token_vectors.mean(dim=0).numpy()



In [4]:
# Widen column display so long titles are shown in full rather than elided.
pd.options.display.max_colwidth = 1000

# Bare trial identifiers; quoting is left to the database driver.
sample_trials = ["NCT01126879", "NCT03793179", "NCT01801579"]
# One "?" placeholder per identifier, bound through `params` instead of being
# spliced into the SQL text.
placeholders = ", ".join("?" for _ in sample_trials)
X = pd.read_sql_query(
    f"SELECT BriefTitle, BriefSummary FROM Study WHERE NCTId IN ({placeholders})",
    db_connection,
    params=sample_trials,
)
X["BriefTitle"]

0                                                                                     Genistein in Treating Patients With Prostate Cancer
1                                                                          Reproducibility of Ankle Brachial Index After Maximal Exercise
2    Testing the Timing of Pembrolizumab Alone or With Chemotherapy as First Line Treatment and Maintenance in Non-small Cell Lung Cancer
Name: BriefTitle, dtype: object

In [5]:
# Encode each sampled summary with `encode_bert`; `.apply` yields one vector per
# row, so `embeddings[i]` is the embedding of the i-th sampled trial.
embeddings = X["BriefSummary"].apply(encode_bert).to_numpy()

In [6]:
from scipy.spatial.distance import cosine


def cosine_similarity(a, b):
    """Return the cosine similarity between two 1-D vectors.

    SciPy's ``cosine`` is a distance (one minus the similarity), so it is
    subtracted from 1 here. The value is returned rather than printed so
    callers can reuse it.
    """
    return 1 - cosine(a, b)


# Pairwise similarity of the three sampled summaries: (0, 1), (1, 2), (0, 2).
for first, second in [(0, 1), (1, 2), (0, 2)]:
    print(cosine_similarity(embeddings[first], embeddings[second]))

0.7053055270306511
0.7376065275016618
0.8812309189689884


## BERT Tokenization and Hidden States Walkthrough


In [7]:
# Re-load the same "bert-base-uncased" tokenizer used earlier in the notebook,
# so this walkthrough section starts from an explicit tokenizer binding.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [8]:
sentence = "Here is the sentence I want embeddings for"
# Show the tokenizer's split of the sentence on a single line, with the tokens
# separated by spaces.
print(" ".join(tokenizer.tokenize(sentence)), end=" ")

here is the sentence i want em ##bed ##ding ##s for 

In [9]:
# Example sentence in which the word "bank" occurs three times.
text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."

# Add BERT's special start ([CLS]) and end ([SEP]) markers by hand.
marked_text = f"[CLS] {text} [SEP]"

# Split the marked sentence into tokens.
tokenized_text = tokenizer.tokenize(marked_text)

# Map token strings to their vocabulary indices.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Print one "token  id" row per token, with the token left-aligned in a
# 12-character column.
for token, index in zip(tokenized_text, indexed_tokens):
    print(f"{token:<12} {index}")

[CLS]        101
after        2044
stealing     11065
money        2769
from         2013
the          1996
bank         2924
vault        11632
,            1010
the          1996
bank         2924
robber       27307
was          2001
seen         2464
fishing      5645
on           2006
the          1996
mississippi  5900
river        2314
bank         2924
.            1012
[SEP]        102


BERT is trained on, and expects, sentence pairs, and uses segment IDs (0s and 1s) to tell the two sentences apart. Since we are only encoding a single sentence, every token gets the same segment ID, so we just need a vector of 1s with one entry per token.


In [10]:
# One segment id per token; every token gets the same id because the input is a
# single sentence.
# NOTE(review): Hugging Face tokenizers conventionally emit segment id 0 for a
# single sentence - confirm that 1s are intended here.
segments_id = [1] * len(tokenized_text)
print(segments_id)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Convert our data into torch tensors for the BERT model.


In [11]:
# Wrapping each list in an outer list adds a leading batch axis of size 1, so
# both tensors are 2-D with a single row.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensor = torch.tensor([segments_id])

In [12]:
# Rebind `model` to a BERT encoder configured to also return the hidden states
# of every layer, which the following cells inspect.
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
# Switch to evaluation mode for inference.
model.eval()



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [13]:
# Inference only: skip gradient tracking.
with torch.no_grad():
    # Pass the segment ids by keyword. The second positional parameter of
    # BertModel.forward is `attention_mask`, so passing them positionally
    # would feed the segment ids in as an attention mask instead.
    outputs = model(tokens_tensor, token_type_ids=segments_tensor)
    # Present because the model was loaded with output_hidden_states=True;
    # accessed by name rather than by tuple position.
    hidden_states = outputs.hidden_states

In [14]:
# Walk down the nesting of `hidden_states` one level at a time, printing the
# length at each level: layers -> batches -> tokens -> hidden units.
print(f"Number of layers: {len(hidden_states)}, (initial embeddings + 12 BERT layers)")
layer_i = 0

print(f"Number of batches: {len(hidden_states[layer_i])}")
batch_i = 0

print(f"Number of tokens: {len(hidden_states[layer_i][batch_i])}")
token_i = 0

print(f"Number of hidden units: {len(hidden_states[layer_i][batch_i][token_i])}")

Number of layers: 13, (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 22
Number of hidden units: 768


In [15]:
from transformers import pipeline

sentence = "Scarcity arises due to unlimited human wants and limited <mask> available"
# Name the checkpoint and revision explicitly instead of relying on the task
# default, so the predictions do not change if the library's default changes.
# These are the values the unpinned call reported falling back to.
unmasker = pipeline(
    "fill-mask", model="distilbert/distilroberta-base", revision="ec58a5b"
)
# Return the three highest-scoring fillers for the <mask> position.
unmasker(sentence, top_k=3)

No model was supplied, defaulted to distilbert/distilroberta-base and revision ec58a5b (https://huggingface.co/distilbert/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `P

[{'score': 0.4738076329231262,
  'token': 1915,
  'token_str': ' resources',
  'sequence': 'Scarcity arises due to unlimited human wants and limited resources available'},
 {'score': 0.10559964925050735,
  'token': 5717,
  'token_str': ' choices',
  'sequence': 'Scarcity arises due to unlimited human wants and limited choices available'},
 {'score': 0.04467453435063362,
  'token': 1735,
  'token_str': ' options',
  'sequence': 'Scarcity arises due to unlimited human wants and limited options available'}]

## Sentence BERT with ClinicalBERT

In [16]:
from sentence_transformers import SentenceTransformer

# Wrap the Bio_ClinicalBERT checkpoint as a sentence encoder.
# NOTE: this rebinds the global `model`, which `encode_bert` also reads, so
# re-running `encode_bert` after this cell would use this object instead.
model = SentenceTransformer("emilyalsentzer/Bio_ClinicalBERT")

No sentence-transformers model found with name emilyalsentzer/Bio_ClinicalBERT. Creating a new one with mean pooling.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [17]:
# Pull the sampled summaries out of the DataFrame as a NumPy array of strings,
# ready to be passed to the sentence encoder.
brief_summaries = X["BriefSummary"].to_numpy()
brief_summaries

array(['RATIONALE: Genistein may stop the growth of tumor cells by blocking some of the enzymes needed for cell growth.\n\nPURPOSE: This randomized phase II trial is studying how well genistein works in treating patients with prostate cancer.',
       'Hemodynamic changes in the lower limbs are very important and rapid after maximal exercise. The automatic method allows a fastest measurement of the Ankle-Brachial Index (ABI). Thus, it appears important to know whether automatic assessment of ABI is as reliable and reproducible as the manual method.',
       "This phase III trial studies whether pembrolizumab alone as a first-line treatment, followed by pemetrexed and carboplatin with or without pembrolizumab after disease progression is superior to induction with pembrolizumab, pemetrexed and carboplatin followed by pembrolizumab and pemetrexed maintenance in treating patients with stage IV non-squamous non-small cell lung cancer. Immunotherapy with monoclonal antibodies, such as pembr

In [20]:
# Encode all summaries in one call; the result has one row per summary.
# NOTE: this rebinds `embeddings`, replacing the BERT mean-pooled vectors
# computed earlier in the notebook.
embeddings = model.encode(brief_summaries)
embeddings

array([[-1.5263325e-02, -3.7852064e-02, -2.5886497e-01, ...,
         2.2535813e-01,  3.2457768e-04, -2.3142166e-01],
       [-6.4365245e-02, -8.7235592e-02, -1.7244610e-01, ...,
         1.7501509e-01, -6.2705778e-02, -4.0996354e-02],
       [-8.9758977e-02, -8.6140469e-02, -2.3973340e-01, ...,
         1.4750561e-01, -8.8185981e-02, -3.7618810e-01]], dtype=float32)

In [19]:
# Imported under an alias so it does not silently replace the notebook's own
# two-vector `cosine_similarity` helper defined earlier.
from sklearn.metrics.pairwise import cosine_similarity as pairwise_cosine_similarity

# Compare the first summary's embedding against each of the remaining ones;
# the result is a single row with one similarity per remaining summary.
similarity = pairwise_cosine_similarity([embeddings[0]], embeddings[1:])
similarity

array([[0.9147997, 0.9162271]], dtype=float32)