In [2]:
# !pip install transformers

In [1]:
import torch
from transformers import BertModel, BertConfig, BertTokenizer

In [2]:
# utility function for getting segments
def get_segments(tokens):
    """Assign a 0/1 segment id to every token, switching after each "[SEP]".

    The token list is echoed to stdout before processing. A "[SEP]" token
    receives the id of the segment it closes; the id flips for the tokens
    that follow it.

    Args:
        tokens: iterable of token strings.

    Returns:
        list of ints (0 or 1), one per token.
    """
    print(tokens)
    segment_ids = []
    separators_seen = 0
    for token in tokens:
        # Parity of the number of separators seen so far is the segment id.
        segment_ids.append(separators_seen % 2)
        if token == "[SEP]":
            separators_seen += 1
    return segment_ids

  and should_run_async(code)


In [3]:
def get_ids(tokens, tokenizer):
    """Convert tokens to ids by delegating to ``tokenizer.convert_tokens_to_ids``.

    Args:
        tokens: tokens to convert.
        tokenizer: object exposing ``convert_tokens_to_ids``.

    Returns:
        Whatever ``tokenizer.convert_tokens_to_ids(tokens)`` returns.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

In [4]:
def encode_sentence(sent, tokenizer):
    """Tokenize a sentence and wrap it in "[CLS]" ... "[SEP]" markers.

    Args:
        sent: sentence text handed to ``tokenizer.tokenize``.
        tokenizer: object exposing ``tokenize`` that returns a list of tokens.

    Returns:
        list: "[CLS]", then the tokenizer's tokens, then "[SEP]".
    """
    framed_tokens = ["[CLS]"] + tokenizer.tokenize(sent)
    framed_tokens.append("[SEP]")
    return framed_tokens

In [5]:
def get_model(model_string='bert-base-uncased'):
    """Load the config, model and tokenizer for a pretrained BERT checkpoint.

    The config is created with ``output_hidden_states=True`` and passed to the
    model so that forward passes also return per-layer hidden states.

    Args:
        model_string: checkpoint identifier given to each ``from_pretrained`` call.

    Returns:
        tuple: ``(model, tokenizer, config)``.
    """
    bert_config = BertConfig.from_pretrained(model_string, output_hidden_states=True)
    bert_model = BertModel.from_pretrained(model_string, config=bert_config)
    bert_tokenizer = BertTokenizer.from_pretrained(model_string)
    return bert_model, bert_tokenizer, bert_config

In [6]:
def get_sentence_embedding(sent, model, tokenizer, config):
    """Embed a sentence as the mean of its second-to-last-layer BERT token vectors.

    Args:
        sent: sentence text.
        model: BERT model whose forward pass returns the per-layer hidden
            states as its third output (i.e. loaded with
            ``output_hidden_states=True``).
        tokenizer: tokenizer exposing ``tokenize`` and ``convert_tokens_to_ids``.
        config: not used by this function; kept so existing callers keep working.

    Returns:
        torch.Tensor: mean over dim 0 of the first batch item's token vectors
        taken from the second-to-last encoder layer.
    """
    tokens = encode_sentence(sent, tokenizer)
    segments_idx = get_segments(tokens)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    # Wrap in an outer list to add the batch dimension (batch of one sentence).
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_idx])
    model.eval()

    with torch.no_grad():
        # Pass the segment ids by keyword: the second positional parameter of
        # transformers' BertModel.forward is `attention_mask`, so passing them
        # positionally would use the all-zero segment ids as an attention mask.
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)

    # outputs[2] holds the embedding output followed by one tensor per encoder layer.
    hidden_states = outputs[2]
    # Second-to-last encoder layer, first (only) batch item. Indexing from the
    # end keeps this correct for checkpoints that do not have exactly 12 layers
    # (for a 12-layer model this is the same tensor as hidden_states[1:][10]).
    token_vecs = hidden_states[-2][0]
    # Average across the tokens of the sentence.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding

In [7]:
def get_document_embedding(lstdocuments, model, tokenizer, config ):
    """Embed every document in ``lstdocuments`` with ``get_sentence_embedding``.

    Args:
        lstdocuments: iterable of document/sentence texts.
        model, tokenizer, config: forwarded unchanged to ``get_sentence_embedding``.

    Returns:
        list: one embedding per input document, in input order.
    """
    return [
        get_sentence_embedding(document, model, tokenizer, config)
        for document in lstdocuments
    ]

In [8]:
lst_corpus = []
import os
from nltk.tokenize import sent_tokenize

# NOTE(review): machine-specific absolute path; point this at your own corpus
# folder before running the notebook elsewhere.
path='C:/Users/Priyanka/Desktop/ML_DEPL/text-analysis/summary/'

# Read every file in the folder and split its text into sentences.
filelist = os.listdir(path)
for file in filelist:
    file_path = os.path.join(path, file)
    # Skip sub-directories and other non-file entries; open() would fail on them.
    if not os.path.isfile(file_path):
        continue
    # The context manager closes the handle even if reading raises.
    with open(file_path, "r") as f:
        text = f.read()
    lst_corpus += sent_tokenize(text)

print(lst_corpus)

# Queries that are matched against the corpus sentences.
test_sentences = ["World Health Organisation concerned about covid?", "Corona virus vaccines global impact.", "US strategy for covid."]

['Success from two leading coronavirus vaccine programs likely means other frontrunners will also show strong protection against COVID-19, Bill Gates said Tuesday.', 'The fact that two coronavirus vaccines recently showed strong protection against COVID-19 bodes well for other leading programs led by AstraZeneca, Novavax, and Johnson & Johnson, Bill Gates said Tuesday.The billionaire Microsoft founder and philanthropist said it will be easier to boost manufacturing and distribute these other shots to the entire world, particularly developing nations.The vaccine space has seen a flurry of good news in recent days, marked by overwhelming success in late-stage trials by both Pfizer and Moderna.', '"With the very good news from Pfizer and Moderna, we think it\'s now likely that AstraZeneca, Novavax, and Johnson & Johnson will also likely show very strong efficacy," Gates told journalist Andrew Ross Sorkin.', 'World Health Assembly charts course for COVID-19 response and global health prior

In [12]:
# Load model, tokenizer and config once; both embedding passes below reuse them.
model, tokenizer, config = get_model()
# Embed the queries first, then the corpus sentences (same call order as before,
# so the token lists printed during embedding appear in the same order).
test_embeds = get_document_embedding(
    lstdocuments=test_sentences, model=model, tokenizer=tokenizer, config=config
)
doc_embeds = get_document_embedding(
    lstdocuments=lst_corpus, model=model, tokenizer=tokenizer, config=config
)

Downloading: 100%|██████████| 433/433 [00:00<00:00, 144kB/s]
Downloading: 100%|██████████| 440M/440M [01:25<00:00, 5.17MB/s]
Downloading: 100%|██████████| 232k/232k [00:01<00:00, 137kB/s] 
['[CLS]', 'world', 'health', 'organisation', 'concerned', 'about', 'co', '##vid', '?', '[SEP]']
['[CLS]', 'corona', 'virus', 'vaccines', 'global', 'impact', '.', '[SEP]']
['[CLS]', 'us', 'strategy', 'for', 'co', '##vid', '.', '[SEP]']
['[CLS]', 'success', 'from', 'two', 'leading', 'corona', '##virus', 'vaccine', 'programs', 'likely', 'means', 'other', 'front', '##runner', '##s', 'will', 'also', 'show', 'strong', 'protection', 'against', 'co', '##vid', '-', '19', ',', 'bill', 'gates', 'said', 'tuesday', '.', '[SEP]']
['[CLS]', 'the', 'fact', 'that', 'two', 'corona', '##virus', 'vaccines', 'recently', 'showed', 'strong', 'protection', 'against', 'co', '##vid', '-', '19', 'bo', '##des', 'well', 'for', 'other', 'leading', 'programs', 'led', 'by', 'as', '##tra', '##zen', '##eca', ',', 'nova', '##va', '##x

In [13]:
import scipy
from scipy.spatial.distance import cosine

def calculate_distances(query_embedding, document_emdeddings):
    """Compute the SciPy cosine distance from a query vector to each document vector.

    Args:
        query_embedding: 1-D vector for the query.
        document_emdeddings: iterable of 1-D vectors, one per document.

    Returns:
        list: cosine distances, in the same order as ``document_emdeddings``.
    """
    return [
        cosine(query_embedding, doc_vector)
        for doc_vector in document_emdeddings
    ]

In [14]:
# Number of best-matching corpus sentences to print for each query.
closest_n = 3
for query, query_embedding in zip(test_sentences, test_embeds):
    distances = calculate_distances(query_embedding, doc_embeds)

    # Pair every distance with its corpus index and order by ascending distance
    # (smallest distance = most similar sentence).
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop %s most similar sentences in corpus:\n" % closest_n)

    for idx, distance in results[:closest_n]:
        # Report similarity (1 - distance) rather than the raw distance.
        print(lst_corpus[idx].strip(), "(Score: %.4f)" % (1 - distance))





Query: World Health Organisation concerned about covid?

Top 3 most similar sentences in corpus:

World Health Assembly charts course for COVID-19 response and global health priorities. (Score: 0.7883)
Coronavirus vaccines: Will any countries get left out? (Score: 0.7868)
A landmark global vaccine plan known as Covax is seeking to ensure an equitable distribution of future coronavirus vaccines. (Score: 0.7468)




Query: Corona virus vaccines global impact.

Top 3 most similar sentences in corpus:

Coronavirus: Belgium facing 'tsunami' of new infections. (Score: 0.8036)
A landmark global vaccine plan known as Covax is seeking to ensure an equitable distribution of future coronavirus vaccines. (Score: 0.8020)
World Health Assembly charts course for COVID-19 response and global health priorities. (Score: 0.7999)




Query: US strategy for covid.

Top 3 most similar sentences in corpus:

World Health Assembly charts course for COVID-19 response and global health priorities. (Score: 0.