# Preliminary Sentence Bert Analysis

Author: Crystal

In [1]:
#bert
from sentence_transformers import SentenceTransformer, util
import torch

import pandas as pd

import re

import nltk
#nltk.download() #input: punkt

from nltk import tokenize

#embeddings
embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
#embedder = SentenceTransformer("allenai/scibert_scivocab_uncased")

## Functions

In [21]:
def get_corpus_embeddings(dir):
    with open(dir) as f:
        ai_text = f.read()
    ai_corpus = tokenize.sent_tokenize(ai_text) #sentence tokenization
    ai_embeddings = embedder.encode(ai_corpus, show_progress_bar=True) # embeddings
    return ai_embeddings


ai_embeddings = get_corpus_embeddings("/home/zz3hs/git/dspg21RnD/data/dspg21RnD/ai_wiki_text.txt")

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
# k: number of similar sentences from AI corpus
# abstract: abstract from FEDERAL RePORTER
# print_result: if TRUE, print out the similar sentenses from AI corpus to each sentence in the abstract
def get_score(k, abstract, print_result = False):
    queries = tokenize.sent_tokenize(abstract) 

    # init a result list for scores
    result = []
    
    # Find the closest k sentences of the AI corpus for each query sentence (ML) based on cosine similarity
    top_k = min(k, len(ai_corpus))
    
    for query in queries: #compare each sentence in the abstract to the ai corpus
        query_embedding = embedder.encode(query, show_progress_bar=False) 
        
        # We use cosine-similarity and torch.topk to find the highest k scores
        cos_scores = util.pytorch_cos_sim(query_embedding, ai_embeddings)[0]
        
        top_results = torch.topk(cos_scores, k=top_k)   #get the top k scores
        result.append(top_results.values.tolist()) #unlist the top result list
        if print_result:
            print("\n\n======================\n\n")
            print("Query:", query)
            print("Results:", top_results)
            print("\nTop k=5 most similar sentences in corpus:")
            for score, idx in zip(top_results[0], top_results[1]):
                print(ai_corpus[idx], "(Score: {:.4f})".format(score))
    return result

## AI related articles and clean

### Example 1. Abstract on "FROM ALPHAGO TO POWER SYSTEM ARTIFICIAL INTELLIGENCE"

In [8]:
abstracts = pd.read_pickle("/home/zz3hs/git/dspg21RnD/data/dspg21RnD/smaller-final-dataset.pkl")

KeyboardInterrupt: 

In [13]:
abstracts[abstracts["PROJECT_TITLE"] =="FROM ALPHAGO TO POWER SYSTEM ARTIFICIAL INTELLIGENCE"]

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,ORG_COUNT,PI_COUNT,nchar,final_frqwds_removed,PROJECT_TERMS,PROJECT_TITLE,...,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
531810,1062091,1088974,The game of Go is an ancient board game which ...,2018,1,1,2379,"[game, ancient, board, game, consider, far, bo...",Address; Algorithms; Area; Artificial Intelli...,FROM ALPHAGO TO POWER SYSTEM ARTIFICIAL INTELL...,...,KNOXVILLE,TN,37996-0003,UNITED STATES,,,47.041,2018,330000.0,


We feed in ML Wiki article as a query.
For each sentence in the ML Wiki article, we identify the top 5 most similar/closest sentences from AI Wiki article based on cosine similarity

In [14]:
ai_related_p = abstracts["ABSTRACT"][531850]
ai_related_p

'Project SummaryGeneration of the eye, more so than many organs, requires precise control of its shape for optimal function.Obtaining knowledge of how the eye and lens is constructed during embryonic development is thereforeimportant to help describe the nature of ocular abnormalities that lead to major structural defects or moresubtle changes that alter vision. An example of a morphogenetic event required for the generation of organs isepithelial invagination. This process drives the inward bending of epithelia of several early organ systemsincluding that of the lens placode during early ocular development. Although several mechanisms have beenproposed to drive this process, such as apical constriction or local placodal growth, none have been foundsufficient to account for epithelial bending. We have recently observed that placodal cells change shape,move, and generate cytoskeletal structures in a planar polarized manner that produces a net flow of cellstoward the central placode. One

In [63]:
get_score(5, ai_related_p, print_result = False)

['Project SummaryGeneration of the eye, more so than many organs, requires precise control of its shape for optimal function.Obtaining knowledge of how the eye and lens is constructed during embryonic development is thereforeimportant to help describe the nature of ocular abnormalities that lead to major structural defects or moresubtle changes that alter vision.', 'An example of a morphogenetic event required for the generation of organs isepithelial invagination.', 'This process drives the inward bending of epithelia of several early organ systemsincluding that of the lens placode during early ocular development.', 'Although several mechanisms have beenproposed to drive this process, such as apical constriction or local placodal growth, none have been foundsufficient to account for epithelial bending.', 'We have recently observed that placodal cells change shape,move, and generate cytoskeletal structures in a planar polarized manner that produces a net flow of cellstoward the central

## Example 2. Abstract on "STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE"

In [53]:
abstract_text = abstracts["ABSTRACT"][0]#get the text
abstract_text

"The multiprotein complex y-secretase proteolytically cleaves the intramembrane region of amyloid precursorprotein (APP), which in turn forms the plaques found in Alzheimer's disease (AD) patients. The catalyticcomponent of Y-secretase is the intramembrane aspartyl protease (IAP) called presenilin. Mutations inpresenilin are directly linked to familial early-onset AD. Another known member of the IAP family is signalpeptide peptidase (SPP), which functions to further proteolyze remnant signal peptides after they have beencleaved by signal peptidase. Knowledge of the biochemistry and function of individual SPPs are onlybeginning to be elucidated, and homologues are found in all kingdoms of life. Presenilin and SPP exhibitsignificant sequence similarity, strongly suggesting they share structural and catalytic features. Thus, amolecular understanding of the more tractable SPP will likely impact drug design for presenilin and y-secretase. The goal of this proposal is to express, characteriz

In [54]:
get_score(5, abstract_text, print_result = False)

["The multiprotein complex y-secretase proteolytically cleaves the intramembrane region of amyloid precursorprotein (APP), which in turn forms the plaques found in Alzheimer's disease (AD) patients.", 'The catalyticcomponent of Y-secretase is the intramembrane aspartyl protease (IAP) called presenilin.', 'Mutations inpresenilin are directly linked to familial early-onset AD.', 'Another known member of the IAP family is signalpeptide peptidase (SPP), which functions to further proteolyze remnant signal peptides after they have beencleaved by signal peptidase.', 'Knowledge of the biochemistry and function of individual SPPs are onlybeginning to be elucidated, and homologues are found in all kingdoms of life.', 'Presenilin and SPP exhibitsignificant sequence similarity, strongly suggesting they share structural and catalytic features.', 'Thus, amolecular understanding of the more tractable SPP will likely impact drug design for presenilin and y-secretase.', 'The goal of this proposal is t

In [70]:
get_score(queries, print_result=False)

KeyboardInterrupt: 

### Example


In [9]:
text_Basilica = "Pope Julius' scheme for the grandest building in Christendom was the subject of a competition for which a number of entries remain intact in the Uffizi Gallery, Florence. It was the design of Donato Bramante that was selected, and for which the foundation stone was laid in 1506. This plan was in the form of an enormous Greek Cross with a dome inspired by that of the huge circular Roman temple, the Pantheon.[7] The main difference between Bramante's design and that of the Pantheon is that where the dome of the Pantheon is supported by a continuous wall, that of the new basilica was to be supported only on four large piers. This feature was maintained in the ultimate design. Bramante's dome was to be surmounted by a lantern with its own small dome but otherwise very similar in form to the Early Renaissance lantern of Florence Cathedral designed for Brunelleschi's dome by Michelozzo. Bramante had envisioned that the central dome would be surrounded by four lower domes at the diagonal axes. The equal chancel, nave and transept arms were each to be of two bays ending in an apse. At each corner of the building was to stand a tower, so that the overall plan was square, with the apses projecting at the cardinal points. Each apse had two large radial buttresses, which squared off its semi-circular shape."

In [10]:
queries = tokenize.sent_tokenize(text_Basilica)
print(queries[0:10])
print(len(queries))

["Pope Julius' scheme for the grandest building in Christendom was the subject of a competition for which a number of entries remain intact in the Uffizi Gallery, Florence.", 'It was the design of Donato Bramante that was selected, and for which the foundation stone was laid in 1506.', 'This plan was in the form of an enormous Greek Cross with a dome inspired by that of the huge circular Roman temple, the Pantheon.', "[7] The main difference between Bramante's design and that of the Pantheon is that where the dome of the Pantheon is supported by a continuous wall, that of the new basilica was to be supported only on four large piers.", 'This feature was maintained in the ultimate design.', "Bramante's dome was to be surmounted by a lantern with its own small dome but otherwise very similar in form to the Early Renaissance lantern of Florence Cathedral designed for Brunelleschi's dome by Michelozzo.", 'Bramante had envisioned that the central dome would be surrounded by four lower domes

In [15]:
get_score(4, queries, print_result = True)





Query: Pope Julius' scheme for the grandest building in Christendom was the subject of a competition for which a number of entries remain intact in the Uffizi Gallery, Florence.
Results: torch.return_types.topk(
values=tensor([0.2273, 0.2086, 0.2030, 0.2018]),
indices=tensor([ 53, 369,  22, 211]))

Top k=5 most similar sentences in corpus:
quiz show exhibition match, IBM's question answering system, Watson, defeated the two greatest Jeopardy! (Score: 0.2273)
Thought-capable artificial beings appeared as storytelling devices since antiquity, 
and have been a persistent theme in science fiction. (Score: 0.2086)
These issues have been explored by myth, fiction and philosophy since antiquity. (Score: 0.2030)
This tradition, centered at Carnegie Mellon University would eventually culminate in the development of the Soar architecture in the middle 1980s. (Score: 0.2018)




Query: It was the design of Donato Bramante that was selected, and for which the foundation stone was laid in 1506.

[[0.22725766897201538,
  0.2086402177810669,
  0.20304988324642181,
  0.20184537768363953],
 [0.17468996345996857,
  0.16498778760433197,
  0.15678071975708008,
  0.15291136503219604],
 [0.2660845220088959,
  0.25608712434768677,
  0.2545034885406494,
  0.24469813704490662],
 [0.2857217490673065,
  0.22828024625778198,
  0.18599683046340942,
  0.1650390774011612],
 [0.38453492522239685,
  0.35602161288261414,
  0.3248547911643982,
  0.31567105650901794],
 [0.30394724011421204,
  0.14987064898014069,
  0.1389949917793274,
  0.13767722249031067],
 [0.33625566959381104,
  0.18328112363815308,
  0.17916186153888702,
  0.1731284260749817],
 [0.19776922464370728,
  0.17027877271175385,
  0.1624986231327057,
  0.16105715930461884],
 [0.3348667323589325,
  0.26839780807495117,
  0.2606987953186035,
  0.21333608031272888],
 [0.2252057045698166,
  0.19740983843803406,
  0.19238469004631042,
  0.17855817079544067]]

In [None]:
result = []

# Find the closest k sentences of the AI corpus for each query sentence (ML) based on cosine similarity
top_k = min(k, len(ai_corpus))

query = queries[0]

query_embedding = embedder.encode(query, show_progress_bar=False) 

# We use cosine-similarity and torch.topk to find the highest k scores
cos_scores = util.pytorch_cos_sim(query_embedding, ai_embeddings)[0]

top_results = torch.topk(cos_scores, k=top_k)   #get the top k scores
result.append(top_results.values.tolist()) #unlist the top result list

print("\n\n======================\n\n")
print("Query:", query)
print("Results:", top_results)
print("\nTop k=5 most similar sentences in corpus:")
for score, idx in zip(top_results[0], top_results[1]):
    print(ai_corpus[idx], "(Score: {:.4f})".format(score))
