# Exploring Terms in the Encyclopaedia Britannica

## Similar terms within an edition - BERT - Transformers


https://theaidigest.in/how-to-do-semantic-document-similarity-using-bert/

### Loading the necessary libraries

In [1]:
import yaml
import matplotlib.pyplot as plt
import numpy as np
import collections
import matplotlib as mpl

In [2]:
import networkx as nx
import matplotlib.pyplot as plt

In [3]:
import os

In [4]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

### Functions

In [5]:
def get_document(uri):
    uri="<"+uri+">"
    sparql = SPARQLWrapper("http://35.228.63.82:3030/eb1/sparql")
    query="""
    PREFIX eb: <https://w3id.org/eb#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?definition ?term
        WHERE {{
            %s a eb:Article ;
               eb:name ?term ;
               eb:definition ?definition . 
            }
            UNION {
            %s a eb:Topic ;
              eb:name ?term ; 
              eb:definition ?definition . 
            }
       } 
    """ %(uri, uri)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    term = results["results"]["bindings"][0]["term"]["value"]
    definition=results["results"]["bindings"][0]["definition"]["value"]
    return term, definition

## We have dataframe with these columns

- definition:           Definition of a term
- editionNum:           1,2,3,4,5,6,7,8
- editionTitle:         Title of the edition
- header:               Header of the page's term                                  
- place:                Place where the volume was edited (e.g. Edinburgh)                                    
- relatedTerms:         Related terms (see X article)  
- altoXML:              File Path of the XML file from which the term belongs       
- term:                 Term name                            
- positionPage:         Position of ther term in the page     
- startsAt:             Number page in which the term definition starts 
- endsAt:               Number page in which the term definition ends 
- volumeTitle:          Title of the Volume
- typeTerm:             Type of term [Topic| Articles]                                       
- year:                 Year of the edition
- volumeNum:            Volume number (e.g. 1)
- letters:              leters of the volume (A-B)
- part:                 Part of the volume (e.g 1)
- supplement:           Supplement's Title
- supplementsTo:        It suppelements to editions [1, 2, 3....]
- numberOfWords:        Number of words per term definition
- numberOfTerms:        Number of terms per page
- numberOfPages:        Number of pages per volume

In [6]:
import rdflib
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
import networkx as nx
import matplotlib.pyplot as pl
from rdflib import Graph, ConjunctiveGraph, Namespace, Literal
from rdflib.plugins.sparql import prepareQuery

In [7]:
import networkx as nx
import matplotlib.pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON

### 2.  Selecting just the 100 first elements of  the first volume of 1771

In [8]:
sparql = SPARQLWrapper("http://35.228.63.82:3030/eb1/sparql")
query="""
PREFIX eb: <https://w3id.org/eb#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?definition ?uri ?term ?vnum ?year ?enum ?letters ?part
        WHERE {{
    	?uri a eb:Article .
    	?uri eb:name ?term .
        ?uri eb:definition ?definition . 
        ?v eb:hasPart ?uri.
        ?v eb:number ?vnum.
        ?v eb:letters ?letters .
        ?e eb:hasPart ?v.
        ?e eb:publicationYear ?year.
        ?e eb:number ?enum.
        OPTIONAL {?v eb:part ?part; }
        }
  		UNION {
    	?uri a eb:Topic .
    	?uri eb:name ?term . 
        ?uri eb:definition ?definition .
        ?v eb:hasPart ?uri.
        ?v eb:number ?vnum.
        ?v eb:letters ?letters .
        ?e eb:hasPart ?v.
        ?e eb:publicationYear ?year.
        ?e eb:number ?enum.
        OPTIONAL {?v eb:part ?part; }
        
        }
   }
""" 
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
results = sparql.query().convert()
documents=[]
terms_info=[]
uris=[]
for r in results["results"]["bindings"]:
    documents.append(r["definition"]["value"])
    uris.append(r["uri"]["value"])
    if "part" in r:
        terms_info.append([r["term"]["value"], r["enum"]["value"], r["year"]["value"], r["part"]["value"], r["vnum"]["value"], r["letters"]["value"]])
    else:
        terms_info.append([r["term"]["value"], r["enum"]["value"], r["year"]["value"], "" , r["vnum"]["value"], r["letters"]["value"]])


In [9]:
len(uris)
len(terms_info)

18117

In [10]:
import pickle
with open('documents_1ed.txt', 'wb') as fp:
    pickle.dump(documents, fp)
    
with open('terms_info_1ed.txt', 'wb') as fp2:
    pickle.dump(terms_info, fp2)
    
with open('uris_1ed.txt', 'wb') as fp3:
    pickle.dump(uris, fp3)

#### 3.1 Train Corpus

In [11]:

model = SentenceTransformer('emanjavacas/MacBERTh')

Downloading (…)2f64d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)7edf02f64d/README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)df02f64d/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Downloading (…)7edf02f64d/vocab.txt:   0%|          | 0.00/227k [00:00<?, ?B/s]

No sentence-transformers model found with name /Users/ly40/.cache/torch/sentence_transformers/emanjavacas_MacBERTh. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/ly40/.cache/torch/sentence_transformers/emanjavacas_MacBERTh were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a 

In [12]:
text_embeddings = model.encode(documents, batch_size = 8, show_progress_bar = True)

Batches:   0%|          | 0/2265 [00:00<?, ?it/s]

In [13]:
np.shape(text_embeddings)

(18117, 768)

In [14]:
all_embeddings_1ed = np.array(text_embeddings)
np.save('embeddings_1ed.npy', all_embeddings_1ed)

In [15]:
uri="https://w3id.org/eb/i/Article/9929192893804340_144850368_PAISLEY_0"
term, definition=get_document(uri)
definition_embedding= model.encode(definition, batch_size = 8, show_progress_bar = True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
similarity_def=cosine_similarity(
    [definition_embedding],
    text_embeddings)

In [17]:
similarities = cosine_similarity(text_embeddings)
print('pairwise dense output:\n {}\n'.format(similarities))

pairwise dense output:
 [[0.9999994  0.87249523 0.84896547 ... 0.813816   0.84602416 0.8092604 ]
 [0.87249523 1.0000001  0.9066804  ... 0.8861644  0.9074524  0.86981225]
 [0.84896547 0.9066804  0.9999995  ... 0.86914456 0.88573086 0.8514845 ]
 ...
 [0.813816   0.8861644  0.86914456 ... 1.         0.94909406 0.9389284 ]
 [0.84602416 0.9074524  0.88573086 ... 0.94909406 1.0000007  0.9250872 ]
 [0.8092604  0.86981225 0.8514845  ... 0.9389284  0.9250872  0.99999976]]



In [18]:
similarities_sorted = similarities.argsort()
similarities_sorted

array([[16146,  9183, 12086, ..., 16384,  7527,     0],
       [12086,  9183,  3011, ...,  5019,  7302,     1],
       [ 3011,  9183, 12086, ...,  9304, 17531,     2],
       ...,
       [16146,  9183, 11428, ...,  9964,  7100, 18114],
       [ 9183, 11428, 16146, ...,  9619, 18032, 18115],
       [16146,  3011, 12086, ...,  8142, 18047, 18116]])

In [19]:
id_1 = []
id_2 = []
score = []
for index,array in enumerate(similarities_sorted):
    p=len(array)
    id_1.append(index)
    id_2.append(array[-2])
    score.append(similarities[index][array[-2]])
index_df = pd.DataFrame({'id_1' : id_1,
                          'id_2' : id_2,
                          'score' : score})
print(p)

NameError: name 'pd' is not defined

In [None]:
index_df

In [None]:
documents[2]

In [None]:
documents[13278]