# Exploring Terms in the Encyclopaedia Britannica

## Similar terms within an edition - BERT - Transformers


https://theaidigest.in/how-to-do-semantic-document-similarity-using-bert/

### Loading the necessary libraries

In [29]:
import yaml
import matplotlib.pyplot as plt
import numpy as np
import collections
import matplotlib as mpl

In [30]:
import networkx as nx
import matplotlib.pyplot as plt

In [31]:
import os

In [32]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

### Functions

In [33]:
def get_document(uri, endpoint="http://localhost:3030/edition1st/sparql"):
    """Fetch the name and definition of one Article or Topic.

    Args:
        uri: IRI of the resource, without the surrounding angle brackets.
        endpoint: URL of the SPARQL endpoint to query. Defaults to the
            local first-edition endpoint used throughout this notebook.

    Returns:
        A ``(term, definition)`` tuple of strings taken from the first
        result binding.

    Raises:
        IndexError: If the endpoint returns no binding for ``uri``, i.e.
            there is no eb:Article or eb:Topic with that IRI that has both
            a name and a definition.
    """
    # The IRI is spliced into the query text, so wrap it in <...> here.
    resource = "<" + uri + ">"
    sparql = SPARQLWrapper(endpoint)
    query="""
    PREFIX eb: <https://w3id.org/eb#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?definition ?term
        WHERE {{
            %s a eb:Article ;
               eb:name ?term ;
               eb:definition ?definition . 
            }
            UNION {
            %s a eb:Topic ;
              eb:name ?term ; 
              eb:definition ?definition . 
            }
       } 
    """ % (resource, resource)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    if not bindings:
        # Same exception type the bare ``[0]`` lookup used to raise, but with
        # a message that says which resource was missing.
        raise IndexError(
            "no eb:Article or eb:Topic with a name and definition found for %s at %s"
            % (uri, endpoint)
        )

    first = bindings[0]
    return first["term"]["value"], first["definition"]["value"]

## We have a dataframe with these columns

- definition:           Definition of a term
- editionNum:           1,2,3,4,5,6,7,8
- editionTitle:         Title of the edition
- header:               Header of the page's term                                  
- place:                Place where the volume was edited (e.g. Edinburgh)                                    
- relatedTerms:         Related terms (see X article)  
- altoXML:              File Path of the XML file from which the term belongs       
- term:                 Term name                            
- positionPage:         Position of the term in the page
- startsAt:             Page number on which the term definition starts
- endsAt:               Page number on which the term definition ends
- volumeTitle:          Title of the Volume
- typeTerm:             Type of term [Topic | Article]
- year:                 Year of the edition
- volumeNum:            Volume number (e.g. 1)
- letters:              Letters of the volume (e.g. A-B)
- part:                 Part of the volume (e.g 1)
- supplement:           Supplement's Title
- supplementsTo:        Editions that the supplement belongs to [1, 2, 3, ...]
- numberOfWords:        Number of words per term definition
- numberOfTerms:        Number of terms per page
- numberOfPages:        Number of pages per volume

In [34]:
import rdflib
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
import networkx as nx
import matplotlib.pyplot as pl
from rdflib import Graph, ConjunctiveGraph, Namespace, Literal
from rdflib.plugins.sparql import prepareQuery

In [35]:
import networkx as nx
import matplotlib.pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON

### 2. Selecting all the terms (Articles and Topics) of the first edition (1771)

In [36]:
# Fetch every Article and Topic exposed by the endpoint together with the
# volume and edition metadata needed to identify each term.
sparql = SPARQLWrapper("http://localhost:3030/edition1st/sparql")
query="""
PREFIX eb: <https://w3id.org/eb#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?definition ?uri ?term ?vnum ?year ?enum ?letters ?part
        WHERE {{
    	?uri a eb:Article .
    	?uri eb:name ?term .
        ?uri eb:definition ?definition . 
        ?v eb:hasPart ?uri.
        ?v eb:number ?vnum.
        ?v eb:letters ?letters .
        ?e eb:hasPart ?v.
        ?e eb:publicationYear ?year.
        ?e eb:number ?enum.
        OPTIONAL {?v eb:part ?part; }
        }
  		UNION {
    	?uri a eb:Topic .
    	?uri eb:name ?term . 
        ?uri eb:definition ?definition .
        ?v eb:hasPart ?uri.
        ?v eb:number ?vnum.
        ?v eb:letters ?letters .
        ?e eb:hasPart ?v.
        ?e eb:publicationYear ?year.
        ?e eb:number ?enum.
        OPTIONAL {?v eb:part ?part; }
        
        }
   }
"""
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
# Execute the query exactly once: every call to query() is a full round trip
# to the endpoint and this result set is large.
results = sparql.query().convert()

# Three parallel lists, one entry per result binding:
#   documents[i]  - definition text
#   uris[i]       - IRI of the term
#   terms_info[i] - [term, edition number, year, part, volume number, letters]
documents = []
terms_info = []
uris = []
for r in results["results"]["bindings"]:
    documents.append(r["definition"]["value"])
    uris.append(r["uri"]["value"])
    # ?part is OPTIONAL in the query, so it may be absent from a binding.
    part = r["part"]["value"] if "part" in r else ""
    terms_info.append([
        r["term"]["value"],
        r["enum"]["value"],
        r["year"]["value"],
        part,
        r["vnum"]["value"],
        r["letters"]["value"],
    ])


In [37]:
# Sanity check on the collected lists. In a notebook only the value of the
# last expression in a cell is displayed, so just len(terms_info) is shown.
len(uris)
len(terms_info)

18117

In [38]:
import pickle

# Persist the three parallel lists so later sessions can reload them without
# querying the SPARQL endpoint again. The files are binary pickles despite
# the .txt extension.
for path, payload in (
    ('documents_1ed.txt', documents),
    ('terms_info_1ed.txt', terms_info),
    ('uris_1ed.txt', uris),
):
    with open(path, 'wb') as handle:
        pickle.dump(payload, handle)

#### 3.1 Train Corpus

In [None]:
# Instantiate the sentence-embedding model identified by
# 'bert-base-nli-mean-tokens'; it is used below to encode the definitions.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Encode every definition in `documents` into one embedding per document.
text_embeddings = model.encode(
    documents,
    batch_size=8,
    show_progress_bar=True,
)

In [None]:
# Display the dimensions of the embeddings
# (presumably (number of documents, embedding size) - TODO confirm).
np.shape(text_embeddings)

In [None]:
# Copy the embeddings into a NumPy array and write it to 'embeddings_1ed.npy'
# so the encoding step does not have to be repeated.
all_embeddings_1ed = np.array(text_embeddings)
np.save('embeddings_1ed.npy', all_embeddings_1ed)

In [None]:
# Look up one specific article by IRI and embed its definition with the same
# model that was used for the whole corpus.
uri = "https://w3id.org/eb/i/Article/9929192893804340_144850368_PAISLEY_0"
term, definition = get_document(uri)
definition_embedding = model.encode(
    definition,
    batch_size=8,
    show_progress_bar=True,
)

In [None]:
# Cosine similarity between the single definition embedding (wrapped in a
# list so it is passed as a one-row matrix) and every corpus embedding.
query_matrix = [definition_embedding]
similarity_def = cosine_similarity(query_matrix, text_embeddings)

In [None]:
# Pairwise cosine similarity of every corpus embedding against every other.
similarities = cosine_similarity(text_embeddings)
print(f'pairwise dense output:\n {similarities}\n')

In [None]:
# For each row, the column indices ordered from least to most similar.
similarities_sorted = np.argsort(similarities)
similarities_sorted

In [None]:
# pandas is needed for the result table and is not imported by any earlier
# cell shown in this notebook.
import pandas as pd

# Build a table with, for every document, its most similar *other* document.
#   id_1  - row index of the document
#   id_2  - index of its nearest neighbour
#   score - cosine similarity between the two
id_1 = []
id_2 = []
score = []
for index, ranked in enumerate(similarities_sorted):
    # `ranked` is ordered by ascending similarity, so walk it from the end.
    # The document itself normally comes last, but when other documents tie
    # with the self-similarity (e.g. identical definitions) the sort may put
    # it elsewhere, so skip `index` explicitly instead of assuming that
    # position -2 is the neighbour.
    neighbour = next((j for j in ranked[::-1] if j != index), None)
    if neighbour is None:
        # A single-document corpus has no neighbour to report.
        continue
    id_1.append(index)
    id_2.append(neighbour)
    score.append(similarities[index][neighbour])
index_df = pd.DataFrame({'id_1' : id_1,
                          'id_2' : id_2,
                          'score' : score})
# Number of candidates ranked per document (the row length). Taken from the
# matrix shape so it is defined even when the loop body never runs.
p = similarities_sorted.shape[1]
print(p)

In [None]:
# Display the table of (document, most similar document, score) rows.
index_df

In [None]:
# Show the definition text stored at position 2 of `documents`.
documents[2]

In [None]:
# Show the definition text stored at position 13278 of `documents`
# (presumably the nearest neighbour reported for document 2 - TODO confirm).
documents[13278]