# Exploring Terms in the Encyclopaedia Britannica

## Similar terms within an edition - BERT - Transformers


https://theaidigest.in/how-to-do-semantic-document-similarity-using-bert/

### Loading the necessary libraries

In [1]:
import yaml
import matplotlib.pyplot as plt
import numpy as np
import collections
import matplotlib as mpl

In [2]:
import networkx as nx
import matplotlib.pyplot as plt

In [3]:
import os

In [4]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

### Functions

In [5]:
def get_document(uri):
    uri="<"+uri+">"
    sparql = SPARQLWrapper("http://localhost:3030/edition1st/sparql")
    query="""
    PREFIX eb: <https://w3id.org/eb#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?definition ?term
        WHERE {{
            %s a eb:Article ;
               eb:name ?term ;
               eb:definition ?definition . 
            }
            UNION {
            %s a eb:Topic ;
              eb:name ?term ; 
              eb:definition ?definition . 
            }
       } 
    """ %(uri, uri)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    term = results["results"]["bindings"][0]["term"]["value"]
    definition=results["results"]["bindings"][0]["definition"]["value"]
    return term, definition

In [6]:
def get_topic_name(index_uri, topics, topic_model):
    topic_num=topics[index_uri]
    topic_info=topic_model.get_topic(topic_num)
    topic_name=""
    #lets get the first 4 elements
    cont = 0
    for i in topic_info:
        topic_name=topic_name+"_"+i[0]
        cont=cont+1
        if cont == 4:
            break
    topic_name=str(topic_num)+topic_name
    return topic_name

In [7]:
import rdflib
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
import networkx as nx
import matplotlib.pyplot as pl
from rdflib import Graph, ConjunctiveGraph, Namespace, Literal
from rdflib.plugins.sparql import prepareQuery

In [21]:
import networkx as nx
import matplotlib.pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON

### 2.  Selecting just the 100 first elements of  the first volume of 1771

In [8]:
import pickle
with open ('documents_1ed.txt', 'rb') as fp:
    documents = pickle.load(fp)
with open('terms_info_1ed.txt', 'rb') as fp2:
    terms_info = pickle.load(fp2)
with open('uris_1ed.txt', 'rb') as fp3:
    uris = pickle.load(fp3)

In [9]:
embeddings = np.load('embeddings_1ed.npy')

In [10]:
from bertopic import BERTopic
#sentence_model = BERTopic(embedding_model="all-MiniLM-L6-v2")
#topic_model_1 = BERTopic(language="english")
#topic_model_2 = BERTopic(language="english", top_n_words=10, embedding_model=sentence_model)
#topics, probs = topic_model_1/2.fit_transform(documents)


In [11]:
topic_model = BERTopic().fit(documents, embeddings)


In [12]:
topics, probs = topic_model.transform(documents, embeddings)

In [13]:
len(topics)

18117

In [14]:
len(documents)

18117

In [15]:
for i in range(0, 10):
    print(topics[i], documents[i], uris[i])
    print(topic_model.get_topic(topics[i])[:10])
    print("---")

9 a town of Scotland, in the county of Renfrew, six miles west of Glasgow. https://w3id.org/eb/i/Article/9929192893804340_144850368_PAISLEY_0
[('scotland', 0.14774680007491345), ('county', 0.0671034002642839), ('frith', 0.043279608603484176), ('town', 0.03737127083726744), ('parliamenttown', 0.035789270657503404), ('miles', 0.03553824686143193), ('clyde', 0.02982439221458617), ('edinburgh', 0.0288472611255397), ('north', 0.02753062914563609), ('situated', 0.02617670537970505)]
---
3 the superior of a convent of monks,.or the next under the abbot. See Abbot. PRISCILLIANISTS, in church-history, Christian heretics, so called from their leader Prifcillian, a Spaniard by birth, and bilhop of Avila. He is said to have praftifed magic, and to have maintained the principal errors of the Manichees ; but his peculiar tenet was, that it is lawful to make false oaths, in order to support one’s cause and https://w3id.org/eb/i/Article/9929192893804340_144850368_PRIOR_0
[('church', 0.0237664933351384

In [16]:
topic_model.get_topic(0)

[('class', 0.02212807875511612),
 ('genus', 0.021441929876905172),
 ('botany', 0.02027541069503763),
 ('calix', 0.019935491914508805),
 ('species', 0.01899563290722906),
 ('corolla', 0.0182167511146065),
 ('natives', 0.016129827483859212),
 ('britain', 0.014189982004758765),
 ('five', 0.011489189349029606),
 ('petals', 0.011115621848977436)]

In [17]:
info_df=topic_model.get_topic_info()
info_df

Unnamed: 0,Topic,Count,Name
0,-1,6024,-1_his_our_earth_time
1,0,1316,0_class_genus_botany_calix
2,1,783,1_latitude_equation_equal_line
3,2,606,2_head_child_uterus_pelvis
4,3,383,3_church_who_churchhistory_god
...,...,...,...
219,218,10,218_ten_market_southwest_cockermouth
220,219,10,219_os_bone_muscles_bones
221,220,10,220_antiquity_chlamys_children_jewels
222,221,10,221_illegal_deed_bootless_emendation


In [18]:
import pandas as pd

In [19]:
topics_docs = pd.DataFrame (topics, columns = ['topic_num'])

In [20]:
topics[0]

9

In [21]:
topics_docs

Unnamed: 0,topic_num
0,9
1,3
2,3
3,-1
4,2
...,...
18112,19
18113,1
18114,-1
18115,-1


In [22]:
t=topics_docs.groupby("topic_num")["topic_num"].count()
t

topic_num
-1      6024
 0      1316
 1       783
 2       606
 3       383
        ... 
 218      10
 219      10
 220      10
 221      10
 222      10
Name: topic_num, Length: 224, dtype: int64

In [23]:
t.loc[64]

44

In [24]:
docs_topic_9 = topics_docs[topics_docs['topic_num'] == 9]
docs_topic_9

Unnamed: 0,topic_num
0,9
61,9
155,9
400,9
618,9
...,...
17290,9
17336,9
17772,9
17894,9


In [280]:
print(documents[33], uris[3])

a port-town of Granada, in Spain, situated upon the Mediterranean: W. Ipng. 3 0 45'. N. lat. 36° 40'. https://w3id.org/eb/i/Article/992277653804341_144133903_ODYSSEY_0


In [203]:
print(documents[720], uris[720])

a parliament-town of Scotland, in the isle of Bute: W. long. 5°, and N. lat. 55° 5c/. https://w3id.org/eb/i/Article/9929192893804340_144850368_ROTHSAY_0


In [147]:
#topics, similarity = topic_model.find_topics("sports", top_n=5)

In [25]:
topic_model.visualize_topics()



In [263]:
topic_model.visualize_barchart()

In [149]:
topic_model.visualize_heatmap()

In [264]:
# Save model
#topic_model.save("BerTopic_Model")

In [266]:
# Save topics and dataframes
#import pickle
#with open('topics.txt', 'wb') as fp:
#    pickle.dump(topics, fp)

In [289]:
topics[0]

10

In [297]:
t_names=[]
topics_names=[]
for i in range(0, len(topics)):
    topic_name=get_topic_name(i, topics, topic_model)
    if topic_name not in t_names:
        t_names.append(topic_name)
    topics_names.append(topic_name)

In [299]:
len(topics_names)

18117

In [300]:
topics_names[0]

'10_scotland_county_frith_town'

In [301]:
#with open('t_names.txt', 'wb') as fp:
#    pickle.dump(t_names, fp)

#with open('topics_names.txt', 'wb') as fp:
#    pickle.dump(topics_names, fp)

In [152]:
topic_model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,6266
1,0,1291
2,1,313
3,2,292
4,3,250
...,...,...
223,229,11
222,226,11
231,230,10
232,231,10


In [158]:
topic_model.get_topics()

{-1: [('side', 0.001794927733949683),
  ('where', 0.0017610711272301027),
  ('time', 0.0017302123687343873),
  ('great', 0.0017160001899043083),
  ('small', 0.0017048844118591052),
  ('os', 0.001695534365166071),
  ('after', 0.0016873669624766004),
  ('parts', 0.0016767682372148388),
  ('part', 0.001676709877772037),
  ('each', 0.0016732798612058382)],
 0: [('class', 0.022010753470246686),
  ('genus', 0.021283691018210055),
  ('botany', 0.019992801188237874),
  ('calix', 0.019945897816139938),
  ('species', 0.018761463329935703),
  ('corolla', 0.018182147168159524),
  ('natives', 0.01618337779416348),
  ('britain', 0.014180741624457845),
  ('five', 0.0114079938336644),
  ('petals', 0.01108144882360946)],
 1: [('italy', 0.08752262814250823),
  ('naples', 0.055627615740758855),
  ('venice', 0.03817390207431729),
  ('lat', 0.03664865478837265),
  ('city', 0.035438796488271376),
  ('miles', 0.03422512543581712),
  ('town', 0.03394739682807627),
  ('situated', 0.03160708139167414),
  ('king

In [275]:
a=topic_model.get_topic(10)

In [277]:
b=""
for i in a:
    b=b+"_"+i[0]
b

'_scotland_county_frith_town_miles_parliamenttown_edinburgh_clyde_north_situated'

In [211]:
topic_model.get_params()

{'calculate_probabilities': False,
 'embedding_model': None,
 'hdbscan_model': HDBSCAN(min_cluster_size=10, prediction_data=True),
 'language': 'english',
 'low_memory': False,
 'min_topic_size': 10,
 'n_gram_range': (1, 1),
 'nr_topics': None,
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=5, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 'vectorizer_model': CountVectorizer(),
 'verbose': False}

In [302]:
topic_model.visualize_barchart()