# Exploring Terms in the Encyclopaedia Britannica

## Similar articles within an edition - Gensim - Doc2Vec


### Loading the necessary libraries

In [1]:
import yaml
import matplotlib.pyplot as plt
import numpy as np
import collections
import matplotlib as mpl

In [2]:
import networkx as nx
import matplotlib.pyplot as plt

In [3]:
import pandas as pd
from yaml import safe_load
from pandas.io.json import json_normalize

In [4]:
import gensim
from gensim.models.doc2vec import Doc2Vec

In [5]:
from postprocess_scripts.doc2vec_prep import stem_text, clean_text, generate_documents_df

In [6]:
from tqdm import tqdm
import os


#### Hyperparameters

In [7]:
# Init the Doc2Vec model
hyperparams  = {
    'dm': 1,
    'vector_size': 300,
    'window': 5,
    'alpha': 0.025,
    'min_alpha': 0.00025,
    'min_count': 2,
    'workers': 8
}

### Functions

In [8]:
def get_document(df, index):
    term = df.loc[index]["term"]
    definition = df.loc[index]["definition"]
    return term, definition

In [9]:
def most_similar(model, text, clean_func=clean_text, topn=None):
    vector = model.infer_vector(clean_func(text), epochs=100, alpha=model.alpha, min_alpha=model.min_alpha)
    simdocs = model.docvecs.most_similar(positive=[vector], topn=topn)
    return simdocs

In [10]:
def load_model(filename):
    try:
        return Doc2Vec.load(os.path.join(MODEL_PATH, filename), mmap='r')
    except:
        return None


## We have dataframe with these information

- definition:           Definition of a term
- editionNum:           1,2,3,4,5,6,7,8
- editionTitle:         Title of the edition
- header:               Header of the page's term                                  
- place:                Place where the volume was edited (e.g. Edinburgh)                                    
- relatedTerms:         Related terms (see X article)  
- altoXML:              File Path of the XML file from which the term belongs       
- term:                 Term name                            
- positionPage:         Position of ther term in the page     
- startsAt:             Number page in which the term definition starts 
- endsAt:               Number page in which the term definition ends 
- volumeTitle:          Title of the Volume
- typeTerm:             Type of term [Topic| Articles]                                       
- year:                 Year of the edition
- volumeNum:            Volume number (e.g. 1)
- letters:              leters of the volume (A-B)
- part:                 Part of the volume (e.g 1)
- supplement:           Supplement's Title
- supplementsTo:        It suppelements to editions [1, 2, 3....]
- numberOfWords:        Number of words per term definition
- numberOfTerms:        Number of terms per page
- numberOfPages:        Number of pages per volume

### 1. Load dataframe from JSON file

In [11]:
df = pd.read_json('./results_NLS/tmp2/results_eb_1_edition_dataframe', orient="index") 

In [12]:
df = df[["term", "definition", "relatedTerms", "header", "startsAt", "endsAt", "numberOfTerms","numberOfWords", "numberOfPages", \
             "positionPage", "typeTerm", "editionTitle", "editionNum", "supplementTitle", "supplementsTo",\
             "year", "place", "volumeTitle", "volumeNum", "letters", "part", "altoXML"]]
df

Unnamed: 0,term,definition,relatedTerms,header,startsAt,endsAt,numberOfTerms,numberOfWords,numberOfPages,positionPage,...,editionNum,supplementTitle,supplementsTo,year,place,volumeTitle,volumeNum,letters,part,altoXML
10,AADE,"the name of two rivers, one in the country of ...",[],EncyclopaediaBritannica,15,15,22,19,832,3,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
100,ABETTOR,"a law-term, implying one who encourages anothe...",[],ABE,18,18,16,55,832,0,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082943.34.xml
1000,ALBY,"or Alb 1, a city of France in the province of ...",[],ALBALC,106,106,31,20,832,22,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188084090.34.xml
10000,INYBURG,"a town of Denmark, situated at the eafiend of ...",[],NYBNYS,473,473,14,25,872,2,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810223.34.xml
10001,NYCHTHEMERON,"the natural day, or day and night, which toget...",[],NYBNYS,473,473,14,13,872,3,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810223.34.xml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,NUT,"among botaniils, denotes a pericarpiurn of an ...",[],NUTNUT,472,472,12,14,872,8,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810211.34.xml
9996,NUTATION,"in aflronomy, a kind of tremulous motion of th...",[],NUTNUT,472,472,12,33,872,9,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810211.34.xml
9997,NUTMEG,"the kernel of a large fruit, not unlike the Th...","[MACE, PEEMED, DUTCH, THELARGEFT, EAP-INDIES, ...",NUTNUT,472,472,12,451,872,10,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810211.34.xml
9998,NUTRITION,"in the animal ceconomy, is the repairing the c...",[PISTACHIA],NUTNUT,472,473,12,486,872,11,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810211.34.xml


### 2. Similar Terms

### 2.1 Selecting just the volumes of 1771

In [13]:
df_1771= df[(df['year'] == 1771)]
df_1771_small = df_1771.head(100)


### 2.2 Counting the number of terms

**Remember**: A term can appear in more than once  per eddition. 

In [14]:
len(df_1771_small)

100

In [15]:
train_documents = list(tqdm(generate_documents_df(df_1771_small, clean_text, min_words=5)))

7it [00:00, 65.90it/s]

Preprocessing function: clean_text
Minimum document length: 5 words


91it [00:01, 67.49it/s]

Generated 91 description terms





In [16]:
print(f'Created {len(train_documents)} tagged documents.')
model = Doc2Vec(**hyperparams)
print('Build vocabulary')
model.build_vocab(train_documents)
for epoch in range(100):
    print(f'Train model: epoch={epoch}')
    model.train(train_documents, total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.0002
    model.min_alpha = model.alpha

Created 91 tagged documents.
Build vocabulary
Train model: epoch=0
Train model: epoch=1
Train model: epoch=2
Train model: epoch=3
Train model: epoch=4
Train model: epoch=5
Train model: epoch=6
Train model: epoch=7
Train model: epoch=8
Train model: epoch=9
Train model: epoch=10
Train model: epoch=11
Train model: epoch=12
Train model: epoch=13
Train model: epoch=14
Train model: epoch=15
Train model: epoch=16
Train model: epoch=17
Train model: epoch=18
Train model: epoch=19
Train model: epoch=20
Train model: epoch=21
Train model: epoch=22
Train model: epoch=23
Train model: epoch=24
Train model: epoch=25
Train model: epoch=26
Train model: epoch=27
Train model: epoch=28
Train model: epoch=29
Train model: epoch=30
Train model: epoch=31
Train model: epoch=32
Train model: epoch=33
Train model: epoch=34
Train model: epoch=35
Train model: epoch=36
Train model: epoch=37
Train model: epoch=38
Train model: epoch=39
Train model: epoch=40
Train model: epoch=41
Train model: epoch=42
Train model: epoch

In [17]:
# Save the model
model_path = os.path.join("./results_NLS/", 'doc2vec_df_1771.model')
model.save(model_path)
print(f'Saved model to {model_path}')

Saved model to ./results_NLS/doc2vec_df_1771.model


In [18]:
term = text =df_1771_small.loc[100]["term"]
term

'ABETTOR'

In [19]:
text =df_1771_small.loc[100]["definition"]
text

"a law-term, implying one who encourages another to the performance of some criminal adlion, or who is. art and part in the performance itself. Treason is the only crime in which abettors are excluded by law, every individual concerned being considered as a principal. It is the same with art and part in'the Scots law."

In [20]:
text="in law, a person who becomes equally guilty in the crime of another by knowingly and voluntarily aiding the criminal during the act itself. An abettor is one kind of accomplice (q.v.), the other being an accessory, who aids the criminal prior to or after the crime"

In [21]:
text

'in law, a person who becomes equally guilty in the crime of another by knowingly and voluntarily aiding the criminal during the act itself. An abettor is one kind of accomplice (q.v.), the other being an accessory, who aids the criminal prior to or after the crime'

In [22]:
#model=load_model('./results_NLS/doc2vec_df_1771.model')
cleaned_text = clean_text(text)
# Just going to take the firs 10 -- so topn=10
simdocs=most_similar(model, text, topn=10)

In [23]:
print("#### TEST 1 -- Doc2Vec -- Printing the details of the 10 most similar documents using Doc2Vec ")
for doc_id , rank in simdocs:
    term, definition = get_document(df_1771_small, doc_id)
    print("!! Using DocVec --- Document_id: %s - Rank %s - Details: Term %s, Definition: %s" %(doc_id, rank, term, definition))
    print("---")

#### TEST 1 -- Doc2Vec -- Printing the details of the 10 most similar documents using Doc2Vec 
!! Using DocVec --- Document_id: 100 - Rank 0.994238018989563 - Details: Term ABETTOR, Definition: a law-term, implying one who encourages another to the performance of some criminal adlion, or who is. art and part in the performance itself. Treason is the only crime in which abettors are excluded by law, every individual concerned being considered as a principal. It is the same with art and part in'the Scots law.
---
!! Using DocVec --- Document_id: 10073 - Rank 0.9908771514892578 - Details: Term OECONOMICS, Definition: the art of managing the affairs of a family, or community; and hence the person who takes care of the revenues and other affairs of churches, monaderies, and the like, is termed oeconomus.
---
!! Using DocVec --- Document_id: 10020 - Rank 0.9891510009765625 - Details: Term OBLIGATION, Definition: in Scots law. See Law Tit. xx.
---
!! Using DocVec --- Document_id: 1004 - Rank 