# Exploring Terms in the Encyclopaedia Britannica

## Topic Modelling - Gensim - LDA

In this notebook we are going to peform Topic Modelling using the terms within the dataframe that we have obtained either with the posprocess_eb.py script or Merging_EB_Terms.ipynb notebooks. Both methods obtain the same dataframe. 

We have selected the first Edition for this explorations, but we can run this notebook with any of the other editions.

**Remark**: Edition 1, has 3 volumes, and it was printed twice, in 1771 and 1773. 

These are the explorations that we are going to do:




### Loading the necessary libraries

In [30]:
import yaml
import matplotlib.pyplot as plt
import numpy as np
import collections
import matplotlib as mpl

In [55]:
import networkx as nx
import matplotlib.pyplot as plt
import re

In [32]:
import pandas as pd
from yaml import safe_load
from pandas.io.json import json_normalize

In [33]:
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel, LdaMulticore

import spacy
#import textacy.keyterms
from pprint import pprint


In [34]:
from doc2vec_prep import stem_text, clean_text, generate_documents_df

In [35]:
from tqdm import tqdm
import os


#### Hyperparameters

In [36]:
# Init the Doc2Vec model
hyperparams  = {
    'dm': 1,
    'vector_size': 300,
    'window': 5,
    'alpha': 0.025,
    'min_alpha': 0.00025,
    'min_count': 2,
    'workers': 8
}

### Functions

In [37]:
def get_document(df, index):
    print("INDEX IS %s" %index)
    term = df.loc[index]["term"]
    definition = df.loc[index]["definition"]
    return term, definition

In [38]:
def stem_text(text):
    p = PorterStemmer()
    text = re.sub(r'\S*@\S*\s?', '', text, flags=re.MULTILINE) # remove email
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE) # remove web addresses
    text = re.sub("\'", "", text) # remove single quotes
    text = remove_stopwords(text)
    text = p.stem_sentence(text)
    return simple_preprocess(text, deacc=True)

In [39]:
def clean_text(corpus):
    # --- remove if not alphanumeric:
    corpus = re.sub('[\W_]+', ' ', corpus)
    # --- replace numbers with #
    corpus = re.sub(r'\b\d+\b', '#', corpus)
    # --- remove new line character
    corpus = re.sub('\n', ' ', corpus)
    # --- remove words containing numbers
    corpus = re.sub('\w*\d\w*', '', corpus)
    # --- remove one-letter words in square brackets
    corpus = re.sub(r"\b[a-zA-Z]\b", '', corpus)
    # --- remove words with one characters
    corpus = re.sub(r"\b\w{1}\b", '', corpus)
    # --- remove multiple spaces in string
    corpus = re.sub(' +', ' ', corpus)
    # --- make lowercase
    corpus = corpus.lower()
    corpus = nlp(corpus)

    all_texts = []
    for sentence in list(corpus.sents):
        # --- lemmatization, remove punctuation
        txt = [token.lemma_ for token in sentence if not token.is_punct and not token.is_stop]
        all_texts.append(txt)
    return [val for sublist in all_texts for val in sublist]

In [40]:
def norm_lemmatizer(doc):
    # This takes in a doc of tokens from the NER, filter out NUM and Sym, and lemmatizes them.
    # Pronouns (like "I" and "you" get lemmatized to '-PRON-', so I'm removing those.
    doc = [token for token in doc if token.pos_ != "NUM" and token.pos_ != "SYM"]
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)

def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

In [41]:
def train_lda_model(corpus, dictionary, n_topics):
    chunksize = 2
    passes = 20
    iterations = 400
    lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         num_topics=n_topics,
                         chunksize=chunksize,
                         alpha='auto',
                         eta='auto',
                         iterations=iterations,
                         passes=passes)

    top_topics = lda_model.top_topics(corpus) #, num_words=20)
    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / n_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    # save the model
    lda_model.save('lda_model.model')

    # See the topics
    print("LDA_Seq - top topics:")
    pprint(top_topics)
    return lda_model

In [42]:
def train_ldaMulticore_mode(corpus, dictionary, n_topics):
    chunksize = 2
    passes = 20

    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             random_state=100,
                             num_topics=n_topics,
                             passes=passes,
                             chunksize=chunksize,
                             batch=False,
                             alpha='asymmetric',
                             decay=0.5,
                             offset=64,
                             eta=None,
                             eval_every=0,
                             iterations=100,
                             gamma_threshold=0.001,
                             per_word_topics=True)

    # save the model
    lda_model.save('ldaMulti_model.model')
    top_topics = lda_model.top_topics(corpus) #, num_words=20)
    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / n_topics
    print('Average LDA Multicore topic coherence: %.4f.' % avg_topic_coherence)

    print("LDA_Multicore - top topics:")
    pprint(top_topics)
    #print(lda_model.print_topics(-1))
    return lda_model

In [43]:
def interpret_LDA(lda_model, corpus):

    # The topic(s) that document belongs to along with percentage.
    # The topic(s) each word in that document belongs to.
    # The topic(s) each word in that document belongs to AND the phi values.
    # Phi value is the probability of the word belonging to that particular topic.
    #And the sum of phi values for a given word adds up to the number of times that word occurred in that document.

    for c in lda_model[corpus[:]]:
        print("Document Topics      : ", c[0])      # [(Topics, Perc Contrib)]
        print("Word id, Topics      : ", c[1][:3])  # [(Word id, [Topics])]
        print("Phi Values (word id) : ", c[2][:3])  # [(Word id, [(Topic, Phi Value)])]
        print("Word, Topics         : ", [(dictionary[wd], topic) for wd, topic in c[1][:3]])   # [(Word, [Topics])]
        print("Phi Values (word)    : ", [(dictionary[wd], topic) for wd, topic in c[2][:3]])  # [(Word, [(Topic, Phi Value)])]
        print("------------------------------------------------------\n")


## We have dataframe with these columns

- definition:           Definition of a term
- editionNum:           1,2,3,4,5,6,7,8
- editionTitle:         Title of the edition
- header:               Header of the page's term                                  
- place:                Place where the volume was edited (e.g. Edinburgh)                                    
- relatedTerms:         Related terms (see X article)  
- altoXML:              File Path of the XML file from which the term belongs       
- term:                 Term name                            
- positionPage:         Position of ther term in the page     
- startsAt:             Number page in which the term definition starts 
- endsAt:               Number page in which the term definition ends 
- volumeTitle:          Title of the Volume
- typeTerm:             Type of term [Topic| Articles]                                       
- year:                 Year of the edition
- volumeNum:            Volume number (e.g. 1)
- letters:              leters of the volume (A-B)
- part:                 Part of the volume (e.g 1)
- supplement:           Supplement's Title
- supplementsTo:        It suppelements to editions [1, 2, 3....]
- numberOfWords:        Number of words per term definition
- numberOfTerms:        Number of terms per page
- numberOfPages:        Number of pages per volume

### 1. Load dataframe from JSON file

In [44]:
df = pd.read_json('../../results_NLS/results_eb_1_edition_dataframe', orient="index") 

Now we are going to oder the columns of our dataframe and visualise it. 

In [45]:
df = df[["term", "definition", "relatedTerms", "header", "startsAt", "endsAt", "numberOfTerms","numberOfWords", "numberOfPages", \
             "positionPage", "typeTerm", "editionTitle", "editionNum", "supplementTitle", "supplementsTo",\
             "year", "place", "volumeTitle", "volumeNum", "letters", "part", "altoXML"]].reset_index(drop=True)

df


Unnamed: 0,term,definition,relatedTerms,header,startsAt,endsAt,numberOfTerms,numberOfWords,numberOfPages,positionPage,...,editionNum,supplementTitle,supplementsTo,year,place,volumeTitle,volumeNum,letters,part,altoXML
0,OR,"A NEW A D I C T I A A, the name of several riv...",[],EncyclopaediaBritannica,15,15,22,54,832,0,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
1,AABAM,"a term, among alchemifts, for lead,",[],EncyclopaediaBritannica,15,15,22,6,832,1,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
2,AACH,the name of a town and river in Swabia. It is ...,[],EncyclopaediaBritannica,15,15,22,17,832,2,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
3,AADE,"the name of two rivers, one in the country of ...",[],EncyclopaediaBritannica,15,15,22,19,832,3,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
4,AAHUS,a small town and diftrift in Weftphalia.,[],EncyclopaediaBritannica,15,15,22,7,832,4,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27422,ZUYDERSEE,"a great bay of the German ocean, which lies in...",[],ZoDZYG,857,857,27,66,864,22,...,1,,[],1773,London,"Encyclopaedia Britannica: or, A dictionary of ...",3,M-Z,0,144850368/alto/188375020.34.xml
27423,ZWEIBRUGGEN,"a county of the palatinate of the Rhine, in Ge...",[SQALVS],ZoDZYG,857,857,27,23,864,23,...,1,,[],1773,London,"Encyclopaedia Britannica: or, A dictionary of ...",3,M-Z,0,144850368/alto/188375020.34.xml
27424,ZYGOMA,in anatomy. See Anatomy p. 152.,[],ZoDZYG,857,857,27,6,864,24,...,1,,[],1773,London,"Encyclopaedia Britannica: or, A dictionary of ...",3,M-Z,0,144850368/alto/188375020.34.xml
27425,ZYGOMATICUS,"in anatomy,. See Anatomy, p. 306,",[ANATOMY],ZoDZYG,857,857,27,6,864,25,...,1,,[],1773,London,"Encyclopaedia Britannica: or, A dictionary of ...",3,M-Z,0,144850368/alto/188375020.34.xml


### 2.  Selecting just the 100 first elements of  the first volume of 1771

In [46]:
df_1771_small = df[(df['year'] == 1771) & (df['volumeNum'] == 1) ]
df_1771_small = df_1771_small.head(100).reset_index(drop=True)


In [47]:
df_1771_small

Unnamed: 0,term,definition,relatedTerms,header,startsAt,endsAt,numberOfTerms,numberOfWords,numberOfPages,positionPage,...,editionNum,supplementTitle,supplementsTo,year,place,volumeTitle,volumeNum,letters,part,altoXML
0,OR,"A NEW A D I C T I A A, the name of several riv...",[],EncyclopaediaBritannica,15,15,22,54,832,0,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
1,AABAM,"a term, among alchemifts, for lead,",[],EncyclopaediaBritannica,15,15,22,6,832,1,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
2,AACH,the name of a town and river in Swabia. It is ...,[],EncyclopaediaBritannica,15,15,22,17,832,2,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
3,AADE,"the name of two rivers, one in the country of ...",[],EncyclopaediaBritannica,15,15,22,19,832,3,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
4,AAHUS,a small town and diftrift in Weftphalia.,[],EncyclopaediaBritannica,15,15,22,7,832,4,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,ABEYANCE,"in law, the expedtancy of an edate. Thus if la...",[],ABE,18,18,16,36,832,0,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082943.34.xml
96,ABHEL,"in botany, an obsolete name of the fabina or s...",[],ABE,18,18,16,14,832,0,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082943.34.xml
97,ABIES,"the sir-tree, in botany, belongs to the monaec...",[],ABE,18,18,16,16,832,0,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082943.34.xml
98,PINUS,"of which it is a (pecies,",[],ABE,18,18,16,6,832,0,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082943.34.xml


### 2.1 Counting the number of terms

**Remember**: A term can appear in more than once  per eddition. 

In [48]:
len(df_1771_small)

100

In [57]:
def generate_terms(df, clean_func):
    doc_text=[]
    for index, row in df.iterrows():
        text = row['term'] + row['definition']
        words = clean_func(text)
        doc_text.append(words)
        
        
    return doc_text

### 3. LDA



In [52]:
nlp = spacy.load("en_core_web_sm")

In [53]:
# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(norm_lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <function norm_lemmatizer at 0x1aaf6dae8> (name: 'lemmatizer').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.

In [58]:
doc_text = generate_terms(df_1771_small, clean_text)
doc_list=[]
for text in doc_text:
    pr = nlp(text)
    doc_list.append(pr)

for text in text_urls:
    print("Keyterms:")
    doc=textacy.make_spacy_doc(text, lang='en_core_web_sm')
    keyterms=textacy.keyterms.textrank(doc)
    print(keyterms)
    print("------")

TypeError: Argument 'string' has incorrect type (expected str, got list)

#### 3.2 Creating the model

Once we have created our trained corpus, we are going to create our model. In this step we are going to create a document embedding per element in our training corpus. 

In [None]:
for text in text_urls:
      print("Keyterms:")
      doc=textacy.make_spacy_doc(text, lang='en_core_web_sm')
      keyterms=textacy.keyterms.textrank(doc)
      print(keyterms)
      print("------")

#print(doc_list[0])
# Creates, which is a mapping of word IDs to words.
dictionary = corpora.Dictionary(doc_list)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
#dictionary.filter_extremes(no_below=2, no_above=0.5)
dictionary.filter_extremes(no_above=0.7)

# Turns each document into a bag of words.
corpus = [dictionary.doc2bow(doc) for doc in doc_list]

print('Number of unique tokens: {}'.format(len(dictionary)))
print('Number of documents: {}'.format(len(corpus)))

# Train and run LDA model.
lda_model= train_lda_model(corpus, dictionary, 7)
#lda_model = train_ldaMulticore_mode(corpus, dictionary, 7)
interpret_LDA(lda_model, corpus)