In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords

# Fetch the corpus before reading it: on a fresh environment
# stopwords.words() raises LookupError until 'stopwords' has been downloaded.
# The call is a no-op when the package is already up to date.
nltk.download('stopwords', quiet=True)
stop_words = stopwords.words('english')
# Extra tokens filtered out in addition to NLTK's English list.
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
from sklearn.datasets import fetch_20newsgroups

import pickle
import bz2
import time




In [2]:
# Bug fix for pyLDAvis https://github.com/bmabey/pyLDAvis/issues/162
from IPython.display import HTML

# The fragments below are joined by implicit string concatenation into one
# <style> element, which is then rendered into the notebook page.
css_str = (
    '<style> '
    '.jp-Button path { fill: #616161;} '
    'text.terms { fill: #616161;} '
    '.jp-icon-warn0 path {fill: var(--jp-warn-color0);} '
    '.bp3-button-text path { fill: var(--jp-inverse-layout-color3);} '
    '.jp-icon-brand0 path { fill: var(--jp-brand-color0);} '
    'text.terms { fill: #616161;} '
    '</style>'
)
display(HTML(css_str))

# Load data pickle
The pickle is produced by `wikidownloader.py`.

In [3]:
# One-time model downloads (run from a shell):
#   python -m spacy download en_core_web_sm
#   python -m spacy download en_core_web_md
import nltk
nltk.download('stopwords')
# The notebook only reads token.pos_ and token.lemma_, so the parser and
# NER pipeline components are disabled.
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
print('Load data pickle')
# The archive is bz2-compressed. NOTE: unpickling can execute arbitrary code,
# so only load archives you produced yourself.
with bz2.open('datapickle.bz2', 'rb') as f:
    data = pickle.load(f)
print('Loaded data pickle')

# data[0] supplies the 'text' column and data[1] the 'title' column.
df = pd.DataFrame({'text': data[0], 'title': data[1]})

Load data pickle
Loaded data pickle


In [13]:
# Display the DataFrame built from the pickle (columns: text, title).
df

Unnamed: 0,text,title
0,'''Colonel Patrick Mackellar''' (1717–1778)...,Patrick_MacKellar
1,'''Putative ATP-dependent RNA helicase DHX57'...,DHX57
2,'''Abram Wesley Eager''' (1864 ndash;1930) was...,Abraham_Wesley_Eager
3,'''Babiuk''' is a surname. Notable people with...,Babiuk
4,}} '''Surabaya–Gempol Toll Road''' is a to...,Surabaya%E2%80%93Gempol_Toll_Road
...,...,...
49999,}.svg ---&gt; |image_coat = Wappe...,March_of_Lusatia
50000,"In association football, rugby league, rugby u...",Dummy_(football)
50001,| status = complete | start_date = 1979 | co...,Shinjuku_NS_Building
50002,The '''Wesleyan Methodist Church''' (also nam...,Wesleyan_Methodist_Church_(Great_Britain)


# Tokenizing articles


In [15]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One token list is
    yielded per item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
# Materialize one token list per entry of df.text.
data_words = list(sent_to_words(df.text))

In [21]:
# First 400 characters of the first article's raw text.
print(df.text[0][:400])

   '''Colonel Patrick Mackellar''' (1717–1778) was a British army officer and military engineer who played a significant role in the early history of North America.  He was the deputy chief engineer at the Siege of Louisbourg (1758) and the chief engineer at the siege of Quebec in 1759.  In later years he was responsible for the design and construction of the town of Es Castell on the island of Me


In [19]:
# First 40 tokens of the same article after tokenization.
print(data_words[0][:40])

['colonel', 'patrick', 'mackellar', 'was', 'british', 'army', 'officer', 'and', 'military', 'engineer', 'who', 'played', 'significant', 'role', 'in', 'the', 'early', 'history', 'of', 'north', 'america', 'he', 'was', 'the', 'deputy', 'chief', 'engineer', 'at', 'the', 'siege', 'of', 'louisbourg', 'and', 'the', 'chief', 'engineer', 'at', 'the', 'siege', 'of']


# Removing stop words and adding bigrams

In [23]:

# Phrase detection. `bigram` is fitted on the token lists in data_words;
# `trigram` is fitted on those same lists after `bigram` has been applied.
# Each fitted model is then wrapped in a Phraser for use on documents.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)
def remove_stopwords(texts):
    """Return ``texts`` with every stop word removed.

    Each document is passed through ``simple_preprocess(str(doc))`` and the
    resulting tokens that appear in the module-level ``stop_words`` are
    dropped.

    Args:
        texts: iterable of documents (token lists or strings).

    Returns:
        A list with one list of kept tokens per input document.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # against the ``stop_words`` list scans it for every single token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Returns a list holding ``bigram_mod[doc]`` for each document, in order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged
def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Args:
        texts: iterable of token lists.

    Returns:
        A list holding ``trigram_mod[bigram_mod[doc]]`` for each document.
    """
    # The original built this list but never returned it, so every caller
    # received None.
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize token lists, keeping only the selected parts of speech.

    Args:
        texts: iterable of token lists. Each list is joined with single
            spaces and processed by the module-level spaCy pipeline ``nlp``.
        allowed_postags: coarse POS tags (compared against ``token.pos_``)
            whose tokens are kept. The default is an immutable tuple so it
            cannot be mutated between calls.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.
    """
    allowed = set(allowed_postags)
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the documents through the pipeline in batches, which
    # avoids the per-call overhead of invoking nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_docs)
    ]
# Stop words are removed first; the bigram model is then applied to the
# filtered token lists.
# NOTE(review): bigram_mod was fitted on data_words (stop words still present)
# but is applied here to stop-word-filtered tokens — confirm this is intended.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)


In [24]:
# First 40 tokens of the first article once stop words are removed.
print(data_words_nostops[0][:40])

['colonel', 'patrick', 'mackellar', 'british', 'army', 'officer', 'military', 'engineer', 'played', 'significant', 'role', 'early', 'history', 'north', 'america', 'deputy', 'chief', 'engineer', 'siege', 'louisbourg', 'chief', 'engineer', 'siege', 'quebec', 'later', 'years', 'responsible', 'design', 'construction', 'town', 'es', 'castell', 'island', 'menorca', 'early', 'life', 'career', 'patrick', 'mackellar', 'born']


In [25]:
# Same article after applying the bigram model to the filtered tokens.
print(data_words_bigrams[0][:40])

['colonel', 'patrick', 'mackellar', 'british', 'army', 'officer', 'military', 'engineer', 'played', 'significant', 'role', 'early', 'history', 'north_america', 'deputy', 'chief', 'engineer', 'siege', 'louisbourg', 'chief', 'engineer', 'siege', 'quebec', 'later', 'years', 'responsible', 'design', 'construction', 'town', 'es', 'castell', 'island', 'menorca', 'early', 'life', 'career', 'patrick', 'mackellar', 'born', 'son']


# Lemmatization

Reference: https://www.researchgate.net/publication/341574872_Machine_Learning_and_Deep_Neural_Network-Based_Lemmatization_and_Morphosyntactic_Tagging_for_Serbian


> The basic set of PoS categories/tags that should be assigned to tokens is not generally accepted, even for a specific language. The choice of a tagset usually depends on the foreseen task or project. A tagset tailored to be applicable for PoS tagging in general is the Universal Part-of-Speech (UPoS) tagset (Petrov et al., 2012) (used by spaCy), and it lists the following 17 categories: adjective (ADJ), adposition (ADP), adverb (ADV), auxiliary (AUX), coordinating conjunction (CCONJ), determiner (DET), interjection (INTJ), noun (NOUN), numerical (NUM), particle (PART), pronoun (PRON), proper noun (PROPN), punctuation (PUNCT), subordinating conjunction (SCONJ), symbol (SYM), verb (VERB) and other (X). It should be noted that the MULTEXT-East tagset (Erjavec, 2012) was also tailored to be universal. SMD uses its own tagset that corresponds closely to Serbian traditional grammars. The Serbian TreeTagger models TT11 and TT19 (see Subsection 3.3.) use modifications of the SMD tagset. A general overview of the tagsets used is presented in Table 3. It should be noted that tags for some PoS differ between tagsets (e.g. ADJ in UPoS vs. A in SMD for adjective).

In [26]:
# Run the first article (bigram-merged tokens re-joined with spaces) through
# the spaCy pipeline so its POS tags and lemmas can be inspected.
doc=nlp(" ".join(data_words_bigrams[0]))

In [28]:
# Print one example lemma for each distinct POS tag, in order of first
# appearance in `doc`, and collect those tags.
tags = []
for w in doc:
    if w.pos_ in tags:
        continue
    tags.append(w.pos_)
    print('Lemmatization Example {} -> {}'.format(w.lemma_, w.pos_))
tags

Lemmatization Example colonel -> PROPN
Lemmatization Example officer -> NOUN
Lemmatization Example military -> ADJ
Lemmatization Example play -> VERB
Lemmatization Example later -> ADV
Lemmatization Example maam -> INTJ
Lemmatization Example four -> NUM
Lemmatization Example along -> ADP
Lemmatization Example may -> AUX
Lemmatization Example upon -> SCONJ
Lemmatization Example th -> X
Lemmatization Example another -> DET


['PROPN',
 'NOUN',
 'ADJ',
 'VERB',
 'ADV',
 'INTJ',
 'NUM',
 'ADP',
 'AUX',
 'SCONJ',
 'X',
 'DET']

We keep only the tokens whose UPoS tag is one of `NOUN`, `ADJ`, `VERB`, or `ADV`.

In [30]:
# NOTE(review): this replaces the 'en_core_web_md' pipeline loaded earlier
# with 'en_core_web_sm', so the POS preview above and the lemmas produced
# here come from different models — confirm this is intentional.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# First 40 lemmas kept for the first article.
print(data_lemmatized[0][:40])


['british', 'officer', 'military', 'engineer', 'play', 'significant', 'role', 'early', 'history', 'deputy', 'chief', 'engineer', 'siege', 'chief', 'engineer', 'siege', 'quebec', 'later', 'year', 'responsible', 'design', 'construction', 'town', 'menorca', 'early', 'life', 'career', 'bear', 'son', 'tenant', 'maam', 'argyllshire', 'probably', 'influence', 'second', 'duke', 'argyll', 'enter', 'ordnance', 'service']


In [33]:
# First 55 pre-lemmatization tokens of the same article, for comparison with
# the lemmatized tokens printed in the previous cell.
print(data_words_bigrams[0][:55])

['colonel', 'patrick', 'mackellar', 'british', 'army', 'officer', 'military', 'engineer', 'played', 'significant', 'role', 'early', 'history', 'north_america', 'deputy', 'chief', 'engineer', 'siege', 'louisbourg', 'chief', 'engineer', 'siege', 'quebec', 'later', 'years', 'responsible', 'design', 'construction', 'town', 'es', 'castell', 'island', 'menorca', 'early', 'life', 'career', 'patrick', 'mackellar', 'born', 'son', 'john', 'last', 'mackellar', 'head', 'tenant', 'maam', 'argyllshire', 'probably', 'influence', 'second', 'duke', 'argyll', 'entered', 'ordnance', 'service']


# Creating bag of words frequencies

Without Lemmatization

In [51]:
# Map every token to an integer id, then encode each document as a bag of
# words: a list of (token_id, count) pairs.
id2word = corpora.Dictionary(data_words_bigrams)
texts = data_words_bigrams
corpus = list(map(id2word.doc2bow, texts))
print(corpus[0][:20])  # first 20 (token_id, count) pairs of document 0
# The same 20 pairs with ids resolved back to tokens (shown as cell output).
[(id2word[token_id], count) for token_id, count in corpus[0][:20]]


[(0, 2), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 2), (17, 3), (18, 1), (19, 1)]


[('able', 2),
 ('abraham', 1),
 ('accompanied', 2),
 ('accompany', 1),
 ('accounts', 1),
 ('acting', 1),
 ('action', 1),
 ('active', 1),
 ('admiral', 1),
 ('advised', 1),
 ('advisers', 1),
 ('afternoon', 1),
 ('agent', 1),
 ('al', 1),
 ('allies', 1),
 ('along', 1),
 ('also', 2),
 ('although', 3),
 ('amara', 1),
 ('america', 1)]

With Lemmatization

In [56]:
# Same encoding as for the non-lemmatized tokens, built from the lemmas:
# a dictionary of token ids plus one (token_id, count) list per document.
id2word_lemmatized = corpora.Dictionary(data_lemmatized)
texts_lemmatized = data_lemmatized
corpus_lemmatized = list(map(id2word_lemmatized.doc2bow, texts_lemmatized))
print(corpus_lemmatized[0][:20])  # first 20 (token_id, count) pairs of document 0
# The same 20 pairs with ids resolved back to lemmas (shown as cell output).
[(id2word_lemmatized[token_id], count) for token_id, count in corpus_lemmatized[0][:20]]


[(0, 2), (1, 3), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 3), (13, 4), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 5)]


[('able', 2),
 ('accompany', 3),
 ('account', 1),
 ('act', 1),
 ('action', 1),
 ('active', 1),
 ('admiral', 1),
 ('advise', 1),
 ('adviser', 1),
 ('agent', 1),
 ('ally', 1),
 ('also', 2),
 ('appear', 3),
 ('appoint', 4),
 ('appointment', 1),
 ('approach', 1),
 ('aptitude', 1),
 ('argyll', 1),
 ('argyllshire', 1),
 ('army', 5)]

# Fitting LDA via variational inference (Gensim)

Without Lemmatization

In [57]:
# Baseline model on the non-lemmatized corpus: 20 topics, 5 passes over the
# data, and a fixed random_state so repeated runs start from the same seed.
# NOTE(review): the lemmatized fit further down passes the same
# hyper-parameters; keep the two calls in sync if either is tuned.
lda_model = gensim.models.ldamodel.LdaModel(
   corpus=corpus, id2word=id2word, num_topics=20, random_state=100, 
   update_every=1, chunksize=100, passes=5, alpha='auto', per_word_topics=True
)

In [58]:
# NOTE: LdaModel.log_perplexity returns the per-word likelihood bound, not the
# perplexity itself (gensim reports perplexity as 2 ** (-bound)), which is why
# the value printed under the 'Perplexity' label is negative.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))
# c_v coherence is computed from the tokenized texts and the dictionary.
coherence_model_lda = CoherenceModel(
   model=lda_model, texts=texts, dictionary=id2word, coherence='c_v'
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -17.878707601854718

Coherence Score:  0.5764669129992225


With Lemmatization

In [None]:
# Model on the lemmatized corpus: 20 topics, 5 passes over the data, and a
# fixed random_state so repeated runs start from the same seed.
# NOTE(review): the non-lemmatized fit above passes the same
# hyper-parameters; keep the two calls in sync if either is tuned.
lda_model_lemmatized = gensim.models.ldamodel.LdaModel(
   corpus=corpus_lemmatized, id2word=id2word_lemmatized, num_topics=20, random_state=100, 
   update_every=1, chunksize=100, passes=5, alpha='auto', per_word_topics=True
)

In [None]:
# NOTE: LdaModel.log_perplexity returns the per-word likelihood bound, not the
# perplexity itself (gensim reports perplexity as 2 ** (-bound)), so a negative
# value under the 'Perplexity' label is expected.
print('\nPerplexity: ', lda_model_lemmatized.log_perplexity(corpus_lemmatized))
# c_v coherence is computed from the lemmatized texts and their dictionary.
coherence_model_lda_lemmatized = CoherenceModel(
   model=lda_model_lemmatized, texts=texts_lemmatized, dictionary=id2word_lemmatized, coherence='c_v'
)
coherence_lda_lemmatized = coherence_model_lda_lemmatized.get_coherence()
print('\nCoherence Score: ', coherence_lda_lemmatized)

# Exploring final model (with Lemmatization)

In [None]:
# Pretty-print the topics returned by print_topics() for the lemmatized model.
pprint(lda_model_lemmatized.print_topics())

In [None]:
# Prepare the pyLDAvis view of the lemmatized model from its corpus and
# dictionary, and display it as the cell output.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_lemmatized, corpus_lemmatized, id2word_lemmatized)
vis