### Topic Modeling using HDP and LDA

- Text Processing
- Generating dictionary of vocabulary
- Mapping corpus using dictionary
- Training the Topic Model

In [2]:
import matplotlib.pyplot as plt
#%pip install "gensim" "spacy" "pyLDAvis" 
import gensim
import numpy as np
import spacy
import pandas as pd
import re

import nltk
nltk.download('stopwords')

from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from spacy.lang.en.stop_words import STOP_WORDS
import pyLDAvis.gensim_models
#Import nltk stopwords and add custom stopwords that are likely to appear in news articles.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(["mrs","ms","say","he","mr","she","they","company"])

import os, re, operator, warnings
warnings.filterwarnings('ignore')
%matplotlib inline

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


### Text Processing
- Clean the article - Remove punctuation marks, special characters
- Tokenize each article
- Stem each token
- Remove numberical tokens

In [3]:
df=pd.read_csv("NewsArticles.csv", encoding='unicode_escape',index_col=0)
#drop all the unnamed columns
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
df.head()

Unnamed: 0_level_0,publish_date,article_source_link,title,subtitle,text
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2017/2/7,http://abcnews.go.com/Politics/pence-break-tie...,"Betsy DeVos Confirmed as Education Secretary, ...",,Michigan billionaire education activist Betsy ...
2,2017/2/7,http://abcnews.go.com/Politics/wireStory/melan...,Melania Trump Says White House Could Mean Mill...,,First lady Melania Trump has said little about...
3,2017/2/7,http://abcnews.go.com/Politics/wireStory/trump...,"As Trump Fears Fraud, GOP Eliminates Election ...",,A House committee voted on Tuesday to eliminat...
4,2017/2/7,http://abcnews.go.com/Politics/appeals-court-d...,Appeals Court to Decide on Challenge to Trump'...,,"This afternoon, three federal judges from the ..."
5,2017/2/7,http://abcnews.go.com/US/23-states-winter-weat...,At Least 4 Tornadoes Reported in Southeast Lou...,,At least four tornadoes touched down in Louisi...


I'm going to use SPACY in this notebook for all the text processing related tasks. It is very powerful than NLTK. [Click here to learn more](https://spacy.io/usage/spacy-101)

In [4]:
# before loading the language you have to download it first. Go to your command prompt and execute this statement and 
# restart the kernel:
# python -m spacy download en_core_web_sm
#%pip install "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz"
import en_core_web_sm
nlp = en_core_web_sm.load()
#nlp = spacy.load("en_core_web_sm")

In [5]:
data = df['text'].values.tolist()
data1 = df['title'].values.tolist()

In [6]:
from gensim import similarities
#data[0]
data1[0]



'Betsy DeVos Confirmed as Education Secretary, With Pence Casting Historic Tie-Breaking Vote'

In [7]:
#removing punctuations and others characters
def preprocess(string):
    return re.sub('[^\w_\s-]', ' ',str(string))

data = list(map(preprocess,data))    

In [8]:
#data cleaning and lemmatization
lemma_doc = []
for datum in data:
    sent = nlp(str(datum).lower())
    text = []
    for w in sent:
        if not w.is_stop and not w.is_punct and not w.like_num and str(w) not in stop_words and (len(str(w)) > 4):
            #adding the lematized version of the words
            text.append(w.lemma_)
    lemma_doc.append(text)
    

KeyboardInterrupt: 

In [9]:
lemma_doc[0]

['michigan',
 'billionaire',
 'education',
 'activist',
 'betsy',
 'devos',
 'confirm',
 'today',
 'serve',
 'secretary',
 'education',
 'president',
 'trump',
 'administration',
 'president',
 'penny',
 'break',
 'senate',
 'senate',
 'vote',
 'devos',
 'highly',
 'contentious',
 'nomination',
 'afternoon',
 'tally',
 'split',
 'evenly',
 'require',
 'penny',
 'authority',
 'president',
 'upper',
 'chamber',
 'congress',
 'break',
 'impasse',
 'president',
 'break',
 'confirm',
 'cabinet',
 'nominee',
 'pence',
 'count',
 'vote',
 'render',
 'tally',
 'democrats',
 'stage',
 'marathon',
 'speech',
 'lawmaker',
 'take',
 'floor',
 'additional',
 'republican',
 'devos',
 'block',
 'confirmation',
 'imagine',
 'bad',
 'choice',
 'elizabeth',
 'warren',
 'letter',
 'constituent',
 'urge',
 'devos',
 'stir',
 'vehement',
 'opposition',
 'teacher',
 'union',
 'senate',
 'democrat',
 'cite',
 'concern',
 'support',
 'school',
 'voucher',
 'critic',
 'believe',
 'weaken',
 'public',
 'school'

In [10]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(lemma_doc, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[lemma_doc], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[lemma_doc[0]]])

['michigan', 'billionaire', 'education', 'activist', 'betsy_devos', 'confirm', 'today', 'serve', 'secretary', 'education', 'president', 'trump', 'administration', 'president', 'penny', 'break', 'senate', 'senate', 'vote', 'devos', 'highly', 'contentious', 'nomination', 'afternoon', 'tally', 'split', 'evenly', 'require', 'penny', 'authority', 'president', 'upper', 'chamber', 'congress', 'break', 'impasse', 'president', 'break', 'confirm', 'cabinet_nominee', 'pence', 'count', 'vote', 'render', 'tally', 'democrats', 'stage', 'marathon', 'speech', 'lawmaker', 'take', 'floor', 'additional', 'republican', 'devos', 'block', 'confirmation', 'imagine', 'bad', 'choice', 'elizabeth_warren', 'letter', 'constituent', 'urge', 'devos', 'stir', 'vehement', 'opposition', 'teacher', 'union', 'senate', 'democrat', 'cite', 'concern', 'support', 'school', 'voucher', 'critic', 'believe', 'weaken', 'public', 'school', 'experience', 'attend', 'work', 'public', 'education', 'system', 'cite', 'familiarity', 'la

#### Create the Dictionary and Corpus needed for Topic Modeling
- Word to IDs mapping
- Bag of words of each document
- corpus (cluster of Bag of words of all the documents)

In [42]:
#Creates Word to IDs mapping
word2id = corpora.Dictionary(lemma_doc)
print(word2id)

Dictionary(30441 unique tokens: ['     ', 'account', 'activist', 'addition', 'additional']...)


In [13]:
# Creates bag of words and a corpus
documents = lemma_doc
corpus = [word2id.doc2bow(doc) for doc in documents]

print('Corpus sample')
sample = corpus[0]
for i in range(len(sample)):
    print('Word', sample[i][0], ':', word2id[sample[i][0]], ' || Number of occurences:', sample[i][1])

Corpus sample
Word 0 :        || Number of occurences: 1
Word 1 : account  || Number of occurences: 1
Word 2 : activist  || Number of occurences: 1
Word 3 : addition  || Number of occurences: 1
Word 4 : additional  || Number of occurences: 1
Word 5 : administration  || Number of occurences: 1
Word 6 : administrator  || Number of occurences: 1
Word 7 : afternoon  || Number of occurences: 1
Word 8 : alaska  || Number of occurences: 1
Word 9 : announce  || Number of occurences: 2
Word 10 : answer  || Number of occurences: 1
Word 11 : appreciate  || Number of occurences: 1
Word 12 : attend  || Number of occurences: 1
Word 13 : authority  || Number of occurences: 1
Word 14 : average  || Number of occurences: 1
Word 15 : bad  || Number of occurences: 1
Word 16 : believe  || Number of occurences: 1
Word 17 : betsy  || Number of occurences: 1
Word 18 : billionaire  || Number of occurences: 1
Word 19 : block  || Number of occurences: 1
Word 20 : board  || Number of occurences: 1
Word 21 : break

#### Hierarchical Dirichlet Processing
This is kind of an unsupervised technique (Topic modeling is a unsupervised technique. Here the context is we don't decide the # of topics. In concept this is similar to Hierarchical cluster as don't choose the number of cluster before hand) as the model will identify the number of topics. Let's see what it will produce.

In [None]:
hdp = models.HdpModel(corpus,word2id)

In [None]:
hdp_topics = hdp.print_topics()
for topic in hdp_topics:
    print(topic)

(0, '0.009*trump + 0.006*people + 0.006*president + 0.005*state + 0.005*country + 0.004*government + 0.004*house + 0.003*report + 0.003*china + 0.003*year')
(1, '0.007*trump + 0.006*people + 0.005*china + 0.004*president + 0.003*state + 0.003*country + 0.003*government + 0.003*house + 0.003*year + 0.003*report')
(2, '0.008*china + 0.005*country + 0.005*people + 0.005*trump + 0.004*state + 0.004*chinese + 0.004*president + 0.003*government + 0.003*year + 0.003*group')
(3, '0.006*china + 0.004*people + 0.004*country + 0.003*government + 0.003*percent + 0.002*report + 0.002*state + 0.002*chinese + 0.002*year + 0.002*find')
(4, '0.003*trump + 0.002*state + 0.002*china + 0.002*german + 0.002*president + 0.002*world + 0.002*include + 0.002*country + 0.002*chinese + 0.002*chestnut')
(5, '0.006*china + 0.003*trump + 0.003*president + 0.002*state + 0.002*country + 0.002*trade + 0.001*european + 0.001*island + 0.001*call + 0.001*minister')
(6, '0.011*target + 0.009*attacker + 0.007*wound + 0.006

In [None]:
print('HDP model created: '+str(len(hdp_topics))+' Topics')

HDP model created: 20 Topics


####  Latent Dirichlet Allocation Model


In [14]:
#lda_model = LdaModel(corpus=corpus, id2word=word2id, num_topics=5, random_state=42, update_every=1, chunksize=100, 
#                     passes=10, alpha='auto', per_word_topics=True)
lda_model = LdaModel(corpus=corpus, id2word=word2id, num_topics=5, random_state=42, update_every=1, chunksize=100, 
                     passes=10, alpha='auto')

In [15]:
#Article - Topic Distribution for first Article
def get_article_topic_distribution(article):
    return lda_model.get_document_topics(article)
#Returns a list containing a list of tuple
#Each inner list corresponds to an article and each tuple refers to topicID and its corresponding probability  
map(get_article_topic_distribution, corpus)

<map at 0x21d886f3e20>

In [16]:
lda_model.print_topics()
#lda_model.
#a=lda_model.get_document_topics(corpus)
#print(a)
#https://humboldt-wi.github.io/blog/research/information_systems_1819/is_lda_final/

[(0,
  '0.010*"percent" + 0.009*"government" + 0.008*"budget" + 0.008*"country" + 0.006*"increase" + 0.005*"european" + 0.005*"economic" + 0.005*"year" + 0.005*"market" + 0.005*"trade"'),
 (1,
  '0.021*"trump" + 0.012*"president" + 0.009*"state" + 0.008*"party" + 0.007*"house" + 0.007*"russian" + 0.006*"election" + 0.006*"country" + 0.005*"russia" + 0.005*"order"'),
 (2,
  '0.013*"people" + 0.006*"woman" + 0.006*"year" + 0.005*"child" + 0.005*"family" + 0.005*"think" + 0.005*"health" + 0.004*"school" + 0.004*"learn" + 0.004*"world"'),
 (3,
  '0.043*"china" + 0.020*"korea" + 0.019*"chinese" + 0.013*"north" + 0.011*"japan" + 0.011*"missile" + 0.010*"south" + 0.007*"brand" + 0.007*"beijing" + 0.007*"system"'),
 (4,
  '0.012*"police" + 0.008*"north" + 0.008*"attack" + 0.008*"force" + 0.007*"syrian" + 0.007*"people" + 0.007*"report" + 0.007*"group" + 0.006*"kill" + 0.006*"south"')]

### How to interpret this?
The top 10 keywords that contribute to the topic are showcased with their respective weight.

Let's try to interpret the 5 topics:

- Topic 1: key words like "Russia", "Country", "Government", "Minister" suggest **Politics in Russia**
- Topic 2: key words like "China", "Brexit","Trade", "Business", "Market" suggest **Inter country trade news**
- Topic 3: key words like "Player","Sport","World" suggest **Sports news (football)**
- Topic 4: key words like "People","Woman","Police", "Family,"Child" suggest **Domestic news**
- Topic 5: key words like "Trump", "State", "White", "Committee" suggest **Polictics in USA**

### Compute Model Perplexity and Coherence Score

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
#coherence_model_lda = models.CoherenceModel(model=lda_model, texts=lemma_doc, dictionary=word2id, coherence='c_v')
#coherence_lda = coherence_model_lda.get_coherence()
#print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.423501550211952


Coherence measures the relative distance between words within a topic. There are two major types C_V typically 0 < x < 1 and uMass -14 < x < 14.
Coherence score of 0.4 is low. I want to explore what would have been the ideal number of topics. Will explore the elbow method below. 

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model=LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42, update_every=1, chunksize=100, 
                     passes=10, alpha='auto', per_word_topics=False)
        model_list.append(model)

        #coherencemodel = models.CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        #coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=word2id, corpus=corpus, texts=lemma_doc, start=2, limit=100, step=10)
model_list

KeyboardInterrupt: 

In [17]:
#https://towardsdatascience.com/lets-build-an-article-recommender-using-lda-f22d71b7143e
def get_similarity(lda, query_vector):
    index = similarities.MatrixSimilarity(lda[corpus])
    
    sims = index[query_vector]
    return sims

In [74]:
#https://github.com/RaRe-Technologies/gensim/issues/2644
# its taking me 3 days for this fucking issue!!!!
query="Inspired By Erotic Arabic Poetry, Women Artists Depict Radical Love"
#query="Trump Says White House Could Mean Millions for Brand"
print(query.lower().split())
#words = word2id.doc2bow(query.split())
words = word2id.doc2bow(query.lower().split())
print(words)
print(word2id)


print("Top words identified: ")
for word in words:
    print("{} {}".format(word[0], word2id[word[0]]))

print("from here!!!!")
query_vector = lda_model[words]
print(query_vector)

sims = get_similarity(lda_model, query_vector)

sims = sorted(enumerate(sims), key=lambda item: -item[1])


['inspired', 'by', 'erotic', 'arabic', 'poetry,', 'women', 'artists', 'depict', 'radical', 'love']
[(680, 1), (1835, 1), (2460, 1), (2493, 1), (4700, 1)]
Dictionary(30441 unique tokens: ['     ', 'account', 'activist', 'addition', 'additional']...)
Top words identified: 
680 arabic
1835 love
2460 radical
2493 depict
4700 erotic
from here!!!!
[(0, 0.051980518), (1, 0.15822597), (2, 0.5722676), (3, 0.014561373), (4, 0.20296451)]


In [75]:
print(len(sims))
#print(sims)
print(sims[1])
print(sims[0][0])
print(sims[1][0])
for i in range(0,10):
    print(sims[i])

2673
(692, 0.9974572)
689
692
(689, 0.9980267)
(692, 0.9974572)
(2439, 0.99565595)
(81, 0.99429584)
(2255, 0.9941871)
(2651, 0.9938768)
(17, 0.99250656)
(728, 0.99131095)
(54, 0.9908147)
(674, 0.98875606)


In [76]:
idx = 0
pids = []
result = 10
article_ids = df['article_source_link'].values.tolist()

print("\nCheck out the links below:")
while result > 0:
    article_id = article_ids[sims[idx][0]]
    
    if article_id not in pids:
        pids.append(article_id)
        print("{}".format(article_id))
        result -= 1
    idx += 1


Check out the links below:
http://www.cnn.com/2017/02/09/health/dog-dna-death-penalty-eprise/index.html
http://www.cnn.com/2017/02/09/tennis/andy-murray-roger-federer-andy-murray-live/index.html
http://www.huffingtonpost.com/2017/03/15/rhythmic-gymnast-abuse-team-usa_n_15390974.html
http://www.bbc.co.uk/news/uk-england-38891475
http://www.huffingtonpost.com/2017/03/13/michael-brown-unedited-video_n_15346390.html
http://www.huffingtonpost.com/2017/03/16/mischa-barton-takes-legal-action-over-revenge-porn_n_15401950.html
http://abcnews.go.com/Sports/tom-brady-missing-super-bowl-jersey-shows-ebay/story?id=45299006
http://www.bbc.co.uk/news/world-australia-38916464
http://www.cnn.com/2016/04/21/us/project-vic-child-abuse/index.html
http://abcnews.go.com/Entertainment/anna-nicole-smiths-daughter-now-10-fearless-mom/story?id=45329494


In [73]:
#print(len(corpus))
#doc_topic_dist =documents_topic_distribution()
#doc_topic_dist
corpus[0]
#documents[0]
#%%cache mycache_lda_index.pkl index
#index = similarities.MatrixSimilarity(lda_model[corpus])

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 2),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 3),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 2),
 (31, 2),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 2),
 (36, 3),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 9),
 (48, 1),
 (49, 6),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 2),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 1),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 2),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 1),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 2),
 (83, 1),
 (84, 3),
 (85, 1),
 (86, 1),
 (87, 2),
 (88, 1),
 (89, 1),
 (90, 1),
 (91, 1),
 (92, 2),
 (93, 1),
 (94, 1),
 (95, 1),
 (96, 4),
 (97, 1),
 (98, 1),
 (99, 4),
 (100, 1),

In [None]:
#Use this to get the graph of optimal # of topics
model_list, coherence_values = compute_coherence_values(dictionary=word2id, corpus=corpus, texts=lemma_doc, start=2, limit=100, step=10)
# Show graph
import matplotlib.pyplot as plt
limit=100; start=2; step=10;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

KeyboardInterrupt: 

You can decide on the number of topics based on this analysis. Note that the Customization used for 5 topic model (lda_model) and the optimization models is difference therefore the Coherence score for 5 topics LDA model differ.

### Vizualize the topics

In [None]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model,corpus,word2id)