Original idea from: https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/


In this post, we discuss techniques to visualize the output and results of a topic model (LDA) built with the gensim package. I will be using a portion of the 20 Newsgroups dataset, since the focus is more on approaches to visualizing the results.

Let’s begin by importing the packages and the 20 News Groups dataset.

In [1]:
import sys
# !{sys.executable} -m spacy download en
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Extend the NLTK English list with extra words to exclude from the vocabulary
# (e.g. 'from' and 'subject', which show up as tokens of the message headers).
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Silence DeprecationWarning messages.
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Only emit log records of level ERROR or above, prefixed with a timestamp.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
# Instructions about how to install MALLET are available here:
# http://mallet.cs.umass.edu/download.php
#
# Windows installation: after unzipping MALLET, set the environment variable
# %MALLET_HOME% to point to the MALLET directory. In all command line
# examples, substitute bin\mallet for bin/mallet.
import os
from gensim.models.wrappers import LdaMallet

# NOTE: for some reason MALLET only works when it is installed in this folder.
# The install directory is written once so the binary path and the
# MALLET_HOME environment variable cannot drift apart.
mallet_home = r'C:\mallet-2.0.8'
path_to_mallet_binary = mallet_home + r'\bin\mallet'
os.environ['MALLET_HOME'] = mallet_home


## Import NewsGroups Dataset


In [3]:
#pd.set_option('display.max_colwidth', None)  # Optional: show the full width of the column content (None replaces the deprecated -1)

In [4]:
# Import Dataset
# Downloads the newsgroups JSON from GitHub (requires network access) and
# parses it into a DataFrame.
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')


In [5]:
# Work on a random sample of 1000 documents; remove this line to use the full
# dataset. No random_state is set, so the sample differs between runs.
df = df.sample(1000)

In [6]:
df.head()

Unnamed: 0,content,target,target_names
977,From: nbetz@csi.compuserve.com (Nathan Betz)\n...,8,rec.motorcycles
1282,From: luriem@alleg.edu(Michael Lurie) The Libe...,9,rec.sport.baseball
10765,From: sera@zuma.UUCP (Serdar Argic)\nSubject: ...,17,talk.politics.mideast
1702,From: bgendler@opus.starlab.CSc.COM (Bruce Gen...,5,comp.windows.x
5730,From: HO@kcgl1.eng.ohio-state.edu (Francis Ho)...,6,misc.forsale


## Tokenize Sentences and Clean

Remove the emails, newline characters and single quotes, and finally split each document into a list of words using gensim’s simple_preprocess(). The tokenizer itself discards punctuation; setting deacc=True additionally strips accent marks.

In [7]:
def sent_to_words(sentences):
    """Lazily tokenize raw documents.

    For each item of *sentences* the text is cleaned (email-like tokens
    removed, whitespace runs collapsed to one space, single quotes dropped)
    and then tokenized with ``gensim.utils.simple_preprocess(deacc=True)``.

    Args:
        sentences: iterable of documents; each item is converted with
            ``str()`` before cleaning.

    Yields:
        The token list produced by ``simple_preprocess`` for each document.
    """
    for sent in sentences:
        # Convert first so non-string items do not make re.sub raise.
        sent = str(sent)
        # Raw strings: '\S' and '\s' are invalid escapes in plain literals.
        sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub(r'\s+', ' ', sent)  # collapse newlines / whitespace runs
        sent = sent.replace("'", "")  # remove single quotes
        yield gensim.utils.simple_preprocess(sent, deacc=True)

# Convert the 'content' column to a plain Python list of raw documents
data = df.content.values.tolist()
# Tokenize every document (materializes the sent_to_words generator)
data_words = list(sent_to_words(data))
# Show the token list of the first document
print(data_words[:1])
# Example output from the original article (truncated):
# [['from', 'irwin', 'arnstein', 'subject', 're', 'recommendation', 'on', 'duc', 'summary', 'whats', 'it', 'worth', 'distribution', 'usa', 'expires', 'sat', 'may', 'gmt', ...truncated...]]

[['from', 'nathan', 'betz', 'subject', 'first', 'bike', 'honda', 'ascot', 'organization', 'compuserve', 'incorporated', 'lines', 'hi', 'folks', 'im', 'going', 'to', 'be', 'buying', 'my', 'first', 'bike', 'and', 'im', 'considering', 'an', 'honda', 'ascot', 'ft', 'with', 'less', 'than', 'miles', 'does', 'this', 'sound', 'like', 'reasonable', 'choice', 'is', 'there', 'anything', 'special', 'need', 'to', 'know', 'thanks', 'nathan']]


## 4. Build the Bigram, Trigram Models and Lemmatize


Let’s form the bigram and trigrams using the Phrases model. This is passed to Phraser() for efficiency in speed of execution.

Next, lemmatize each word to its root form, keeping only nouns, adjectives, verbs and adverbs.

We keep only these POS tags because they are the ones contributing the most to the meaning of the sentences. Here, I use spacy for lemmatization.

In [8]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-transformed corpus.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Wrap the trained models in Phraser objects; these are the ones applied to
# documents inside process_words().
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# !python3 -m spacy download en_core_web_sm  # run in terminal once
def process_words(texts, stop_words=stop_words, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Remove stopwords, form bigrams/trigrams and lemmatize.

    Args:
        texts: iterable of documents (token lists as produced by
            sent_to_words()).
        stop_words: iterable of words to drop, both before phrase detection
            and again after lemmatization.
        allowed_postags: spaCy coarse POS tags to keep. A tuple is used as
            the default so no mutable object is shared between calls.

    Returns:
        A list with one cleaned, lemmatized token list per input document.
    """
    # Set membership is O(1); the stop-word list is probed once per token.
    stop_set = set(stop_words)
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    # bigram_mod is applied a second time here, exactly as in the original
    # pipeline, before the trigram model runs.
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    # Parser and NER are disabled: only POS tags and lemmas are read below.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts_out]

data_ready = process_words(data_words)  # processed Text Data!

# Build the topic model

In [9]:
#with gensim and mallet

In [10]:
# Create Dictionary: token <-> integer id mapping built from the processed documents
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
# Each document becomes its bag-of-words representation under id2word.
corpus = [id2word.doc2bow(text) for text in data_ready]




In [11]:
# Train a 20-topic model through the MALLET wrapper, using the corpus and the
# id2word dictionary built in the previous cell.
lda_model = LdaMallet(path_to_mallet_binary, corpus=corpus, num_topics=20, id2word=id2word)

To build the LDA topic model using LdaMallet(), you need the corpus and the dictionary, which were created in the previous cell. The trained topics (keywords and weights) are printed below.



In [12]:
pprint(lda_model.print_topics())

[(0,
  '0.026*"read" + 0.024*"write" + 0.022*"time" + 0.021*"rate" + 0.020*"bit" + '
  '0.019*"give" + 0.016*"test" + 0.015*"design" + 0.013*"power" + '
  '0.013*"build"'),
 (1,
  '0.031*"true" + 0.025*"argument" + 0.024*"thing" + 0.021*"claim" + '
  '0.018*"question" + 0.013*"evidence" + 0.012*"truth" + 0.012*"conclusion" + '
  '0.012*"reason" + 0.011*"occur"'),
 (2,
  '0.128*"write" + 0.070*"article" + 0.028*"organization" + 0.019*"talk" + '
  '0.018*"reply" + 0.016*"opinion" + 0.015*"bad" + 0.014*"real" + '
  '0.014*"assume" + 0.014*"agree"'),
 (3,
  '0.022*"information" + 0.021*"internet" + 0.020*"computer" + 0.020*"privacy" '
  '+ 0.016*"email" + 0.015*"key" + 0.013*"encryption" + 0.013*"network" + '
  '0.013*"policy" + 0.012*"technology"'),
 (4,
  '0.030*"state" + 0.026*"law" + 0.020*"government" + 0.012*"full" + '
  '0.011*"order" + 0.010*"question" + 0.010*"citizen" + 0.010*"party" + '
  '0.009*"plan" + 0.009*"political"'),
 (5,
  '0.038*"write" + 0.017*"mail" + 0.016*"send" + 

# Get most relevant documents

In [33]:
# Document-topic matrix: one row per document, one column per topic id.
# lda_model[corpus] yields, per document, a list of (topic_index, contribution)
# pairs. The frame is built in a single call because DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
# Contributions are placed by topic id rather than by position; a topic
# missing from a document's list (if that can happen) gets 0.0.
matrix_documents_topic_contribution = pd.DataFrame(
    [dict(doc_topics) for doc_topics in lda_model[corpus]],
    columns=range(lda_model.num_topics),
).fillna(0.0)

In [34]:
matrix_documents_topic_contribution.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.042373,0.048023,0.055556,0.042373,0.042373,0.061205,0.042373,0.059322,0.066855,0.042373,0.042373,0.055556,0.080038,0.053672,0.046139,0.042373,0.046139,0.042373,0.046139,0.042373
1,0.038889,0.043651,0.148413,0.043651,0.035714,0.059524,0.061111,0.057937,0.09127,0.042063,0.035714,0.037302,0.037302,0.038889,0.037302,0.037302,0.037302,0.037302,0.042063,0.037302
2,0.0268,0.016628,0.011541,0.031495,0.326487,0.012715,0.01741,0.018975,0.029147,0.058099,0.008803,0.0268,0.01428,0.054577,0.012715,0.015454,0.018584,0.236502,0.015454,0.047535
3,0.035948,0.063399,0.051634,0.050327,0.034641,0.032026,0.047712,0.037255,0.029412,0.041176,0.029412,0.043791,0.05817,0.039869,0.188889,0.064706,0.029412,0.043791,0.034641,0.043791
4,0.053105,0.038399,0.051471,0.041667,0.043301,0.072712,0.036765,0.044935,0.036765,0.036765,0.036765,0.056373,0.061275,0.040033,0.125,0.066176,0.036765,0.038399,0.040033,0.043301


In [35]:
# Drop the original row labels of df so the texts get a fresh 0..N-1 index,
# matching the index of the topic matrix built with ignore_index=True.
contents = pd.Series(df['content']).reset_index(drop=True)

In [36]:
# Append the raw text as an extra 'content' column next to the topic columns.
matrix_documents_topic_contribution = pd.concat([matrix_documents_topic_contribution, contents], axis=1)


In [38]:
# One dict per document: topic id -> contribution, plus the 'content' text.
relevantDocumentsDict = matrix_documents_topic_contribution.to_dict(orient='records')

[{0: 0.042372881355932195,
  1: 0.048022598870056485,
  2: 0.05555555555555554,
  3: 0.042372881355932195,
  4: 0.042372881355932195,
  5: 0.061205273069679836,
  6: 0.042372881355932195,
  7: 0.05932203389830507,
  8: 0.06685499058380413,
  9: 0.042372881355932195,
  10: 0.042372881355932195,
  11: 0.05555555555555554,
  12: 0.08003766478342748,
  13: 0.053672316384180775,
  14: 0.04613935969868173,
  15: 0.042372881355932195,
  16: 0.04613935969868173,
  17: 0.042372881355932195,
  18: 0.04613935969868173,
  19: 0.042372881355932195,
  'content': "From: nbetz@csi.compuserve.com (Nathan Betz)\nSubject: First bike: Honda Ascot?\nOrganization: CompuServe Incorporated\nLines: 10\n\nHi folks.\n \nI'm going to be buying my first bike and I'm considering an 82\nHonda Ascot FT500 with less than 5K miles. Does this sound like a\nreasonable choice? Is there anything special I need to know?\n \nThanks.\n \n-Nathan\n\n"},
 {0: 0.038888888888888896,
  1: 0.043650793650793655,
  2: 0.1484126984126

# Topic similarity metric

In [49]:
from gensim.models import KeyedVectors


In [50]:
# Choose the number of top keywords and top documents to consider in the metric

topn = 20  # number of top keywords per topic
topkdocuments = 20  # number of top documents per topic
relevance_lambda = 0.6  # NOTE(review): not used in the visible cells; presumably the relevance lambda — confirm

In [54]:
# Path to the pre-trained word-embedding file ("ruta" = path).
ruta_word_embedding = '../data/wiki.multi.en.vec'
# Load the vectors with gensim's word2vec-format reader (default arguments).
word_embedding_model = KeyedVectors.load_word2vec_format(ruta_word_embedding)

In [52]:
#Note, that vectors are going to be calculated according to topic order of PreparedData
def get_dicts_relevant_keywords_documents(lda_model,df_relevant_documents, n_terms, topic_order, PreparedData_dict_with_more_info):
    """Collect the most relevant keywords of every topic.

    Args:
        lda_model: object exposing ``num_topics``.
        df_relevant_documents: unused; kept for interface compatibility.
        n_terms: maximum number of keywords to keep per topic.
        topic_order: sequence of 1-based model topic ids; the 1-based position
            of a topic in this sequence selects its ``'Topic<k>'`` category.
        PreparedData_dict_with_more_info: DataFrame with at least the columns
            ``'Category'``, ``'Term'`` and ``'relevance'``.

    Returns:
        dict mapping each model topic id (0-based) to a list of
        ``{"term": ..., "relevance": ...}`` dicts, sorted by decreasing
        relevance and truncated to *n_terms* entries.

    Raises:
        ValueError: if a topic id is missing from *topic_order*.
    """
    topKeywordsDict = {}
    for topic_id in range(lda_model.num_topics):
        # Keys follow the lda_model topic order; the category label follows
        # the position of that topic inside topic_order.
        topic_on_tinfo = topic_order.index(topic_id + 1) + 1
        topic_rows = PreparedData_dict_with_more_info.loc[
            PreparedData_dict_with_more_info['Category'] == 'Topic' + str(topic_on_tinfo)
        ]
        top_rows = topic_rows.sort_values(by='relevance', ascending=False)[['Term', 'relevance']][:n_terms]
        # Build the list directly instead of appending from a closure run by
        # DataFrame.apply: apply is not meant for side effects.
        topKeywordsDict[topic_id] = [
            {"term": term, "relevance": relevance}
            for term, relevance in zip(top_rows['Term'], top_rows['relevance'])
        ]
    return topKeywordsDict


def getDocumentVector(text, wordembedding,  topic_id , topic_order, PreparedData_dict_with_more_info):
    """Build a rank-weighted embedding vector for one document and one topic.

    Every token of the cleaned text that is also a term of the topic adds
    ``embedding[token] / rank`` to the result, where ``rank`` is the 1-based
    position of the term when the topic's terms are sorted by decreasing
    relevance.

    Args:
        text: raw document, passed to ``text_cleaner`` (defined outside this
            block; presumably returns an iterable of tokens — confirm).
        wordembedding: word-vector container supporting ``in`` and ``[]``;
            if it exposes a ``wv`` attribute, that attribute is used instead.
        topic_id: 0-based topic id of the LDA model.
        topic_order: sequence of 1-based topic ids used to locate the topic's
            ``'Topic<k>'`` category.
        PreparedData_dict_with_more_info: DataFrame with the columns
            ``'Category'``, ``'Term'`` and ``'relevance'``.

    Returns:
        The accumulated vector, or the float ``0.0`` when no token matched.
    """
    topic_on_tinfo = topic_order.index(topic_id + 1) + 1
    topic_rows = PreparedData_dict_with_more_info.loc[
        PreparedData_dict_with_more_info['Category'] == 'Topic' + str(topic_on_tinfo)
    ]
    ranked_terms = topic_rows.sort_values(by='relevance', ascending=False)['Term'].tolist()

    # term -> 1-based rank, computed once. setdefault keeps the first (best)
    # rank of a repeated term, which is what list.index() returned.
    term_rank = {}
    for position, term in enumerate(ranked_terms, start=1):
        term_rank.setdefault(term, float(position))

    # Word2Vec-style models keep their vectors under .wv; a bare KeyedVectors
    # (gensim 4) has no such attribute and is used directly.
    vectors = getattr(wordembedding, 'wv', wordembedding)

    document_vector = 0.0
    for word in text_cleaner(text):
        rank = term_rank.get(word)
        if rank is None:
            continue  # token is not a term of this topic
        if word in vectors:
            document_vector += vectors[word] / rank  # weight by keyword rank
        else:
            print("WARNING, Word not found:", word)
    return document_vector
            
def get_topkeywords_relevantdocuments_vectors(wordembedding, lda_model,most_relevant_documents,  n_terms, topic_order, PreparedData_dict_with_more_info, topkdocuments): #n_terms: number of top keywords to consider
    """Compute, per topic, a top-keywords vector and a top-documents vector.

    Args:
        wordembedding: word-vector container supporting ``in`` and ``[]``;
            if it exposes a ``wv`` attribute, that attribute is used instead.
        lda_model: object exposing ``num_topics``.
        most_relevant_documents: only forwarded to
            get_dicts_relevant_keywords_documents().
        n_terms: number of top keywords per topic.
        topic_order: sequence of 1-based topic ids (see getDocumentVector).
        PreparedData_dict_with_more_info: DataFrame with the columns
            ``'Category'``, ``'Term'`` and ``'relevance'``.
        topkdocuments: number of top documents per topic.

    Returns:
        Tuple ``(topkeywords_vectors_dict, relevantdocuments_vectors_dict)``,
        both keyed by 0-based topic id.

    NOTE(review): the documents are read from the module-level DataFrame
    ``matrix_document_topic_distribution`` (string topic-id columns plus
    'texto_completo'), not from ``most_relevant_documents``. That name is not
    defined in the visible part of the notebook — confirm it exists.
    """
    topKeywordsDict = get_dicts_relevant_keywords_documents(lda_model, most_relevant_documents, n_terms, topic_order,  PreparedData_dict_with_more_info)

    # Word2Vec-style models keep their vectors under .wv; a bare KeyedVectors
    # (gensim 4) has no such attribute and is used directly.
    vectors = getattr(wordembedding, 'wv', wordembedding)
    num_topics = lda_model.num_topics

    # Top-keyword vector per topic: each keyword is weighted by 1 / rank.
    topkeywords_vectors_dict = {}
    for topic_id in range(num_topics):
        topkeywords_vector = 0
        for position, item in enumerate(topKeywordsDict[topic_id], start=1):
            ranking = float(position)  # rank advances even if the term is missing
            if item['term'] in vectors:
                topkeywords_vector += vectors[item['term']] / ranking
            else:
                print("WARNING NOT FOUND: ", item['term']," position:",ranking)
        topkeywords_vectors_dict[topic_id] = topkeywords_vector

    # Top-document vector per topic: each document vector is weighted by the
    # document's contribution to the topic.
    relevantdocuments_vectors_dict = {}
    for topic_id in range(num_topics):
        topic_column = str(topic_id)
        top_documents = matrix_document_topic_distribution.sort_values(by=[topic_column], ascending=False)[[topic_column, 'texto_completo']][0:topkdocuments]
        relevantDocumentsvector = 0.0
        for _, item in top_documents.iterrows():
            # TODO: could apply a rank-based weighting here, similar to the
            # one used for the top keywords.
            relevantDocumentsvector += float(item[topic_column]) * getDocumentVector(item['texto_completo'], wordembedding, topic_id, topic_order, PreparedData_dict_with_more_info)
        relevantdocuments_vectors_dict[topic_id] = relevantDocumentsvector
    return (topkeywords_vectors_dict, relevantdocuments_vectors_dict)

#Here, we calculate the topkeywords_vector and the relevant documents_vector for each topic
#and combine them as: final topic vector = (lambda)*topic_keyword_vector + (1-lambda)*topic_document_vector
#This is evaluated several times because we are going to try different lambda values (between 0 and 1)
def get_topic_vectors(wordembedding, lda_model,most_relevant_documents,  n_terms, lambda_, topic_order, PreparedData_dict_with_more_info, topkdocuments):
    """Blend the keyword vector and the document vector of every topic.

    For each topic id in ``range(lda_model.num_topics)`` the result is
    ``lambda_ * keyword_vector + (1 - lambda_) * document_vector``, with both
    vectors taken from get_topkeywords_relevantdocuments_vectors().

    Returns:
        dict mapping topic id to its blended vector.
    """
    topic_count = lda_model.num_topics
    keyword_vectors, document_vectors = get_topkeywords_relevantdocuments_vectors(
        wordembedding, lda_model, most_relevant_documents, n_terms,
        topic_order, PreparedData_dict_with_more_info, topkdocuments,
    )
    return {
        topic_id: lambda_ * keyword_vectors[topic_id] + (1 - lambda_) * document_vectors[topic_id]
        for topic_id in range(topic_count)
    }

#This matrix is calculated by a specific lambda. 
def get_matrix_by_lambda(wordembedding, lda_model_1,most_relevant_documents_1,lda_model_2,most_relevant_documents_2, n_terms, lambda_, topic_order_1, topic_order_2, PreparedData_dict_with_more_info_1, PreparedData_dict_with_more_info_2, topkdocuments):
    """Cosine-similarity matrix between the topics of two models for one lambda.

    Each topic is represented by
    ``lambda_ * topic_keyword_vector + (1 - lambda_) * topic_document_vector``
    as computed by get_topic_vectors().

    Returns:
        numpy array of shape ``(lda_model_1.num_topics,
        lda_model_2.num_topics)``; entry ``[i, j]`` is the cosine similarity
        between topic *i* of model 1 and topic *j* of model 2.

    NOTE(review): ``cosine_similarity`` is not imported in the visible part of
    the notebook (presumably sklearn.metrics.pairwise) — confirm.
    """
    final_topic_vectors_dict_1 =  get_topic_vectors(wordembedding, lda_model_1,most_relevant_documents_1,  n_terms, lambda_, topic_order_1, PreparedData_dict_with_more_info_1, topkdocuments)
    final_topic_vectors_dict_2 =  get_topic_vectors(wordembedding, lda_model_2,most_relevant_documents_2,  n_terms, lambda_, topic_order_2,  PreparedData_dict_with_more_info_2, topkdocuments)

    topic_similarity_matrix = []
    for i in range(lda_model_1.num_topics):
        # The reshape of topic i does not depend on j, so it is done once.
        topic_i = final_topic_vectors_dict_1[i].reshape(1, -1)
        row = []
        for j in range(lda_model_2.num_topics):
            topic_j = final_topic_vectors_dict_2[j].reshape(1, -1)
            # cosine_similarity returns a (1, 1) array; index the scalar out
            # explicitly, since float() on a non-0-d array is deprecated.
            row.append(float(cosine_similarity(topic_i, topic_j)[0, 0]))
        topic_similarity_matrix.append(row)
    return np.asarray(topic_similarity_matrix)


In [53]:
def get_matrix(wordembedding, lda_model_1,most_relevant_documents_1,lda_model_2,most_relevant_documents_2, n_terms, topic_order_1, topic_order_2, PreparedData_dict_with_more_info_1, PreparedData_dict_with_more_info_2, topkdocuments):
    """Compute the topic-similarity matrix for every lambda in 0.00 .. 1.00.

    All arguments are forwarded unchanged to get_matrix_by_lambda(); lambda is
    stepped in increments of 0.01.

    Returns:
        dict mapping each lambda (float rounded to two decimals) to the
        similarity matrix returned by get_matrix_by_lambda().
    """
    matrices_dict = dict()
    # Integer steps avoid the drift of repeatedly adding 0.01 to a float and
    # pin the range to exactly 0.00 .. 1.00 (101 values).
    for step in range(101):
        lambda_ = round(step / 100, 2)
        print(lambda_)  # progress indicator
        # Use this function's own arguments; the previous body read
        # module-level globals and silently ignored what the caller passed.
        matrices_dict[lambda_] = get_matrix_by_lambda(wordembedding, lda_model_1, most_relevant_documents_1, lda_model_2, most_relevant_documents_2, n_terms, lambda_, topic_order_1, topic_order_2, PreparedData_dict_with_more_info_1, PreparedData_dict_with_more_info_2, topkdocuments)
    return matrices_dict

In [None]:
# Compare the model against itself for every lambda.
# NOTE(review): n_terms, topic_order_1, topic_order_2 and PreparedData_dict are
# not defined in the visible part of the notebook (topn is presumably the
# intended value for n_terms) — confirm before running this cell.
topic_similarity_matrix = get_matrix(word_embedding_model, lda_model,relevantDocumentsDict,lda_model,relevantDocumentsDict, n_terms, topic_order_1, topic_order_2, PreparedData_dict, PreparedData_dict, topkdocuments)

# Create prepared data