In [1]:
'''
Notebook created by: Gabriele Sottocornola
for the M.Sc. class of Data & Text Mining
'''
import re
import pandas as pd
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction import stop_words
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer



In [2]:
def reformat_gensim_topics(topics_list):
    '''
    Function to reformat the word-topic list given the output of gensim lda.

    topics_list: iterable of topics, each indexable as (topic_id, topic_string), where
    topic_string looks like 'weight*"word" + weight*"word" + ...'.
    Return a single string with one line per topic, formatted as
    'Topic <id>, <word>, <word>, ...' and terminated by a newline
    (words keep the double quotes found in the input). Empty input gives ''.
    '''
    report_lines = []

    for t in topics_list:
        topic_id = str(t[0])
        # Terms are separated by ' + ' and each term is 'weight*word'. Splitting on the
        # spaced separator, and only on the first '*', keeps words that themselves
        # contain '+' or '*' intact (the tokenizer used in this notebook allows both).
        words = [term.split('*', 1)[1].strip() for term in t[1].split(' + ')]
        report_lines.append(', '.join(['Topic ' + topic_id] + words) + '\n')

    return ''.join(report_lines)

In [3]:
def tokenization_preprocessing(corpus_list):
    '''
    Standard preprocessing for a corpus given as a list of document strings.
    Every document is lowercased and split into tokens; english stopwords are dropped,
    as are tokens shorter than 3 chars and tokens without any latin letter (e.g. numbers).
    Return a list of documents, each represented as a list of tokens
    '''
    word_tokenizer = RegexpTokenizer('[–\\w\\+\\-\\*′α-ωΑ-Ω]+')
    english_stopwords = stop_words.ENGLISH_STOP_WORDS

    def clean_document(document):
        #tokenize the lowercased text, then keep only the tokens that pass every filter
        kept_tokens = []
        for token in word_tokenizer.tokenize(document.lower()):
            if token in english_stopwords:
                continue #english stopword
            if len(token) <= 2:
                continue #too short
            if re.search('[a-zA-Z]', token) is None:
                continue #no latin letter at all (pure number / symbols)
            kept_tokens.append(token)
        return kept_tokens

    return [clean_document(document) for document in corpus_list]

In [4]:
def get_common_words_list(doc_tokens_list, top_n_words=100):
    '''
    Function to get the list of the top_n_words most frequent words inside the corpus.

    doc_tokens_list: list of documents, each represented as a list of tokens.
    top_n_words: maximum number of words to return; if the corpus holds fewer distinct
    words, all of them are returned (no IndexError). Values below 1 give empty lists.
    Return a tuple with a list of the most frequent word labels and a list of their
    frequencies, both sorted by descending frequency.
    '''
    dictionary = corpora.dictionary.Dictionary(doc_tokens_list) #dictionary of words extracted from the corpus (list of tokens)
    #the whole corpus flattened into one bag-of-words gives corpus-wide (word_id, frequency) pairs
    bow = dictionary.doc2bow([token for doc in doc_tokens_list for token in doc])
    bow.sort(key=lambda x: x[1], reverse=True) #bow representation of the dictionary sorted descending by frequency

    #slicing (instead of indexing range(top_n_words)) tolerates vocabularies smaller than top_n_words
    top_entries = bow[:max(top_n_words, 0)]

    word_label_list = [dictionary[entry[0]] for entry in top_entries]
    word_freq_list = [entry[1] for entry in top_entries]

    return (word_label_list, word_freq_list)

In [5]:
def stem_doc_tokens(doc_tokens_list):
    '''
    Apply Porter stemming to every word token of every document in doc_tokens_list.
    Return a new list of documents, each represented as a list of stemmed tokens
    (document order and token order are kept)
    '''
    porter = PorterStemmer()

    return [
        [porter.stem(word) for word in document_tokens]
        for document_tokens in doc_tokens_list
    ]

In [6]:
def filter_common_words(doc_tokens_list, list_filter_words):
    '''
    Function to filter out the most common words provided by list_filter_words from the doc_tokens_list corpus.

    doc_tokens_list: list of documents, each represented as a list of tokens.
    list_filter_words: collection of tokens to remove.
    Return a new list of documents, each represented as a list of filtered tokens
    (the input lists are not modified).
    '''
    #build the lookup once: membership on a frozenset is O(1) instead of a linear scan
    #of the list for every single token of the corpus
    words_to_drop = list_filter_words
    if isinstance(list_filter_words, (list, tuple)):
        try:
            words_to_drop = frozenset(list_filter_words)
        except TypeError:
            pass #unhashable entries: fall back to the original sequence lookup

    return [
        [token for token in tokens_list if token not in words_to_drop]
        for tokens_list in doc_tokens_list
    ]

In [7]:
def topic_model_evaluation_pipeline(doc_tokens_list, num_topics_list=(5, 10, 20, 30, 50), random_state=None):
    '''
    Function that allow to apply a set of different topic models (with different parameters)
    and evaluate the coherence of the models according to qualitative and quantitative measures (PMI-based).

    doc_tokens_list: list of documents, each represented as a list of tokens.
    num_topics_list: numbers of topics to try; one LDA model is trained for each value
    (the default reproduces the original fixed list).
    random_state: forwarded to LdaModel; pass an int to make the runs reproducible,
    leave None to keep the previous unseeded behaviour.
    The report (top words and coherence scores) is printed; nothing is returned.
    '''
    dictionary = corpora.dictionary.Dictionary(doc_tokens_list) #dictionary of words extracted from the corpus (list of tokens)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in doc_tokens_list] #gensim bag-of-words representation of corpus

    #evaluation of the coherence and quality of different topic models
    for nt in num_topics_list:

        print('GENSIM LDA WITH {} TOPICS \n'.format(nt))

        #apply LDA models and extract top10 most representative words for each topic
        ldamodel = LdaModel(bow_corpus, num_topics=nt, id2word=dictionary, alpha='asymmetric',
                            minimum_probability=0.0001, iterations=1000, random_state=random_state)
        topics = ldamodel.print_topics(num_topics=-1, num_words=10) #num_topics=-1 asks for all the topics

        print('Top10 words for each topic:\n')
        print(reformat_gensim_topics(topics))

        #measure of topic coherence based on PMI (more details here: http://qpleple.com/topic-coherence-to-evaluate-topic-models/)
        #c_uci is fed the tokenized texts, u_mass the bag-of-words corpus; both look at the top 30 words of each topic
        c_uci = CoherenceModel(model=ldamodel, texts=doc_tokens_list, dictionary=dictionary, coherence='c_uci', topn=30)
        u_mass = CoherenceModel(model=ldamodel, corpus=bow_corpus, coherence='u_mass', topn=30)

        print('UCI coherence for {} topics LDA = {}'.format(nt, c_uci.get_coherence()))
        print('UMASS coherence for {} topics LDA = {}'.format(nt, u_mass.get_coherence()))

        print('\n******************************************************************************\n')

In [8]:
###############################################################################################################################

In [9]:
#read textual documents from file (one document per line)
#forward slashes keep the relative path valid on Linux/macOS as well as on Windows
documents_path = './data/AssociatedPress.txt'
with open(documents_path, 'r', encoding='utf-8') as doc_f:
    corpus_list = doc_f.readlines()
#strip every '<digits>||' marker from each line
#(raw string: '\d' and '\|' are invalid escape sequences in a plain string literal)
corpus_list = [re.sub(r'\d+\|\|', '', doc) for doc in corpus_list] #list of documents in the corpus

In [10]:
#tokenize documents, remove stopwords and drop tokens shorter than 3 chars or without latin letters
doc_tokens_list = tokenization_preprocessing(corpus_list)

In [11]:
#stem word tokens (Porter stemmer), keeping one token list per document
stem_doc_tokens_list = stem_doc_tokens(doc_tokens_list)

In [12]:
#remove most frequent words: take the labels ([0]) of the 50 most frequent stems in the corpus
#and filter them out of every document
frequent_words_list = get_common_words_list(stem_doc_tokens_list, 50)[0]
final_doc_tokens_list = filter_common_words(stem_doc_tokens_list, frequent_words_list)

In [13]:
#apply and evaluate LDA models: prints the top10 words per topic and the UCI/UMASS coherence
#for each number of topics tried by the pipeline
topic_model_evaluation_pipeline(final_doc_tokens_list)

GENSIM LDA WITH 5 TOPICS 

Top10 words for each topic:

Topic 0, "democrat", "dollar", "dukaki", "vote", "rate", "bank", "south", "late", "leader", "campaign"
Topic 1, "case", "charg", "defens", "committe", "program", "offer", "comput", "judg", "north", "want"
Topic 2, "cent", "did", "drug", "attorney", "charg", "gener", "way", "dress", "campaign", "mecham"
Topic 3, "kill", "south", "monday", "militari", "right", "death", "home", "servic", "author", "oper"
Topic 4, "sale", "tax", "stock", "share", "depart", "gener", "industri", "increas", "foreign", "administr"

UCI coherence for 5 topics LDA = -0.4383156661665164
UMASS coherence for 5 topics LDA = -2.799180186770827

******************************************************************************

GENSIM LDA WITH 10 TOPICS 

Top10 words for each topic:

Topic 0, "senat", "committe", "germani", "meet", "right", "mecham", "world", "attorney", "nordstrom", "union"
Topic 1, "cent", "sale", "iraq", "higher", "oil", "lower", "econom", "soybea

In [14]:
#############################################################################################################################

## Take-aways

+ Topic models (e.g. LDA) provide powerful tools to represent documents in a lower-dimensional space

+ The Gensim package provides an LDA implementation based on online variational Bayes (details in the paper: https://papers.nips.cc/paper/3902-online-learning-for-latent-dirichlet-allocation)

+ Some preprocessing is often necessary/useful to "normalize" textual data (e.g. tokenization, stopword filtering, stemming, etc.)

+ Generated topics can be qualitatively interpreted and evaluated through their most representative words

+ Coherence measures based on PMI (here UCI and UMass) can be used to automatically assess topic quality (more details here: http://qpleple.com/topic-coherence-to-evaluate-topic-models/)