In [1]:
'''
Notebook created by: Gabriele Sottocornola
for the M.Sc. class of Data & Text Mining
'''
import os
import re

import numpy as np
from gensim import corpora
from gensim.models.wrappers import LdaMallet
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction import stop_words

In [2]:
def tokenization_preprocessing(corpus_list):
    '''
    Turn every raw document of the corpus into a cleaned list of word tokens.
    Each document is lowercased and split with a regexp tokenizer; a token is kept only if
    it is not an english stopword, is longer than 2 characters and contains at least one
    ASCII letter (so purely numeric or symbolic tokens are dropped).
    Return a list with one list of tokens per input document, in the same order
    '''
    splitter = RegexpTokenizer('[–\\w\\+\\-\\*′α-ωΑ-Ω]+')
    stopword_set = stop_words.ENGLISH_STOP_WORDS

    def keep(token):
        # the three filters applied to every token: stopword, minimum length, has a letter
        return (token not in stopword_set
                and len(token) > 2
                and re.search('[a-zA-Z]', token) is not None)

    return [[token for token in splitter.tokenize(raw_doc.lower()) if keep(token)]
            for raw_doc in corpus_list]

In [3]:
def stem_doc_tokens(doc_tokens_list):
    '''
    Apply the Porter stemmer to every token of every document in doc_tokens_list.
    The input is left untouched.
    Return a new list of documents, each represented as a list of stemmed tokens
    '''
    porter = PorterStemmer()
    # one stemmer instance is shared by all documents
    return [[porter.stem(word) for word in doc_tokens] for doc_tokens in doc_tokens_list]

In [4]:
def get_common_words_list(doc_tokens_list, top_n_words=100, top_freq=0.0):
    '''
    Rank the words of the corpus by number of occurrences and return the most frequent ones.
    By default the first top_n_words words are returned together with their absolute counts.
    When top_freq is non-zero it takes precedence over top_n_words: counts are turned into
    relative frequencies and only the leading words whose cumulative relative frequency
    stays <= top_freq are returned.
    Return a tuple (list of word labels, list of the matching frequencies)
    '''
    # dictionary of words extracted from the corpus (list of tokens)
    vocab = corpora.dictionary.Dictionary(doc_tokens_list)
    # the whole corpus seen as one big document, to count occurrences over all documents
    all_tokens = [token for doc in doc_tokens_list for token in doc]
    # (token id, count) pairs, most frequent first
    ranked = sorted(vocab.doc2bow(all_tokens), key=lambda pair: pair[1], reverse=True)

    labels = [vocab[token_id] for token_id, _ in ranked]
    freqs = [count for _, count in ranked]

    if top_freq:
        total = float(sum(freqs))
        freqs = [count / total for count in freqs]
        # number of leading words whose cumulative share does not exceed the threshold
        top_n_words = int(np.count_nonzero(np.cumsum(freqs) <= top_freq))

    print('number of most common words to filter out: {}'.format(top_n_words))

    return (labels[:top_n_words], freqs[:top_n_words])

In [5]:
def filter_common_words(doc_tokens_list, list_filter_words):
    '''
    Function to filter out the words provided by list_filter_words from the doc_tokens_list corpus.
    list_filter_words can be any iterable of (hashable) tokens; it is read once.
    The input documents are not modified.
    Return a list of documents, each represented as a list of filtered tokens
    '''
    # build the lookup once: set membership is constant time, while testing against a
    # list rescans it for every token; it also makes one-shot iterables safe to pass
    excluded = set(list_filter_words)

    return [[token for token in tokens_list if token not in excluded]
            for tokens_list in doc_tokens_list]

In [6]:
def topic_modeling(train_corpus, num_topics, label, mallet_path='/Users/gab/Documents/mallet-2.0.8/bin/mallet', test_corpus=None):
    '''
    Perform a topic modeling with num_topics on train corpus exploiting MalletLda wrapper. Perform
    topic inference on unseen documents in test corpus, when one is given.
    train_corpus and test_corpus are lists of documents, each represented as a list of tokens.
    label is the file name prefix of this run; all the files are written under './mallet_output/',
    which is created if missing.
    mallet_path is the path to the mallet executable: the default is specific to the machine of
    the author, pass it explicitly elsewhere.
    See folder 'mallet_output' for results: doc_topic distribution and topic_word_weights.
    Return the trained LdaMallet model
    '''
    # prefix to identify the specific mallet files for this run
    prefix = './mallet_output/' + label
    # the files of the run are written under this prefix, so the folder has to exist beforehand
    os.makedirs(os.path.dirname(prefix), exist_ok=True)

    # dictionary of words extracted from the train corpus (list of tokens)
    dictionary = corpora.dictionary.Dictionary(train_corpus)
    # gensim bag-of-words representation of the train corpus
    bow_train_corpus = [dictionary.doc2bow(tokens) for tokens in train_corpus]
    if test_corpus:
        # gensim bag-of-words representation of the test corpus, built on the train dictionary
        bow_test_corpus = [dictionary.doc2bow(tokens) for tokens in test_corpus]

    num_dict_words = len(dictionary)
    print('number of word in the dictionary: {}'.format(num_dict_words))
    LDA_model = LdaMallet(mallet_path, corpus=bow_train_corpus, num_topics=num_topics, id2word=dictionary,
                          optimize_interval=10, iterations=1000, prefix=prefix)
    if test_corpus:
        # NOTE(review): the inferred distribution is not returned; presumably the call is kept
        # for the inference files mallet writes under the prefix - confirm
        doc_topic_test = LDA_model[bow_test_corpus]
    create_topic_word_weights_file(LDA_model, num_topics, num_dict_words, prefix)

    return LDA_model

In [7]:
def create_topic_word_weights_file(LDA_model, num_topics, num_dict_words, label):
    '''
    Utility function to convert the topic word representation of LDA_model print_topic()
    into a Mallet-like representation of the word weights for each topic and each word
    in the dictionary.
    It writes the file label + 'wordweights.txt' (utf-8), one line per topic and word:
    topic id, word and weight separated by tabs. The weight is copied as printed by
    print_topic(). Nothing is returned.
    '''
    rows = []
    for topic in range(num_topics):
        topic_string = LDA_model.print_topic(topic, topn=num_dict_words)
        # expected form: 0.010*"word" + 0.008*"other" + ...
        # Terms are split on the spaced separator and each term only on its first '*',
        # because the word tokens of this notebook may themselves contain '+' or '*'.
        for term in topic_string.split(' + '):
            weight, separator, quoted_word = term.strip().partition('*')
            if not separator:
                # empty topic string or malformed term: nothing to write
                continue
            # drop the quotes surrounding the word
            word = quoted_word.strip()[1:-1]
            rows.append('{}\t{}\t{}\n'.format(topic, word, weight.strip()))

    with open(label + 'wordweights.txt', 'w', encoding='utf-8') as tww_file:
        tww_file.writelines(rows)

In [8]:
def show_topic_keys(topic_path, k=5):
    '''
    Given the path to the Mallet-like topickeys file (utf-8, one topic per line with
    tab separated fields: topic id, a second field that is ignored, space separated words),
    return a printable string with one line 'TOPIC <id>: word1, word2, ...' per topic,
    listing the first k words of the topic.
    Lines with less than three fields (e.g. blank lines) are skipped.
    '''
    summary_lines = []
    # explicit encoding, consistent with the other files read and written by this notebook
    with open(topic_path, encoding='utf-8') as topic_file:
        for line in topic_file:
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 3:
                continue

            topic_id = fields[0]
            # split on any whitespace so that the trailing space/newline of the line
            # never shows up as a word when k exceeds the number of words
            words = fields[2].split()
            summary_lines.append('TOPIC {}: {}\n'.format(topic_id, ', '.join(words[:k])))

    return ''.join(summary_lines)
     

In [9]:
#############################################################################################################################

In [10]:
# PARAMETERS #
# number of topics requested to the topic model
NUM_TOPICS = 25
# stem the word tokens before filtering out the most frequent words
STEMMING = True
# cumulative relative frequency threshold used to select the most frequent words
# (the commented alternative is presumably an absolute number of words - currently unused)
#TOP_WORDS_FILTER = 10
TOP_WORDS_FREQ = 0.2

# prefix of the mallet files written for this run
MALLET_LABEL = 'AssociatedPress_{}topics_'.format(NUM_TOPICS)

In [11]:
#load the corpus from file: every line of the text file is one document
documents_path = './data/AssociatedPress.txt'
with open(documents_path, mode='r', encoding='utf-8') as doc_f:
    corpus_list = list(doc_f)

In [12]:
#tokenize documents and remove stopwords
#(one list of lowercase tokens per document; tokens shorter than 3 chars or without letters are dropped too)
doc_tokens_list = tokenization_preprocessing(corpus_list)

In [14]:
#stem word tokens (only when STEMMING is set) and remove most frequent words
if STEMMING:
    stem_doc_tokens_list = stem_doc_tokens(doc_tokens_list)
    model_input_tokens = stem_doc_tokens_list
else:
    model_input_tokens = doc_tokens_list

#the same filtering applies to both cases, only the input tokens differ
frequent_words_list = get_common_words_list(model_input_tokens, top_freq=TOP_WORDS_FREQ)[0]
final_doc_tokens_list = filter_common_words(model_input_tokens, frequent_words_list)

number of most common words to filter out: 96


In [15]:
#train the topic model on the filtered tokens; the files of the run are written under ./mallet_output/
#and the returned model is displayed as the cell output
topic_modeling(final_doc_tokens_list, num_topics=NUM_TOPICS, label=MALLET_LABEL)

number of word in the dictionary: 26980


<gensim.models.wrappers.ldamallet.LdaMallet at 0x108b4fd68>

In [16]:
#print the first 10 words of every topic, read from the topickeys file of this run
print(show_topic_keys('./mallet_output/{}topickeys.txt'.format(MALLET_LABEL), k=10))

TOPIC 0: hospit, children, medic, health, diseas, drug, patient, doctor, care, mother
TOPIC 1: dukaki, campaign, jackson, republican, candid, presidenti, poll, governor, primari, convent
TOPIC 2: africa, african, black, mandela, india, white, independ, apartheid, indian, anc
TOPIC 3: tax, money, fund, loan, pay, incom, bond, file, debt, corpor
TOPIC 4: film, music, art, award, movi, record, perform, play, theater, star
TOPIC 5: rule, law, judg, appeal, attorney, violat, lawyer, file, lawsuit, claim
TOPIC 6: attorney, trial, investig, sentenc, prison, convict, judg, alleg, crime, prosecutor
TOPIC 7: build, wine, earthquak, club, homeless, quak, center, popul, white, editor
TOPIC 8: panama, network, noriega, broadcast, nbc, game, abc, televis, cb, haiti
TOPIC 9: water, river, southern, mile, area, rain, coast, wind, high, north
TOPIC 10: dollar, cent, oil, rose, point, index, exchang, late, higher, fell
TOPIC 11: japan, farmer, agricultur, farm, china, japanes, product, food, ton, soybea