### Topic modeling using Latent Dirichlet Allocation

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import models, corpora

In [10]:
def load_data(input_file):
    data = []
    with open(input_file) as f:
        for line in f.readlines():
            data.append(line[:-1])
            
    return data

In [3]:
def process(input_text):
    #Processor function for tokenizing, removing stop words and stemming
    tokenizer = RegexpTokenizer(r'\w+')   #Create a regular expression tokenizer
    tokens = tokenizer.tokenize(input_text.lower())    #Tokenize the input string
    
    stemmer = SnowballStemmer('english')
    
    #Get the list of stop-words which would be removed since they don't add information
    stop_words = stopwords.words('english')
    
    #Remove the stop words and stem the tokenized words
    tokens = [token for token in tokens if not token in stop_words]
    tokens_stemmed = [stemmer.stem(token) for token in tokens]
    
    return tokens_stemmed

In [11]:
data = load_data('data/data_topic_modelling.txt')

In [13]:
#Create a list for sentence tokens
tokens = [process(x) for x in data]

#Create a dictionary based on the sentence tokens
dict_tokens = corpora.Dictionary(tokens)

#Create a document-term matrix
doc_term_mat = [dict_tokens.doc2bow(token) for token in tokens]

In [20]:
#Through observation, the input_text has two distinct topics
num_topics = 2

#Generate the Latent Dirichlet Allocation model
ldamodel = models.ldamodel.LdaModel(doc_term_mat, num_topics = num_topics, id2word = dict_tokens, passes = 25)

In [22]:
num_words = 5
print(f'Top {num_words} contributing words to each topic')

for item in ldamodel.print_topics(num_topics = num_topics, num_words = num_words):
    print('\nTopic', item[0])
    
    #Print the contributing words along with their relative contributions
    list_of_strings = item[1].split(' + ')
    for text in list_of_strings:
        weight = text.split('*')[0]
        word = text.split('*')[1]
        print(word, '==>', round(float(weight) * 100, 2), '%')

Top 5 contributing words to each topic

Topic 0
"mathemat" ==> 3.1 %
"call" ==> 3.1 %
"set" ==> 1.9 %
"structur" ==> 1.9 %
"histor" ==> 1.9 %

Topic 1
"empir" ==> 3.9 %
"cultur" ==> 2.8 %
"europ" ==> 2.8 %
"expand" ==> 2.8 %
"formul" ==> 1.7 %
