In [146]:
# Loading the data #

data_path = 'movies-pp.txt'

file = open(data_path).readlines()
count =  int(file[0].rstrip('\n'))
document = [file[i].rstrip('\n').split() for i in range(1,len(file))]

In [76]:
assert len(document) == count , 'Mismatch in number of documents'

In [77]:
# Getting the list of unique words in the document #

unique_words = []
for doc in document:
    unique_words.extend(doc)
    
unique_words = list(set(unique_words))

In [78]:
print("Unique words: ", len(unique_words))

Unique words:  46517


In [79]:
# Dictionary for storing the words and their index #

word_dict = dict(zip(unique_words, range(len(unique_words))))

In [80]:
import numpy as np

document_list = []
for doc in document:
    temp = [word_dict[word] for word in doc] # replacing words with its index #
    document_list.append(np.asarray(temp))

In [112]:
# Hyperparameter values #

alpha = 0.02
beta=0.1
num_topics=20
num_epochs=500
num_top_words=10

In [82]:
# Random Initialization #

from collections import defaultdict

num_words = len(unique_words) # number of unique words
num_documents = len(document_list) # number of documents
topic_word = np.zeros(shape=(num_topics, num_words)) # topic-word distribution #
document_topic = np.zeros(shape=(num_documents, num_topics)) # document-topic distribution #
topic = np.zeros(num_topics)
docs = np.zeros(num_documents)
topic_dict = defaultdict(int)

for i in range(0, num_documents):
    for j, word in enumerate(document_list[i]):
        z = np.random.randint(num_topics)
        document_topic[i, z] += 1
        docs[i] += 1
        topic_word[z, word] += 1
        topic[z] += 1
        topic_dict[(i, j)] = z

In [83]:
# Gibbs sampling #
# Computing probability using Gibbs sampling #
def gibbs_sample(word, doc_id, alpha, beta, num_topics, num_words, topic_word, topic, document_topic, docs):
    p_z = ((topic_word[:, word] + beta) / (topic + (num_words * beta))) * ((document_topic[doc_id, :] + alpha) / (docs[doc_id] + (num_topics * alpha)))
    p_z /= np.sum(p_z)
    return p_z

In [84]:
# Training #

import time

total_time_taken_start = time.time()
for epoch in range(num_epochs):
    start_time = time.time()
    for i in range(0, num_documents):
        for j, word in enumerate(document_list[i]):
            z = topic_dict[(i, j)]
            document_topic[i, z] -= 1
            docs[i] -= 1
            topic_word[z, word] -= 1
            topic[z] -= 1
            
            p_z = gibbs_sample(word, i, alpha, beta, num_topics, num_words, topic_word, topic, document_topic, docs)
            
            z = np.random.choice(np.arange(num_topics), p=p_z)
            topic_dict[(i, j)] = z
            document_topic[i, z] += 1
            docs[i] += 1
            topic_word[z, word] += 1
            topic[z] += 1

    print("Epoch: {}, time: {} s".format(epoch + 1, time.time() - start_time))

total_time_taken_end = time.time()
print("Total time taken (in mins) : ", (total_time_taken_end - total_time_taken_start)/60 )

Epoch: 1, time: 27.32591485977173 s
Epoch: 2, time: 29.744338750839233 s
Epoch: 3, time: 26.684393644332886 s
Epoch: 4, time: 27.15359139442444 s
Epoch: 5, time: 26.204424381256104 s
Epoch: 6, time: 27.096736192703247 s
Epoch: 7, time: 26.62901759147644 s
Epoch: 8, time: 27.229427099227905 s
Epoch: 9, time: 26.604106903076172 s
Epoch: 10, time: 26.16501498222351 s
Epoch: 11, time: 26.010724306106567 s
Epoch: 12, time: 25.99938988685608 s
Epoch: 13, time: 27.08127784729004 s
Epoch: 14, time: 26.067838191986084 s
Epoch: 15, time: 25.997677087783813 s
Epoch: 16, time: 25.987831115722656 s
Epoch: 17, time: 26.08024525642395 s
Epoch: 18, time: 26.14012360572815 s
Epoch: 19, time: 26.05844759941101 s
Epoch: 20, time: 26.484816551208496 s
Epoch: 21, time: 26.069823741912842 s
Epoch: 22, time: 26.237384796142578 s
Epoch: 23, time: 26.244136333465576 s
Epoch: 24, time: 26.007983207702637 s
Epoch: 25, time: 25.966712713241577 s
Epoch: 26, time: 26.01765012741089 s
Epoch: 27, time: 26.05763316154

In [86]:
# Dictionary containing the index and corresponding words #
id_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

In [None]:
# Getting the most frequent words from each topic #

top_words_per_topic = []
for i in range(0, num_topics):
    topic_i = topic_word[i]
    top_words = np.argsort(topic_i)[::-1] [:num_top_words] # argsort in descending order
    temp = []
    for w in top_words:
        temp.append(id_word_dict[w])
    top_words_per_topic.append(temp)

In [88]:
# Writing to text file #

output_file = "./outputs/topic_words_output.txt"
file = open(output_file, 'w')
file.write("Total time taken (in mins) : " +str((total_time_taken_end - total_time_taken_start)/60)+"\n")
for i, words in enumerate(top_words_per_topic):
    file.write("Topic : " +str(i+1)+"\n")
    for w in words:
        file.write(w + "  ")
    file.write("\n\n")
file.close()

## Discussion of results

By analyzing the most frequent words provided from each topic (topic_words_output.txt), we can see some closely related words are placed in same topic. For example:

1) the words like political, president, country are place in topic 1 (related to politics)

2) words like love, life, romantic are placed in topic 5 (related to love)

3) words like action, jackie, chan, hong, kong, martial, fight are placed in topic 6 (related to action movie)

4) words like war, soldiers, private, battle are placed in topic 8 (related to battle)

5) words like scream, horror, killer are placed in topic 10 (related to horror movie)

6) words like alien, earth, planet, space, effects, mars  are placed in topic 12 (related to space)

7) words like disney, animation, story, toy are placed in topic 14 (related to animation movie)

LDA learns from the dataset in a unsupervised way and places closely related topic in a single topic. Here, Mostly the words in each topic are related to other words in the topic.

Also, one thing we can note is we can also find words with the same base words placed on same topic like alien - aliens (in topic 12) and animation - animated (in topic 14). I think by using stemming/lemmatization in preproceesing steps, we could remove these affixes and have only base words in the document.

Also, time taken for single iteration is 27 seconds (approx)

Time taken for 500 iterations in 222 minutes (3 hour and 45 mins (approx))

## Extra work

In [None]:
# Topic distribution across the document #
# Identifing how much each document is related to a particular topic #

In [89]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [90]:
top_distribution_across_document = []

for i in range(0, num_documents):
    document_i = document_topic[i]
    # Applying softmax function to get the probability distributions #
    softmax_out = softmax(document_i)
    top_distribution_across_document.append(softmax_out) # topic distribution#

In [92]:
output_file = "./outputs/topic_document_distribution_output.txt"

file = open(output_file, 'w')
for i, line in enumerate(top_distribution_across_document):
    file.write("Document : " +str(i+1)+"\n\n")
    for topic_num, t in enumerate(line):
        file.write("Topic "+str(topic_num) + ":\t"+str(t)+"\n")
    file.write("\n\n")
file.close()

In [None]:
# Trying with different hyperparameter values #

In [147]:
# LDA - varying the hyperparameter - num topics: 10 and number of iterartions: 250#

document = document = [file[i].rstrip('\n').split() for i in range(1,len(file))]

unique_words = []
for doc in document:
    unique_words.extend(doc)
    
unique_words = list(set(unique_words))
word_dict = dict(zip(unique_words, range(len(unique_words))))

document_list = []
for doc in document:
    temp = [word_dict[word] for word in doc]
    document_list.append(np.asarray(temp))
    
alpha = 0.02
beta=0.1
num_topics=10
num_epochs=250
num_top_words=5

# Random Initialization #
num_words = len(unique_words)
num_documents = len(document_list)

# topic-word distribution #
topic_word = np.zeros(shape=(num_topics, num_words))

# document-topic distribution #
document_topic = np.zeros(shape=(num_documents, num_topics))

topic = np.zeros(num_topics)
docs = np.zeros(num_documents)
topic_dict = defaultdict(int)

for i in range(0, num_documents):
    for j, word in enumerate(document_list[i]):
        z = np.random.randint(num_topics)
        document_topic[i, z] += 1
        docs[i] += 1
        topic_word[z, word] += 1
        topic[z] += 1
        topic_dict[(i, j)] = z
        
        
total_time_taken_start = time.time()
for epoch in range(num_epochs):
    start_time = time.time()
    for i in range(0, num_documents):
        for j, word in enumerate(document_list[i]):
            z = topic_dict[(i, j)]
            document_topic[i, z] -= 1
            docs[i] -= 1
            topic_word[z, word] -= 1
            topic[z] -= 1
            
            p_z = gibbs_sample(word, i, alpha, beta, num_topics, num_words, topic_word, topic, document_topic, docs)
            
            z = np.random.choice(np.arange(num_topics), p=p_z)
            topic_dict[(i, j)] = z
            document_topic[i, z] += 1
            docs[i] += 1
            topic_word[z, word] += 1
            topic[z] += 1

    print("Epoch: {}, time: {} s".format(epoch + 1, time.time() - start_time))

total_time_taken_end = time.time()
print("\n Total time taken (in mins) : ", (total_time_taken_end - total_time_taken_start)/60 )

id_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

# Top words in a topic #
top_words_per_topic = []
for i in range(0, num_topics):
    topic_i = topic_word[i]
    top_words = np.argsort(topic_i)[::-1] [:num_top_words] # argsort in descending order
    temp = []
    for w in top_words:
        temp.append(id_word_dict[w])
    top_words_per_topic.append(temp)
    
output_file = "./outputs/topic_words_output_topic_10.txt"
file = open(output_file, 'w')
file.write("Total time taken (in mins) : " +str((total_time_taken_end - total_time_taken_start)/60)+"\n")
for i, words in enumerate(top_words_per_topic):
    file.write("Topic : " +str(i+1)+"\n")
    for w in words:
        file.write(w + "  ")
    file.write("\n\n")
file.close()

# Topic distribution in document #
top_distribution_across_document = []

for i in range(0, num_documents):
    document_i = document_topic[i]
    # Applying softmax function to get the probability distributions #
    softmax_out = softmax(document_i)
    top_distribution_across_document.append(softmax_out)
    
output_file = "./outputs/topic_document_distribution_output_topic_10.txt"

file = open(output_file, 'w')
for i, line in enumerate(top_distribution_across_document):
    file.write("Document : " +str(i+1)+"\n\n")
    for topic_num, t in enumerate(line):
        file.write("Topic "+str(topic_num) + ":\t"+str(t)+"\n")
    file.write("\n\n")
file.close()

Epoch: 1, time: 27.746720790863037 s
Epoch: 2, time: 28.352219820022583 s
Epoch: 3, time: 31.762882709503174 s
Epoch: 4, time: 29.403715133666992 s
Epoch: 5, time: 28.715131282806396 s
Epoch: 6, time: 28.381152629852295 s
Epoch: 7, time: 28.343016386032104 s
Epoch: 8, time: 28.38817834854126 s
Epoch: 9, time: 28.347832679748535 s
Epoch: 10, time: 29.105847120285034 s
Epoch: 11, time: 27.966973066329956 s
Epoch: 12, time: 28.13895606994629 s
Epoch: 13, time: 28.04894495010376 s
Epoch: 14, time: 27.757325887680054 s
Epoch: 15, time: 27.307960510253906 s
Epoch: 16, time: 30.315614700317383 s
Epoch: 17, time: 29.189720630645752 s
Epoch: 18, time: 28.254607439041138 s
Epoch: 19, time: 28.429627895355225 s
Epoch: 20, time: 27.783433198928833 s
Epoch: 21, time: 27.709852933883667 s
Epoch: 22, time: 27.730326175689697 s
Epoch: 23, time: 27.57526731491089 s
Epoch: 24, time: 28.22494602203369 s
Epoch: 25, time: 27.621623516082764 s
Epoch: 26, time: 28.21202254295349 s
Epoch: 27, time: 27.6568229

By analyzing the output we can see, the each topic has very closely related words ( for example: disney, animated, voice, animation - in topic 1, alien, effects, ship, aliens, special - in topic 5 and action, movie, jackie, bad, chan  - in topic 10)

In [None]:
# Trying on a different dataset - LDA on News paper dataset # 
# Using 20 newsgroups dataset provided by sklearn #

In [115]:
from sklearn.datasets import fetch_20newsgroups

cats = ['sci.space','sci.electronics'] # selecting data from only two categories
newsgroups_train = fetch_20newsgroups(subset='test', categories=cats)

In [116]:
# Number of documents in the dataset #
len(newsgroups_train['data'])

787

In [119]:
news_data = newsgroups_train['data']

In [133]:
# Preprocessing the data #

from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

def lemmatization(text):
    result=[]
    wordnet = WordNetLemmatizer()
    for token,tag in pos_tag(text):
        pos=tag[0].lower()
        if pos not in ['a', 'r', 'n', 'v']:
            pos='n'  
        result.append(wordnet.lemmatize(token,pos))
    return result

en_stopwords = stopwords.words('english')

def preprocess(sentence):
    result = []
    for text in sentence:
        temp = []
        # remove url pattern
        url_pattern = re.compile(r'https?://\S+|www\.\S+') 
        text = url_pattern.sub(r'', text)
        
        # remove punctuations
        tokenizer = nltk.RegexpTokenizer(r"\w+")
        text = tokenizer.tokenize(text)
        
        for token in text:
            # remove stop words
            if token not in en_stopwords:
                temp.append(token.lower())# change to lower case
                
        temp = lemmatization(temp) #lemmatization
        result.append(temp)
        
    return result

preprocessed_data = preprocess(news_data)

In [138]:
from nltk import FreqDist

def frequent_words(values):
    lst=[]
    for text in values:
        lst.extend(text)
    fdist=FreqDist(lst)
    return fdist.most_common(15)
freq_words = frequent_words(preprocessed_data)

In [139]:
lst = []
for a,b in freq_words:
    lst.append(b)

# removing frequent words #
def remove_freq_words(sentence):
    result = []
    for text in sentence:
        temp = []
        for item in text:
            if item not in lst:
                temp.append(item)
        result.append(temp)
    return result

preprocessed_data = remove_freq_words(preprocessed_data)

In [145]:
# LDA #
document = preprocessed_data.copy()

unique_words = []
for doc in document:
    unique_words.extend(doc)
    
unique_words = list(set(unique_words))
word_dict = dict(zip(unique_words, range(len(unique_words))))

document_list = []
for doc in document:
    temp = [word_dict[word] for word in doc]
    document_list.append(np.asarray(temp))
    
alpha = 0.02
beta=0.1
num_topics=3
num_epochs=200
num_top_words=5

# Random Initialization #

num_words = len(unique_words)
num_documents = len(document_list)

# topic-word distribution #
topic_word = np.zeros(shape=(num_topics, num_words))

# document-topic distribution #
document_topic = np.zeros(shape=(num_documents, num_topics))

topic = np.zeros(num_topics)
docs = np.zeros(num_documents)
topic_dict = defaultdict(int)

for i in range(0, num_documents):
    for j, word in enumerate(document_list[i]):
        z = np.random.randint(num_topics)
        document_topic[i, z] += 1
        docs[i] += 1
        topic_word[z, word] += 1
        topic[z] += 1
        topic_dict[(i, j)] = z
        
        
total_time_taken_start = time.time()
for epoch in range(num_epochs):
    start_time = time.time()
    for i in range(0, num_documents):
        for j, word in enumerate(document_list[i]):
            z = topic_dict[(i, j)]
            document_topic[i, z] -= 1
            docs[i] -= 1
            topic_word[z, word] -= 1
            topic[z] -= 1
            
            p_z = gibbs_sample(word, i, alpha, beta, num_topics, num_words, topic_word, topic, document_topic, docs)
            
            z = np.random.choice(np.arange(num_topics), p=p_z)
            topic_dict[(i, j)] = z
            document_topic[i, z] += 1
            docs[i] += 1
            topic_word[z, word] += 1
            topic[z] += 1

    print("Epoch: {}, time: {} s".format(epoch + 1, time.time() - start_time))

total_time_taken_end = time.time()
print("\n Total time taken (in mins) : ", (total_time_taken_end - total_time_taken_start)/60 )

id_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

# Top words in a topic #
top_words_per_topic = []
for i in range(0, num_topics):
    topic_i = topic_word[i]
    top_words = np.argsort(topic_i)[::-1] [:num_top_words] # argsort in descending order
    temp = []
    for w in top_words:
        temp.append(id_word_dict[w])
    top_words_per_topic.append(temp)
    
output_file = "./outputs/topic_words_output_news_data.txt"
file = open(output_file, 'w')
file.write("Total time taken (in mins) : " +str((total_time_taken_end - total_time_taken_start)/60)+"\n")
for i, words in enumerate(top_words_per_topic):
    file.write("Topic : " +str(i+1)+"\n")
    for w in words:
        file.write(w + "  ")
    file.write("\n\n")
file.close()

# Topic distribution in document #
top_distribution_across_document = []

for i in range(0, num_documents):
    document_i = document_topic[i]
    # Applying softmax function to get the probability distributions #
    softmax_out = softmax(document_i)
    top_distribution_across_document.append(softmax_out)
    
output_file = "./outputs/topic_document_distribution_output_news_data.txt"

file = open(output_file, 'w')
for i, line in enumerate(top_distribution_across_document):
    file.write("Document : " +str(i+1)+"\n\n")
    for topic_num, t in enumerate(line):
        file.write("Topic "+str(topic_num) + ":\t"+str(t)+"\n")
    file.write("\n\n")
file.close()

Epoch: 1, time: 5.392460823059082 s
Epoch: 2, time: 5.413684844970703 s
Epoch: 3, time: 5.5462000370025635 s
Epoch: 4, time: 5.592286109924316 s
Epoch: 5, time: 5.408380746841431 s
Epoch: 6, time: 5.3268866539001465 s
Epoch: 7, time: 5.366381883621216 s
Epoch: 8, time: 5.31929612159729 s
Epoch: 9, time: 5.357953310012817 s
Epoch: 10, time: 5.473526954650879 s
Epoch: 11, time: 6.24548864364624 s
Epoch: 12, time: 5.409957647323608 s
Epoch: 13, time: 5.43842339515686 s
Epoch: 14, time: 5.385294437408447 s
Epoch: 15, time: 5.448282718658447 s
Epoch: 16, time: 5.524552822113037 s
Epoch: 17, time: 5.379626750946045 s
Epoch: 18, time: 5.391550779342651 s
Epoch: 19, time: 5.488931179046631 s
Epoch: 20, time: 5.393146753311157 s
Epoch: 21, time: 5.462009906768799 s
Epoch: 22, time: 5.448183059692383 s
Epoch: 23, time: 5.3809967041015625 s
Epoch: 24, time: 5.399440288543701 s
Epoch: 25, time: 5.480351448059082 s
Epoch: 26, time: 5.3790106773376465 s
Epoch: 27, time: 5.518793344497681 s
Epoch: 28

Here, the words are not completely related with each other. The words like i, edu occurs in all the topics, the and space occurs in both topic 1 and 2, and word line occurs in both topic 1 and 3. Varying the hyperparameter values and removing words like 'i' in the preprocessing steps, might give some good results.