# Topic Modeling Technique - Latent Dirichlet Allocation (LDA)
## Konstantina Andronikou 

## This notebook is adapted from the tutorial by Piek Vossen:
https://github.com/cltl/ba-text-mining/blob/master/lab_sessions/lab6/Lab6.2-Topic-modeling-gensim.ipynb

In [None]:
#Importing all relevant packages
import gensim
import numpy as np
import pandas as pd
import nltk
# nltk.download('wordnet')
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim.matutils import cossim
from nltk.stem.porter import *

In [None]:
# Read the pre-processed corpus generated by pre_processing.ipynb.
# The file is tab-separated with no header row; every field is read as a string.
documents = pd.read_csv(
    'data/Input_for_topic_model.tsv',
    sep='\t',
    header=None,
    dtype=str,
    encoding='latin1',
)

In [None]:
# Keep only the first column of the loaded table, under the name 'text'.
# Missing values are replaced by empty strings so that every entry is a str.
text_column = documents[0].fillna('').astype(str)
full_train = pd.DataFrame({'text': text_column})
# NOTE(review): this is not the last expression of the cell, so the preview
# is presumably not displayed in the notebook - confirm whether it is needed.
full_train.head()
# From here on `documents` refers to the single-column dataframe.
documents = full_train

In [None]:
# Small additional Pre-processing steps 
def lemmatize_stemming(text):
    """ Returns the WordNet lemma of a single word.
        Despite the name, no stemming is applied: the word is only passed
        through NLTK's WordNetLemmatizer with its default settings.
        Argument: text (the word to lemmatize; preprocess passes one token per call)
    """
    return WordNetLemmatizer().lemmatize(text)
def preprocess(text):
    """ Tokenizes one document and returns its filtered, lemmatized tokens.
        Argument: text (the text of a single document)
        Returns: list of lemmatized tokens in document order; gensim stopwords
                 and tokens of 3 characters or fewer are left out
    """
    # The fully qualified gensim name is used on purpose: the bare name
    # STOPWORDS is re-imported from wordcloud later in this notebook.
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

In [None]:
# Run preprocess on every document; the result holds one token list per row.
text_series = documents['text']
processed_docs = text_series.map(preprocess)

In [None]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenized documents.
dictionary = gensim.corpora.Dictionary(processed_docs)
# Walk over the first entries of the dictionary, stopping at the 11th one.
# NOTE(review): the loop body has no visible output; presumably a
# print(k, v) preview of the entries was intended here - confirm or drop the loop.
count = 0
for count, (k, v) in enumerate(dictionary.iteritems(), start=1):
    if count > 10:
        break

In [None]:
# Drop tokens that appear in fewer than 15 documents or in more than 50% of
# all documents, then keep at most the 100000 most frequent remaining tokens.
dictionary.filter_extremes(
    no_below=15,
    no_above=0.5,
    keep_n=100000,
)

In [None]:
# Bag-of-words representation of every document: doc2bow turns a token list
# into a list of (token_id, count) pairs for the tokens kept in the dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

## Training and implementing LDA with the following parameters: 
        1.corpus = The corpus as BoW vectors (here: bow_corpus).
        2.id2word = Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for debugging and topic printing.
        3.passes = Number of full passes over the training corpus.
        4.num_topics = Number of topics to extract.
        5.per_word_topics = If True, the model also computes, for each word, a list of topics sorted in descending order of likelihood.
        6.alpha = Controls the prior distribution over topic weights in each document. 
        7.eta = Controls the prior distribution over word weights in each topic.
### For additional parameters, please look at: https://radimrehurek.com/gensim/models/ldamodel.html
### Additionally, a grid search was run for this project to find the optimal parameters. The code for it is at the end of this notebook.

In [None]:
# Train the LDA model on the BoW corpus with the parameters described above.
# NOTE(review): no random_state is passed, so topics presumably differ
# between runs - confirm whether reproducibility is required.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=10,
    passes=10,
    alpha=0.31,
    eta=0.90,
    per_word_topics=True,
)


# Results

### The following visualizations of the model were adapted from Selva Prabhakaran:
 https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/


In [None]:
import warnings

import pyLDAvis
try:
    # Older pyLDAvis releases ship the gensim helper as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensimvis
except ImportError:
    # Newer releases (3.3+) renamed the same helper to `pyLDAvis.gensim_models`.
    import pyLDAvis.gensim_models as gensimvis

warnings.filterwarnings("ignore", category=DeprecationWarning)
# Visualize the topics
pyLDAvis.enable_notebook()
# The trained model, the BoW corpus and the dictionary are all that is needed to generate the view
LDAvis_prepared = gensimvis.prepare(lda_model, bow_corpus, dictionary)
# Last expression of the cell: renders the interactive visualisation
LDAvis_prepared

In [None]:
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
from collections import Counter
%matplotlib inline
topics = lda_model.show_topics(formatted=False, num_topics = 10)
data_flat = [w for w_list in bow_corpus for w in w_list]
counter = Counter(data_flat)

out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i , weight, counter[word]])

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])        

# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(5, 2, figsize=(16,15), sharey=True, dpi=160) #setting the number of topics visualised 
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    ax.bar(x='word', height=3000, data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i])
    ax_twin.set_ylim(0, 0.2); ax.set_ylim(0, 3500)
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=30, horizontalalignment= 'center')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)    
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=40, y=1.05)    
plt.show()

In [None]:
from matplotlib import pyplot as plt
# Only WordCloud is imported: also importing wordcloud's STOPWORDS would
# rebind the gensim STOPWORDS imported at the top of the notebook.
from wordcloud import WordCloud
import matplotlib.colors as mcolors
# %matplotlib inline
cols = list(mcolors.TABLEAU_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'


def _single_color(color):
    """ Returns a WordCloud color_func that paints every word in `color`.
        Binding the colour here avoids relying on a global loop variable
        at the moment the cloud is generated.
    """
    return lambda *args, **kwargs: color


# (topic_id, [(word, weight), ...]) for the 10 topics of the model
topics = lda_model.show_topics(formatted=False, num_topics=10)
fig, axes = plt.subplots(5, 2, figsize=(10,10), sharex=True, sharey=True) #setting the number of topics visualised

for i, ax in enumerate(axes.flatten()):
    # One cloud per topic, drawn entirely in that topic's colour
    cloud = WordCloud(background_color='white',
                      width=3000,
                      height=1900,
                      max_words=10,
                      colormap='tab10',
                      color_func=_single_color(cols[i]),
                      prefer_horizontal=1.0)
    # Keyword weights of the topic act as the word frequencies of the cloud
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=600)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
    ax.axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()



# Evaluation 
### The model was evaluated in terms of coherence score (c_v and u_mass)


In [None]:
from gensim.models import CoherenceModel

# c_v coherence of the trained model, computed from the tokenized documents
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# UMass coherence of the same model.
# Note: this overwrites coherence_model_lda / coherence_lda from the c_v cell.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_docs,
    dictionary=dictionary,
    coherence='u_mass',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### The following cell presents the pairwise cosine similarity between two documents. An overall score and the word-level cosine similarity for all possible word combinations are presented in error_analysis.ipynb. 

In [None]:
# Topic distributions of the first two documents of the corpus, requested
# with the probability cut-off set to 0.
doc1, doc2 = (
    lda_model.get_document_topics(bow_corpus[position], minimum_probability=0)
    for position in (0, 1)
)
# Cosine similarity between the two topic distributions
print(cossim(doc1, doc2))

## Optional: Grid Search for optimal parameters. 
### This code was retrieved from https://stackoverflow.com/questions/67899082/i-have-this-code-for-lda-when-i-run-it-i-keep-getting-an-error-which-is-difficul on 17/05/22

In [None]:
import itertools

import tqdm
from gensim.models import CoherenceModel


def compute_coherence_values(corpus, dictionary, k, a, b):
    """ Trains an LDA model and returns its c_v coherence score.
        Arguments: corpus (BoW corpus to train on)
                   dictionary (gensim Dictionary mapping word IDs to words)
                   k (number of topics)
                   a (alpha, the prior over topic weights per document)
                   b (eta, the prior over word weights per topic)
        The number of passes matches the model trained earlier in this notebook.
        The coherence is computed on processed_docs, i.e. on all tokenized
        documents, also when `corpus` is a clipped subset.
    """
    model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=k,
                                       passes=10,
                                       alpha=a,
                                       eta=b)
    coherence_model = CoherenceModel(model=model,
                                     texts=processed_docs,
                                     dictionary=dictionary,
                                     coherence='c_v')
    return coherence_model.get_coherence()


# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(bow_corpus)
# ClippedCorpus hands max_docs to itertools.islice and returns it from
# __len__, so it has to be an integer number of documents.
corpus_sets = [gensim.utils.ClippedCorpus(bow_corpus, int(num_of_docs * 0.75)),
               bow_corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Every (validation corpus, number of topics, alpha, beta) combination, in the
# same order as nested loops over corpus -> topics -> alpha -> beta.
combinations = list(itertools.product(zip(corpus_title, corpus_sets),
                                      topics_range, alpha, beta))

# Can take a long time to run: one LDA model is trained per combination.
# Wrapping the list in tqdm gives the progress bar the real number of steps.
for (title, corpus), k, a, b in tqdm.tqdm(combinations):
    # get the coherence score for the given parameters
    cv = compute_coherence_values(corpus=corpus, dictionary=dictionary,
                                  k=k, a=a, b=b)
    # Save the model results
    model_results['Validation_Set'].append(title)
    model_results['Topics'].append(k)
    model_results['Alpha'].append(a)
    model_results['Beta'].append(b)
    model_results['Coherence'].append(cv)

pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

## End of Notebook