# Topic Modeling Technique - Non-Negative Matrix Factorization (NMF)
## Konstantina Andronikou 

## This notebook is adapted from the tutorial by Piek Vossen:
https://github.com/cltl/ba-text-mining/blob/master/lab_sessions/lab6/Lab6.2-Topic-modeling-gensim.ipynb

In [None]:
#Importing all relevant packages
import gensim
import nltk
import numpy as np
import pandas as pd
# nltk.download('wordnet')
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim.models import Nmf
from nltk.stem.porter import *
np.random.seed(2018)

In [None]:
# Load the pre-processed data written by pre_processing.ipynb.
# The file is tab-separated, is read without a header row (so the columns are
# numbered from 0), and every column is kept as a string.
documents = pd.read_csv(
    'data/Input_for_topic_model.tsv',
    header=None,
    delimiter='\t',
    encoding='latin1',
    dtype=str,
)

In [None]:
# Build the working dataframe: a single 'text' column taken from column 0 of
# the loaded file. Missing values become empty strings so that every entry is
# a str before tokenization.
full_train = documents[0].fillna('').astype(str).to_frame(name='text')

# From here on `documents` refers to the cleaned one-column dataframe.
documents = full_train

In [None]:
#Pre-processing steps 
_LEMMATIZER = None  # shared WordNetLemmatizer, created on first use


def lemmatize_stemming(text):
    """ Lemmatizes a single token with the WordNet lemmatizer.
        Despite the function name, no stemming is applied: the token is only
        passed to WordNetLemmatizer.lemmatize, without a part-of-speech argument.
        Argument: text (a single token, as passed in by preprocess)
        Returns: the value returned by WordNetLemmatizer.lemmatize for that token
    """
    global _LEMMATIZER
    if _LEMMATIZER is None:
        # Build the lemmatizer once and reuse it, instead of creating a new
        # instance for every token of every document.
        _LEMMATIZER = WordNetLemmatizer()
    return _LEMMATIZER.lemmatize(text)
def preprocess(text):
    """ Tokenizes one document and returns its cleaned tokens as a list.
        The text is split with gensim's simple_preprocess; a token is kept only
        if it is longer than 3 characters and is not in gensim's STOPWORDS, and
        every kept token is passed through lemmatize_stemming.
        Argument: text (the text of a single document)
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text)
    return [
        lemmatize_stemming(tok)
        for tok in tokens
        if len(tok) > 3 and tok not in stopwords
    ]

In [None]:
# Pre-processing the data: apply preprocess to every entry of the 'text'
# column, giving one list of cleaned tokens per document.
processed_docs = documents['text'].map(preprocess)

In [None]:
from itertools import islice

# Build the gensim Dictionary over the tokenized documents; it assigns an
# integer id to every distinct token.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Sanity check: show the first 10 entries of the dictionary. The previous loop
# walked over these entries without doing anything with them.
for token_id, token in islice(dictionary.items(), 10):
    print(token_id, token)

In [None]:
# Prune the vocabulary: drop tokens that appear in fewer than 15 documents or
# in more than 50% of all documents, then keep at most the 100000 most
# frequent of the remaining tokens.
dictionary.filter_extremes(
    no_below=15,
    no_above=0.5,
    keep_n=100000,
)

In [None]:
# Bag-of-words corpus: every tokenized document is converted by
# dictionary.doc2bow into its BoW vector representation, using the ids of the
# filtered dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

## Training the NMF model with the following parameters:
1. `bow_corpus` = the corpus data as bag-of-words (BoW) vectors.
2. `id2word` = mapping from word IDs to words. It is used to determine the vocabulary size, as well as for debugging and topic printing.
3. `passes` = number of full passes over the training corpus.
4. `num_topics` = number of topics to extract.
5. `minimum_probability` = probability threshold; topics with smaller probabilities are filtered out.

### For additional parameters, please see: https://radimrehurek.com/gensim/models/nmf.html

In [None]:
 # Train NMF with 10 topics and 10 passes over the bag-of-words corpus.
 # minimum_probability is a numeric threshold, not a switch: passing True
 # is treated as 1.0, so the library default of 0.01 is spelled out instead.
 nmf = gensim.models.nmf.Nmf(
     bow_corpus,
     id2word=dictionary,
     passes=10,
     num_topics=10,
     minimum_probability=0.01,
 )

# Results 

### The following visualizations of the model were adapted from Selva Prabhakaran:
 https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/


In [None]:
from collections import Counter
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
topics = nmf.show_topics(formatted=False, num_topics = 10)
data_flat = [w for w_list in bow_corpus for w in w_list]
counter = Counter(data_flat)

out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i , weight, counter[word]])

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])        

# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(5, 2, figsize=(16,22), sharey=True, dpi=160) #setting the number of topics visualised  
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    ax.bar(x='word', height=3000, data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i])
    ax_twin.set_ylim(0, 0.2); ax.set_ylim(0, 8500)
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=30, horizontalalignment= 'center')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)    
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=40, y=1.05)    
plt.show()

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
%matplotlib inline
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(background_color='white',
                  width=3000,
                  height=1900,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = nmf.show_topics(formatted=False)

fig, axes = plt.subplots(5, 2, figsize=(10,10), sharex=True, sharey=True) #setting the number of topics visualised 

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=600)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()


# Evaluation 
### The model was evaluated in terms of coherence score (c_v and u_mass) 


In [None]:
from gensim.models import CoherenceModel

# Coherence of the NMF topics under the c_v measure, computed from the
# tokenized documents and the filtered dictionary.
coherence_model_lda = CoherenceModel(
    model=nmf,
    texts=processed_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Coherence of the NMF topics under the UMass measure, computed from the
# tokenized documents and the filtered dictionary.
coherence_model_lda = CoherenceModel(
    model=nmf,
    texts=processed_docs,
    dictionary=dictionary,
    coherence="u_mass",
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### The following cell computes the cosine similarity between the topic distributions of the first two documents. An overall score and word-level cosine similarities for all possible word combinations are presented in error_analysis.ipynb. 

In [None]:
from gensim.matutils import cossim

# Topic distributions of the first two documents of the corpus.
# minimum_probability=0 is passed explicitly so that the threshold stored on
# the model is not used for these two lookups.
doc1 = nmf.get_document_topics(bow_corpus[0], minimum_probability=0)  # document 0
doc2 = nmf.get_document_topics(bow_corpus[1], minimum_probability=0)  # document 1

# Cosine similarity between the two documents' topic vectors.
print(cossim(doc1, doc2))

## End of Notebook