# Topic Modeling with LDA in Python

In [None]:
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
# https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb


# 1 Clean the documents
# 2 Create BoW
# 3 Calculate TF-IDF
# 4 Build LDA on top of documents represented as BoW    vectors
# 5 Build LDA on top of documents represented as TF-IDF vectors


In [None]:
import pandas as pd
import numpy as np
np.random.seed(42)
from pprint import pprint

import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS


import nltk
# to update the package
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *


# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))


In [None]:
def load_dfs(word_list, date):
    """Load one CSV per search word and stack them into a single DataFrame.

    Each file is read from ``<date>/<word>.csv`` and receives a ``word``
    column holding the word it was loaded for.

    Parameters
    ----------
    word_list : iterable of str
        Base names (without the ``.csv`` suffix) of the files to read.
    date : str
        Directory that contains the CSV files.

    Returns
    -------
    pandas.DataFrame
        Rows of all files, in ``word_list`` order, with a fresh 0..n-1 index.
        An empty frame is returned when ``word_list`` is empty.
    """
    # A forward slash is accepted as a separator on every platform; the old
    # '\/' literal was an invalid escape that put a stray backslash in the path.
    frames = [
        pd.read_csv('{}/{}.csv'.format(date, word)).assign(word=word)
        for word in word_list
    ]
    if not frames:
        # pd.concat raises on an empty list, so handle "nothing to load" here.
        return pd.DataFrame()
    # One concat instead of DataFrame.append inside the loop: append copied
    # the accumulated frame on every iteration and was removed in pandas 2.0.
    return pd.concat(frames, ignore_index=True)


def lemmatize_stemming(text):
    """Return the stem of the verb lemma of a single token.

    The token is lemmatized as a verb (``pos='v'``) with WordNet and the
    resulting lemma is handed to the module-level ``stemmer``, which must be
    defined before this function is called.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)


def preprocess(text):
    """Turn a raw text into a list of normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess``; tokens that are
    gensim stopwords or have three characters or fewer are dropped, and every
    remaining token is passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

In [None]:
# Run configuration.
date               = '2019-10-15'   # directory the CSV files are read from
example            = 3450           # row used as the running example below
word_list          = ['furniture', 'homedecor', 'interiordesign']   # one CSV per word

# Keep only the tweet text. Missing texts become empty strings (not the
# integer 0), so every value stays a str for tokenizing and .split() later.
# fillna also returns a new frame, so adding the 'index' column below does not
# write into a selection of the loaded frame.
df_tweets          = load_dfs(word_list, date)[['tweet.text']].fillna('')
df_tweets['index'] = df_tweets.index


print('Number of tweets:',len(df_tweets))
df_tweets.head()

In [None]:
# Tokenize, remove stopwords, remove short words, lemmatize, stem

# `stemmer` is read as a global by lemmatize_stemming, so it must exist before
# preprocess is mapped over the tweets.
stemmer = SnowballStemmer('english')
processed_docs = df_tweets['tweet.text'].map(preprocess)


# Show the example tweet raw, split on spaces, and after preprocessing.
example_rows = df_tweets[df_tweets['index'] == example]
print('original document: ')
print(example_rows['tweet.text'].values)
print('\n\n words in original document:')
words = example_rows.values[0][0].split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(processed_docs[example])

# Build the token <-> id mapping and prune it with filter_extremes
# (see gensim's Dictionary.filter_extremes for the meaning of the thresholds).
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Print entries until the first id above 10 is encountered.
print('\n\nDictionary:')
for token_id, token in dictionary.iteritems():
    if token_id > 10:
        break
    print(token_id, token)

# Bag of words in the dataset

In [None]:
# One bag-of-words vector per preprocessed tweet.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

print('\n\n original document: ')
print(df_tweets[df_tweets['index'] == example]['tweet.text'].values)
print('\n\n Bag of words for example:')
# Each entry is a (token id, count) pair; the dictionary maps the id back to the token.
for token_id, count in bow_corpus[example]:
    print('Word {} ("{}") appears {} time.'.format(token_id, dictionary[token_id], count))

# TF-IDF

In [None]:
# Fit TF-IDF on the bag-of-words corpus and wrap the corpus so that documents
# are re-weighted when they are accessed.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

print('\n\n original document: ')
print(df_tweets[df_tweets['index'] == example]['tweet.text'].values)
print('\n\n TF-IDF vector for example:')
# Fetch the example vector once: indexing corpus_tfidf applies the transform
# again on every access, which the old loop did several times per word.
example_tfidf = corpus_tfidf[example]
for token_id, weight in example_tfidf:
    print('Word {} ("{}") has weight {} '.format(token_id, dictionary[token_id], weight))

# Topic classification

In [None]:
n_topics = 5

# Both models are trained with identical settings; only the input corpus differs.
lda_settings = dict(num_topics=n_topics, id2word=dictionary, passes=2, workers=4)

lda_model       = gensim.models.LdaMulticore(bow_corpus, **lda_settings)
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, **lda_settings)

# Results for LDA using Bag of Words

In [None]:
# List every topic of the bag-of-words model.
print('LIST OF TOPICS')
for topic_id, topic_words in lda_model.print_topics(-1):
    print('Topic: {} Words: {}'.format(topic_id, topic_words))
    print()


print('\n\nEXAMPLE')
print('original document: ')
print(df_tweets[df_tweets['index'] == example]['tweet.text'].values)
print()
# Topics of the example tweet, highest score first.
example_topics = sorted(lda_model[bow_corpus[example]], key=lambda pair: pair[1], reverse=True)
for topic_id, score in example_topics:
    # NOTE(review): print_topic's second argument is the number of words to
    # show; n_topics is reused for it here - confirm this is intended.
    print("Score: {}\t \nTopic: {}".format(score, lda_model.print_topic(topic_id, n_topics)))

# Results for LDA using TF-IDF

In [None]:
# List every topic of the TF-IDF model. The first element of each pair is the
# topic id (the old label called it "Score"), so it is labelled the same way
# as in the bag-of-words results.
print('LIST OF TOPICS')
for index, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Words: {}'.format(index, topic))
    print()

print('\n\nEXAMPLE')
print('original document: ')
print(df_tweets[df_tweets['index'] == example]['tweet.text'].values)
print()
# The model was trained on TF-IDF weights, so the example is queried with its
# TF-IDF vector rather than its raw bag-of-words counts.
example_topics = sorted(lda_model_tfidf[corpus_tfidf[example]], key=lambda tup: tup[1], reverse=True)
for index, score in example_topics:
    # NOTE(review): print_topic's second argument is the number of words to
    # show; n_topics is reused for it here - confirm this is intended.
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, n_topics)))

# Testing models on unseen document

In [None]:
# Score a document that was not part of the training data. It goes through the
# same preprocessing and dictionary as the training tweets.
unseen_document = 'Just bought a new lamp for my living room, great design!'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

print('LDA on BoW')
for index, score in sorted(lda_model[bow_vector], key=lambda tup: tup[1], reverse=True):
    # NOTE(review): print_topic's second argument is the number of words to
    # show; n_topics is reused for it here - confirm this is intended.
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, n_topics)))

print('\n\nLDA on TF-IDF')
# The TF-IDF model was trained on TF-IDF weights, so the unseen document is
# re-weighted with the fitted tfidf model before it is scored.
tfidf_vector = tfidf[bow_vector]
for index, score in sorted(lda_model_tfidf[tfidf_vector], key=lambda tup: tup[1], reverse=True):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, n_topics)))