# Topic Modeling and Latent Dirichlet Allocation (LDA) -
## ADA Project Milestone 2

Trying out the following tutorial on the quotebank2016 data set.
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24


In [94]:
import pandas as pd
import numpy as np
import gensim
import nltk
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim import corpora, models
from pprint import pprint


## Part 1: Get & Parse the Data

In [207]:
# TESTING DATA

# Read only the first chunk (up to 1,000,000 quotations, see chunksize) of the
# 2016 Quotebank dump and store it in a DataFrame. Later chunks are never read.
with pd.read_json('Quotebank/quotes-2016.json.bz2', lines=True, compression='bz2', chunksize=1000000) as df_reader:
    # Take a single chunk straight from the reader; None signals an empty file
    test_df = next(iter(df_reader), None)

if test_df is None:
    # Fail with a clear message instead of a NameError further down
    raise ValueError('Quotebank/quotes-2016.json.bz2 contains no rows')

print("Chunk: 0")
print(f'Processing chunk with {len(test_df)} rows')
# Double brackets keep the selection a one-column DataFrame rather than a Series
data_quotes = test_df[['quotation']]
print(data_quotes)



Chunk: 0
Processing chunk with 1000000 rows
                                                quotation
0       [ ] and Chris [ Jones ] were in there a lot an...
1       [ And ] I don't know if we have enough time to...
2       ... I feel like I was champion long before I l...
3       [ I ] mmigration has been and continues to be ...
4       [ It is ] the process of understanding what ki...
...                                                   ...
999995  The views from the existing restaurant here ar...
999996  The Viking way has been working. It's a team t...
999997  the vivid combination of the intriguing, the s...
999998  The Vuelta is a race I really enjoy and one wh...
999999     the way he kept bringing it back to Americans.

[1000000 rows x 1 columns]


In [170]:
# TRAINING DATA

# Load the headline CSV used by the tutorial. on_bad_lines='skip' replaces the
# deprecated error_bad_lines=False: malformed rows are dropped instead of
# aborting the load.
data = pd.read_csv('archive.zip', on_bad_lines='skip', compression='zip')
# .copy() makes data_text an independent frame, so adding a column below does
# not assign into a slice of `data` (which triggers SettingWithCopyWarning).
data_text = data[['headline_text']].copy()
data_text['index'] = data_text.index
documents = data_text
print(documents.head())



  exec(code_obj, self.user_global_ns, self.user_ns)


                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


## Part 2: Tokenization & Lemmatization

In [208]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(2018)
# Fetch the WordNet corpus via NLTK (presumably required by WordNetLemmatizer below).
# This stays the last expression of the cell, so its return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/simonspangenberg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Examples

In [209]:
# Lemmatize: lemmatization is similar to stemming, but its output (the "lemma") is a
# valid word with the same meaning rather than a possibly truncated root stem.
# pos='v' asks the lemmatizer to treat the input as a verb.
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [210]:
# Stemming: reduce a word to a base form (its stem) by stripping affixes.
# The stem is not necessarily a real word, as the table produced below shows.
stemmer = SnowballStemmer('english')
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',
    'traditional', 'reference', 'colonizer', 'plotted',
]
# Stem every example word, preserving the input order
singles = list(map(stemmer.stem, original_words))
# Side-by-side table of each word and its stem (displayed as the cell result)
pd.DataFrame(data={'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [211]:
# Functions we will use

# Created once so that lemmatize_stemming does not build a new lemmatizer for
# every token and does not depend on the `stemmer` variable of the examples cell.
_LEMMATIZER = WordNetLemmatizer()
_STEMMER = SnowballStemmer('english')


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the resulting lemma.

    Args:
        text: One token (word) as a string.

    Returns:
        The English Snowball stem of the WordNet verb lemma of `text`.
    """
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize `text` and return its normalized, filtered tokens.

    Tokens produced by gensim's simple_preprocess are kept only when they
    are longer than three characters and are not gensim stopwords; each
    kept token is passed through lemmatize_stemming.

    Args:
        text: The raw document string.

    Returns:
        A list of processed tokens, in the order they occur in `text`.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

In [212]:
# Direct label lookup of one quotation; avoids building a boolean mask over
# every row just to pick a single cell.
sample = data_quotes.loc[4310, 'quotation']

print('original quotation: ')
# Split on single spaces only, so punctuation stays attached to the words
words = sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(sample))


original quotation: 
['so', 'far,', "they've", 'succeeded.']


 tokenized and lemmatized document: 
['succeed']


In [213]:
# Process all the quotations: apply preprocess() to each one, giving a Series of token lists
processed_docs = data_quotes['quotation'].map(preprocess)
# Preview the first 30 processed quotations
print(processed_docs.head(30))

0     [chris, jone, tamba, hali, get, good, push, ki...
1                         [know, time, sell, communiti]
2     [feel, like, champion, long, leav, cedar, shoa...
3                     [mmigrat, continu, good, economi]
4               [process, understand, kind, hitter, go]
5     [flexibl, term, pay, competit, medic, communit...
6           [leftist, reorgan, capit, accumul, reorgan]
7     [malia, know, go, plan, famili, feel, comfort,...
8     [monterey, support, public, transpar, scienc, ...
9               [silver, come, suit, felt, good, posit]
10    [fail, acknowledg, basic, biolog, differ, fact...
11                          [abort, pill, show, unwork]
12    [busi, schedul, excus, know, game, month, half...
13    [imperfect, report, thought, express, prophet,...
14                 [basic, tenet, fair, honesti, share]
15                                              [hesit]
16    [omen, carri, unwant, pregnanc, term, like, li...
17    [weber, kind, go, raquo, puck, daddi, hour

## Part 3: Create a Dictionary of Words & Filter

In [214]:
# Build a gensim Dictionary from `processed_docs`, then preview its first
# 11 (id, token) entries and print the total number of entries.
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break
print(len(dictionary))

0 allow
1 chris
2 creat
3 get
4 good
5 guy
6 hali
7 jone
8 kind
9 pressur
10 push
110952


In [215]:
# Remove tokens that appear in fewer than 15 documents (no_below=15) or in more
# than 50% of all documents (no_above=0.5), then keep at most the 100000 most
# frequent remaining tokens (keep_n=100000). The dictionary is modified in place.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# Convert every processed document to its bag-of-words form: (token_id, count) pairs
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# Show the bag-of-words vector of document 4310 (the sample inspected earlier)
bow_corpus[4310]

[(3697, 1)]

In [216]:
# Spell out the bag-of-words vector of document 100: id, token text and count
bow_doc_100 = bow_corpus[100]
for token_id, occurrences in bow_doc_100:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], occurrences))

Word 26 ("go") appears 1 time.
Word 366 ("money") appears 1 time.
Word 459 ("team") appears 1 time.
Word 600 ("claim") appears 1 time.
Word 601 ("clue") appears 1 time.
Word 602 ("collect") appears 1 time.
Word 603 ("forc") appears 1 time.
Word 604 ("identifi") appears 1 time.
Word 605 ("inspector") appears 1 time.
Word 606 ("ransom") appears 1 time.
Word 607 ("scooter") appears 1 time.
Word 608 ("special") appears 1 time.
Word 609 ("task") appears 1 time.
Word 610 ("victim") appears 1 time.
Word 611 ("vital") appears 1 time.


## Part 4: Create a TF-IDF Model

TF-IDF stands for Term Frequency–Inverse Document Frequency. It measures how relevant a word is to a document within a corpus: the weight of a word increases proportionally to the number of times it appears in the document, but is offset by how frequently the word occurs across the whole corpus (data set).

In [217]:
# Fit a TF-IDF model on the bag-of-words corpus
tfidf = models.TfidfModel(bow_corpus)

In [218]:
# Apply the fitted TF-IDF model to the whole bag-of-words corpus
corpus_tfidf = tfidf[bow_corpus]

In [219]:
# Pretty-print only the first TF-IDF-weighted document (nothing if the corpus is empty)
first_tfidf_doc = next(iter(corpus_tfidf), None)
if first_tfidf_doc is not None:
    pprint(first_tfidf_doc)

[(0, 0.2932805456631883),
 (1, 0.4179816619033531),
 (2, 0.27942714023179815),
 (3, 0.2373521144919461),
 (4, 0.18662502799451475),
 (5, 0.25757101461800186),
 (6, 0.4669767314584352),
 (7, 0.25032592036130136),
 (8, 0.3374073392312525),
 (9, 0.3303279030832013)]


## Part 5: LDA

Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

Topic modeling is a statistical modeling technique for discovering the abstract ‘topics’ that occur in a collection of documents. The idea is to perform unsupervised classification on the documents, which finds natural groups of documents that share topics. Topic modeling can help answer the following questions:

    What is the topic/main idea of the document?
    Given a document, can we find another document with a similar topic?
    How do the topics in a field change over time?

Latent Dirichlet allocation (LDA) is one of the most popular methods for performing topic modeling. Each document consists of various words, and each topic can be associated with some words. The aim of LDA is to find the topics a document belongs to on the basis of the words it contains. It assumes that documents with similar topics will use a similar group of words. This allows each document to be represented as a probability distribution over latent topics, and each topic as a probability distribution over words.

### Using Bag of Words

In [220]:
# Train a 10-topic LDA model on the raw bag-of-words corpus (2 passes, 2 workers).
# NOTE(review): the TF-IDF model below is trained with workers=4 — confirm the difference is intended.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [221]:
# List every topic of the bag-of-words LDA model with its weighted terms
for topic_id, weighted_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, weighted_terms))

Topic: 0 
Words: 0.101*"year" + 0.020*"time" + 0.017*"month" + 0.016*"start" + 0.016*"week" + 0.014*"final" + 0.014*"second" + 0.013*"come" + 0.013*"day" + 0.012*"half"
Topic: 1 
Words: 0.013*"communiti" + 0.012*"work" + 0.011*"busi" + 0.011*"continu" + 0.010*"help" + 0.010*"need" + 0.009*"support" + 0.008*"govern" + 0.008*"develop" + 0.008*"school"
Topic: 2 
Words: 0.023*"state" + 0.017*"trump" + 0.015*"parti" + 0.015*"presid" + 0.015*"polit" + 0.014*"support" + 0.013*"elect" + 0.013*"countri" + 0.012*"vote" + 0.011*"american"
Topic: 3 
Words: 0.019*"chang" + 0.019*"world" + 0.014*"look" + 0.013*"forward" + 0.013*"women" + 0.011*"citi" + 0.010*"histori" + 0.009*"work" + 0.008*"event" + 0.008*"film"
Topic: 4 
Words: 0.013*"case" + 0.011*"offic" + 0.011*"care" + 0.010*"polic" + 0.009*"health" + 0.008*"public" + 0.008*"protect" + 0.008*"forc" + 0.008*"inform" + 0.008*"take"
Topic: 5 
Words: 0.025*"work" + 0.020*"money" + 0.016*"time" + 0.016*"hard" + 0.015*"hand" + 0.014*"go" + 0.012*"ge

### Using TF-IDF

In [222]:
# Train a 10-topic LDA model on the TF-IDF-weighted corpus (2 passes, 4 workers).
# NOTE(review): the bag-of-words model above is trained with workers=2 — confirm the difference is intended.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [223]:
# List every topic of the TF-IDF LDA model with its weighted terms
for topic_id, weighted_terms in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, weighted_terms))

Topic: 0 Word: 0.006*"squad" + 0.005*"struggl" + 0.005*"nice" + 0.004*"go" + 0.004*"like" + 0.004*"hand" + 0.004*"nervous" + 0.004*"christma" + 0.004*"light" + 0.004*"excus"
Topic: 1 Word: 0.017*"play" + 0.015*"go" + 0.014*"game" + 0.014*"think" + 0.014*"good" + 0.012*"know" + 0.011*"team" + 0.010*"like" + 0.010*"want" + 0.010*"thing"
Topic: 2 Word: 0.009*"peopl" + 0.008*"music" + 0.007*"stori" + 0.007*"like" + 0.007*"write" + 0.006*"film" + 0.006*"song" + 0.006*"tell" + 0.006*"think" + 0.006*"say"
Topic: 3 Word: 0.006*"go" + 0.005*"like" + 0.005*"peopl" + 0.005*"come" + 0.005*"walk" + 0.005*"time" + 0.005*"open" + 0.004*"think" + 0.004*"door" + 0.004*"road"
Topic: 4 Word: 0.014*"trump" + 0.011*"vote" + 0.009*"elect" + 0.009*"parti" + 0.008*"presid" + 0.007*"peopl" + 0.007*"donald" + 0.007*"clinton" + 0.006*"hillari" + 0.006*"state"
Topic: 5 Word: 0.007*"money" + 0.005*"peopl" + 0.005*"wear" + 0.005*"like" + 0.005*"go" + 0.004*"talk" + 0.004*"cancer" + 0.004*"think" + 0.004*"treat" + 0

## Part 6: Performance Evaluation

In [224]:
# Preview the first three processed quotations. Document 0 is the one scored by
# both models below; it should be classified in a sports topic.
print(processed_docs[:3])

0    [chris, jone, tamba, hali, get, good, push, ki...
1                        [know, time, sell, communiti]
2    [feel, like, champion, long, leav, cedar, shoa...
Name: quotation, dtype: object


### On LDA BOW Model

In [225]:
# Rank the topics the bag-of-words LDA model assigns to document 0, highest score first
bow_ranking = sorted(lda_model[bow_corpus[0]], key=lambda pair: pair[1], reverse=True)
for index, score in bow_ranking:
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.7752967476844788	 
Topic: 0.038*"good" + 0.038*"play" + 0.026*"game" + 0.025*"team" + 0.022*"think" + 0.021*"go" + 0.015*"great" + 0.014*"player" + 0.013*"better" + 0.012*"come"

Score: 0.15195025503635406	 
Topic: 0.019*"chang" + 0.019*"world" + 0.014*"look" + 0.013*"forward" + 0.013*"women" + 0.011*"citi" + 0.010*"histori" + 0.009*"work" + 0.008*"event" + 0.008*"film"


### On LDA TF-IDF Model

In [226]:
# Rank the topics the TF-IDF LDA model assigns to document 0, highest score first.
# NOTE(review): the model was trained on corpus_tfidf but is queried here with the raw
# bag-of-words vector — confirm this is intended.
tfidf_ranking = sorted(lda_model_tfidf[bow_corpus[0]], key=lambda pair: pair[1], reverse=True)
for index, score in tfidf_ranking:
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.6041104197502136	 
Topic: 0.017*"play" + 0.015*"go" + 0.014*"game" + 0.014*"think" + 0.014*"good" + 0.012*"know" + 0.011*"team" + 0.010*"like" + 0.010*"want" + 0.010*"thing"

Score: 0.32311466336250305	 
Topic: 0.009*"peopl" + 0.008*"music" + 0.007*"stori" + 0.007*"like" + 0.007*"write" + 0.006*"film" + 0.006*"song" + 0.006*"tell" + 0.006*"think" + 0.006*"say"


## Part 7: Testing with Quotebank Data Set

In [None]:
# Pick the quotation stored under index label 12 as the example to classify
quotebank_example = data_quotes.loc[12, 'quotation']
print(quotebank_example)

In [None]:
# Preprocess the example quotation, convert it to a bag-of-words vector and
# list the topics of the bag-of-words LDA model for it, highest score first.
bow_vector = dictionary.doc2bow(preprocess(quotebank_example))
example_ranking = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in example_ranking:
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))