# Topic detection

Dataset used: quotations published in the *New York Times* in 2019.

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules (e.g. the project's `src` package) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pyLDAvis
import pyLDAvis.gensim_models

from src.data_loader import download_test_data
from src.df_factory import (add_col_qid, create_df_joined_quotes,
                            create_df_speaker, create_df_test)
from src.sentiment_analysis import add_col_compound_score
from src.text_processing import (add_bigrams, add_col_bow, add_col_tokens,
                                 create_dictionary, get_lda_model)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/quentin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  from ._conv import register_converters as _register_converters
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [3]:
# Fetch the test dataset. In the recorded run the helper reported that
# quotes-2019-nytimes.json.bz2 already existed.
# NOTE(review): presumably the download is skipped when the file is present —
# confirm in src.data_loader.
download_test_data()

Filename quotes-2019-nytimes.json.bz2 already exists


In [4]:
# Build the test dataframe of quotations (indexed by quoteID in the recorded output).
df = create_df_test()

# Add the `qid` column. The helper is called for its side effect on `df`
# (its return value is unused). In the displayed rows `qid` matches the first
# entry of `qids` and is empty when `qids` is empty.
add_col_qid(df)

# Display the frame (pandas truncates long frames in the rendered output).
df

100%|██████████| 207527/207527 [00:00<00:00, 1412395.87it/s]


Unnamed: 0_level_0,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,qid
quoteID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-04-17-024782,"It is not a low-income immigration,",James Fisher,"[Q16213953, Q20707104, Q43143598, Q58886302, Q...",2019-04-17 13:31:18,1,"[[James Fisher, 0.7475], [None, 0.2525]]",[https://www.nytimes.com/2019/04/17/realestate...,E,Q16213953
2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,[Q105756],2019-04-02 14:58:33,2,"[[John Updike, 0.5856], [None, 0.4144]]",[https://www.nytimes.com/2019/04/02/opinion/vl...,E,Q105756
2019-05-09-055187,It makes it much more difficult for him to mak...,,[],2019-05-09 18:11:29,1,"[[None, 0.6493], [President Bill Clinton, 0.27...",[http://mobile.nytimes.com/2019/05/09/world/as...,E,
2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,[Q1855840],2019-10-31 16:45:15,3,"[[Xavier Becerra, 0.9065], [None, 0.0909], [St...",[http://www.nytimes.com/2019/10/31/technology/...,E,Q1855840
2019-01-04-001792,A Pile of Leaves.,,[],2019-01-04 10:00:07,1,"[[None, 0.8737], [Jason Fulford, 0.1263]]",[https://www.nytimes.com/2019/01/04/books/revi...,E,
...,...,...,...,...,...,...,...,...,...
2019-07-11-031126,I put in one of those wire shelves that you bu...,,[],2019-07-11 04:00:15,1,"[[None, 0.8794], [Marissa Paternoster, 0.1206]]",[https://www.nytimes.com/2019/07/11/style/band...,E,
2019-06-25-032409,I used to go there religiously and buy anythin...,Carlos Santana,"[Q2367902, Q5042623, Q819016]",2019-06-25 19:52:47,2,"[[Carlos Santana, 0.9484], [None, 0.04], [Rick...",[https://www.nytimes.com/2019/06/25/arts/music...,E,Q2367902
2019-05-02-043481,I was the welterweight boxing champ at St. Mik...,Leonard Patrick Kelly,[Q7143620],2019-05-02 21:41:51,3,"[[Leonard Patrick Kelly, 0.5743], [None, 0.388...",[http://nytimes.com/2019/05/02/obituaries/red-...,E,Q7143620
2019-01-19-026354,In terms of changing Taiwan's situation for th...,,[],2019-01-19 09:31:37,1,"[[None, 0.7822], [Tsai Ing-wen, 0.1841], [Xi J...",[https://www.nytimes.com/2019/01/19/world/asia...,E,


In [5]:
# Wikidata identifiers (QIDs) of the four politicians studied in this notebook,
# keyed by speaker name. The values are matched against the `qid` column.
qids = {
    'Donald Trump': 'Q22686',
    'Joe Biden': 'Q6279',
    'Barack Obama': 'Q76',
    'Hillary Clinton': 'Q6294',
}

# Create Trump's dataframe: the quotations of `df` attributed to his QID
# (every displayed row has qid == 'Q22686').
df_trump = create_df_speaker(df, qids['Donald Trump'])
df_trump

Unnamed: 0_level_0,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,qid
quoteID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-02-15-009152,Asylum seekers in Mexico face a heightened ris...,President Donald Trump,[Q22686],2019-02-15 03:20:00,5,"[[President Donald Trump, 0.7752], [None, 0.22...",[http://www.abajournal.com/news/article/lawsui...,E,Q22686
2019-05-13-016076,"Don't forget they're a member of NATO, and a v...",President Donald Trump,[Q22686],2019-05-13 00:00:00,68,"[[President Donald Trump, 0.6683], [None, 0.23...",[http://kazu.org/post/trump-greets-hungarys-ha...,E,Q22686
2019-08-25-016871,I think it's a positive. Other people agree wi...,President Donald Trump,[Q22686],2019-08-25 23:19:22,7,"[[President Donald Trump, 0.6994], [None, 0.15...",[http://www.nytimes.com/2019/08/25/world/europ...,E,Q22686
2019-08-31-024043,"In other words, they're running badly and they...",President Donald Trump,[Q22686],2019-08-31 00:21:27,2,"[[President Donald Trump, 0.7363], [None, 0.26...",[http://www.nytimes.com/2019/08/30/us/politics...,E,Q22686
2019-05-09-051472,"In typical fashion, as soon as Trump Park was ...",President Trump,[Q22686],2019-05-09 09:25:21,1,"[[President Trump, 0.6722], [None, 0.3037], [D...",[https://www.nytimes.com/2019/05/09/nyregion/n...,E,Q22686
...,...,...,...,...,...,...,...,...,...
2019-11-19-032738,"I had a feeling that it was coming because, yo...",President Trump,[Q22686],2019-11-19 22:54:47,1,"[[President Trump, 0.7344], [None, 0.2656]]",[http://nytimes.com/2019/11/19/us/navy-seals-e...,E,Q22686
2019-04-04-038327,"I have heard that she was nasty to me, but she...",President Donald Trump,[Q22686],2019-04-04 00:00:00,42,"[[President Donald Trump, 0.5646], [Donald Tru...",[http://feeds.foxnews.com/~r/foxnews/politics/...,E,Q22686
2019-02-06-052048,investments in the cutting-edge industries of ...,President Donald Trump,[Q22686],2019-02-06 22:34:01,40,"[[President Donald Trump, 0.5853], [None, 0.23...",[https://defence.pk/pdf/threads/trump-preparin...,E,Q22686
2019-02-14-083866,showing us tremendous respect.,President Trump,[Q22686],2019-02-14 07:47:21,3,"[[President Trump, 0.5657], [None, 0.3582], [L...",[https://www.actionforex.com/action-insight/ma...,E,Q22686


In [6]:
# Add the sentiment `compound_score` column to Trump's quotations. The helper
# is called for its side effect on `df_trump`; the new column shows up in the
# displayed frame.
# NOTE(review): presumably the VADER compound score (the vader_lexicon package
# is fetched at import time) — confirm in src.sentiment_analysis.
add_col_compound_score(df_trump)
df_trump

100%|██████████| 4557/4557 [00:00<00:00, 8293.12it/s]


Unnamed: 0_level_0,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,qid,compound_score
quoteID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-02-15-009152,Asylum seekers in Mexico face a heightened ris...,President Donald Trump,[Q22686],2019-02-15 03:20:00,5,"[[President Donald Trump, 0.7752], [None, 0.22...",[http://www.abajournal.com/news/article/lawsui...,E,Q22686,-0.9300
2019-05-13-016076,"Don't forget they're a member of NATO, and a v...",President Donald Trump,[Q22686],2019-05-13 00:00:00,68,"[[President Donald Trump, 0.6683], [None, 0.23...",[http://kazu.org/post/trump-greets-hungarys-ha...,E,Q22686,0.5939
2019-08-25-016871,I think it's a positive. Other people agree wi...,President Donald Trump,[Q22686],2019-08-25 23:19:22,7,"[[President Donald Trump, 0.6994], [None, 0.15...",[http://www.nytimes.com/2019/08/25/world/europ...,E,Q22686,0.8225
2019-08-31-024043,"In other words, they're running badly and they...",President Donald Trump,[Q22686],2019-08-31 00:21:27,2,"[[President Donald Trump, 0.7363], [None, 0.26...",[http://www.nytimes.com/2019/08/30/us/politics...,E,Q22686,-0.9100
2019-05-09-051472,"In typical fashion, as soon as Trump Park was ...",President Trump,[Q22686],2019-05-09 09:25:21,1,"[[President Trump, 0.6722], [None, 0.3037], [D...",[https://www.nytimes.com/2019/05/09/nyregion/n...,E,Q22686,-0.3400
...,...,...,...,...,...,...,...,...,...,...
2019-11-19-032738,"I had a feeling that it was coming because, yo...",President Trump,[Q22686],2019-11-19 22:54:47,1,"[[President Trump, 0.7344], [None, 0.2656]]",[http://nytimes.com/2019/11/19/us/navy-seals-e...,E,Q22686,0.1280
2019-04-04-038327,"I have heard that she was nasty to me, but she...",President Donald Trump,[Q22686],2019-04-04 00:00:00,42,"[[President Donald Trump, 0.5646], [Donald Tru...",[http://feeds.foxnews.com/~r/foxnews/politics/...,E,Q22686,-0.3182
2019-02-06-052048,investments in the cutting-edge industries of ...,President Donald Trump,[Q22686],2019-02-06 22:34:01,40,"[[President Donald Trump, 0.5853], [None, 0.23...",[https://defence.pk/pdf/threads/trump-preparin...,E,Q22686,0.0000
2019-02-14-083866,showing us tremendous respect.,President Trump,[Q22686],2019-02-14 07:47:21,3,"[[President Trump, 0.5657], [None, 0.3582], [L...",[https://www.actionforex.com/action-insight/ma...,E,Q22686,0.4767


In [7]:
# Create a dataframe with the quotations joined by speaker: the recorded output
# has one row per `qid`, with that speaker's quotations merged into a single
# `quotation` string.
df_joined_quotes = create_df_joined_quotes(df)
df_joined_quotes

100%|██████████| 26162/26162 [00:00<00:00, 86366.76it/s]


Unnamed: 0,qid,quotation
0,Q1000491,"No way will something like that happen again,"
1,Q100071,"It was like, `I did everything right, and I've..."
2,Q1000791,"I had 45 turkeys on my front lawn, I walked st..."
3,Q1000820,I use them to get my frustrations out by telli...
4,Q1001244,"Max, I'm going to stop it. Max, you're getting..."
...,...,...
26157,Q996054,"The one-man, one-gun, one-bullet is not what o..."
26158,Q997091,She was ready to move to a nice place that was...
26159,Q997312,private reserves in which thought has develope...
26160,Q997380,He was a hero-like horse for me. I was sincere...


In [8]:
# Select the rows of the four politicians listed in `qids`.
# The explicit .copy() makes `df_politicians` an independent frame: later cells
# add columns to it in place (tokens, bigrams, bag-of-words), and doing that on
# a boolean-mask selection of `df_joined_quotes` is the situation pandas flags
# with SettingWithCopyWarning.
df_politicians = df_joined_quotes[
    df_joined_quotes.qid.isin(qids.values())
].copy()
df_politicians

Unnamed: 0,qid,quotation
8874,Q22686,Asylum seekers in Mexico face a heightened ris...
22394,Q6279,"He's not a fossil fuel executive, It's disgust..."
22442,Q6294,What is it you're finding out? Where does it l...
24891,Q76,"Tell her to keep her mouth shut, Elizabeth was..."


In [9]:
# Number of characters in each politician's joined quotation string
df_politicians["quotation"].map(len)

8874     376056
22394     61688
22442      7239
24891     15955
Name: quotation, dtype: int64

In [10]:
# Tokenize the joined quotations. The helper is called for its side effect:
# the recorded output shows a new `tokens` column holding a list of tokens per
# speaker. The displayed tokens look lemmatised with stop words removed
# (e.g. "seekers" -> "seeker") — confirm in src.text_processing.
# This was the slowest step of the recorded run (about 20 s for the four rows).
add_col_tokens(df_politicians)
df_politicians

100%|██████████| 4/4 [00:20<00:00,  5.14s/it]


Unnamed: 0,qid,quotation,tokens
8874,Q22686,Asylum seekers in Mexico face a heightened ris...,"[asylum, seeker, Mexico, face, heightened, ris..."
22394,Q6279,"He's not a fossil fuel executive, It's disgust...","[fossil, fuel, executive, disgusting, ask, que..."
22442,Q6294,What is it you're finding out? Where does it l...,"[find, lead, lead, conclusion, president, comm..."
24891,Q76,"Tell her to keep her mouth shut, Elizabeth was...","[tell, mouth, shut, Elizabeth, sound, alarm, p..."


In [11]:
# Number of tokens per politician
df_politicians["tokens"].map(len)

8874     26676
22394     4258
22442      508
24891     1150
Name: tokens, dtype: int64

In [12]:
# Add bigrams. The helper is called for its side effect on `df_politicians`:
# no new column appears in the output, but the `tokens` lists get longer
# (e.g. 26676 -> 27059 for the first row), so the bigrams appear to be appended
# to the existing token lists.
# NOTE(review): if so, re-running this cell would presumably append bigrams
# again — confirm in src.text_processing.
add_bigrams(df_politicians)
df_politicians

100%|██████████| 4/4 [00:00<00:00, 93.69it/s]


Unnamed: 0,qid,quotation,tokens
8874,Q22686,Asylum seekers in Mexico face a heightened ris...,"[asylum, seeker, Mexico, face, heightened, ris..."
22394,Q6279,"He's not a fossil fuel executive, It's disgust...","[fossil, fuel, executive, disgusting, ask, que..."
22442,Q6294,What is it you're finding out? Where does it l...,"[find, lead, lead, conclusion, president, comm..."
24891,Q76,"Tell her to keep her mouth shut, Elizabeth was...","[tell, mouth, shut, Elizabeth, sound, alarm, p..."


In [13]:
# Number of tokens per politician once bigrams have been added
df_politicians["tokens"].map(len)

8874     27059
22394     4310
22442      514
24891     1152
Name: tokens, dtype: int64

In [14]:
# Create the token dictionary from the politicians' token lists
# (5409 unique tokens in the recorded run).
# NOTE(review): min_wordcount=0 and max_freq=1 presumably disable the
# rare-token and frequent-token filters — confirm against create_dictionary.
dictionary = create_dictionary(df_politicians, min_wordcount=0, max_freq=1)
# Printed (not left as the last expression) so the summary is shown in addition
# to the dataframe displayed at the end of the cell.
print(dictionary)

# Add Bag-of-words column. The helper is called for its side effect: the
# recorded output shows a new `bow` column holding what look like
# (token id, count) pairs for each speaker.
add_col_bow(df_politicians, dictionary)
df_politicians

Dictionary(5409 unique tokens: ['ABT', 'AIDS', 'ALABAMA', 'AMERICA', 'AMERICAN']...)


100%|██████████| 4/4 [00:00<00:00, 404.50it/s]


Unnamed: 0,qid,quotation,tokens,bow
8874,Q22686,Asylum seekers in Mexico face a heightened ris...,"[asylum, seeker, Mexico, face, heightened, ris...","[(0, 1), (1, 1), (2, 1), (3, 5), (4, 1), (5, 1..."
22394,Q6279,"He's not a fossil fuel executive, It's disgust...","[fossil, fuel, executive, disgusting, ask, que...","[(12, 4), (21, 1), (32, 3), (33, 17), (34, 1),..."
22442,Q6294,What is it you're finding out? Where does it l...,"[find, lead, lead, conclusion, president, comm...","[(33, 1), (117, 1), (134, 1), (173, 1), (174, ..."
24891,Q76,"Tell her to keep her mouth shut, Elizabeth was...","[tell, mouth, shut, Elizabeth, sound, alarm, p...","[(20, 1), (33, 1), (34, 3), (35, 1), (84, 2), ..."


In [15]:
# Create the LDA model from `df_politicians` and `dictionary`, then show the
# four top words of ten of its topics.
# NOTE(review): the recorded output lists topic ids as high as 84 whose top
# words are nearly identical ("people", "think", "know", ...), although the
# corpus holds only four documents. Consider a much smaller number of topics
# and a fixed random seed if get_lda_model exposes them — TODO confirm its
# signature in src.text_processing.
lda_model = get_lda_model(df_politicians, dictionary)
lda_model.show_topics(num_topics=10, num_words=4)

[(75, '0.009*"people" + 0.008*"think" + 0.008*"want" + 0.007*"know"'),
 (48, '0.011*"think" + 0.011*"people" + 0.008*"country" + 0.007*"good"'),
 (24, '0.012*"know" + 0.009*"think" + 0.008*"people" + 0.008*"want"'),
 (84, '0.008*"people" + 0.008*"want" + 0.007*"like" + 0.007*"know"'),
 (47, '0.013*"people" + 0.011*"know" + 0.010*"like" + 0.010*"think"'),
 (27, '0.012*"people" + 0.010*"know" + 0.009*"country" + 0.009*"like"'),
 (60, '0.013*"people" + 0.011*"think" + 0.008*"like" + 0.008*"know"'),
 (65, '0.010*"people" + 0.009*"think" + 0.009*"know" + 0.008*"like"'),
 (72, '0.012*"know" + 0.011*"people" + 0.010*"want" + 0.010*"think"'),
 (73, '0.009*"people" + 0.008*"know" + 0.008*"like" + 0.007*"great"')]

In [16]:
# Render the interactive pyLDAvis view of the fitted topics inside the notebook
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(
    dictionary=dictionary,
    corpus=df_politicians["bow"],
    topic_model=lda_model,
)

  default_term_info = default_term_info.sort_values(
