In [2]:
import re
from collections import defaultdict
from pprint import pprint
import pandas as pd
import sklearn
import gensim
import pyLDAvis

In [3]:
# Tutorial: https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html#sphx-glr-auto-examples-core-run-corpora-and-vector-spaces-py

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from gensim import corpora
from gensim import models
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, remove_stopwords

from pyLDAvis import gensim

In [4]:
df = pd.read_csv('aggregation.csv')
df.head()

Unnamed: 0,text,retweet_count,favorite_count,username,userdesc,verified,followers,created_at,has_urls,has_mentions,high_response
0,They are starting to get more and more despera...,24298,168648,GretaThunberg,17 year old climate and environmental activist...,True,4079169,2020-02-29 15:26:10,True,False,True
1,I do not believe we will defeat Donald Trump w...,18592,77895,BernieSanders,U.S. Senator from Vermont and candidate for Pr...,True,10951634,2020-03-02 20:30:56,False,False,True
2,Indigenous rights = Climate justice\n#Wetsuwet...,4609,21488,GretaThunberg,17 year old climate and environmental activist...,True,4086646,2020-02-08 13:36:48,True,False,True
3,Stop running away from your problem. Run into ...,2739,16317,pulte,The Philanthropist. Inventor of Twitter Philan...,True,2059165,2020-02-29 21:19:22,True,False,True
4,Support the Wet’suwet’en Nation and the pipeli...,2972,10035,GretaThunberg,17 year old climate and environmental activist...,True,4091979,2020-02-18 10:13:02,True,False,True


In [6]:
docs = list(df['text'])

In [7]:
# Remove URL junk
url_pattern = re.compile("http[^\s]+", re.I)

  url_pattern = re.compile("http[^\s]+", re.I)


In [8]:
# More restrictive punctuation match
non_ascii_pattern = re.compile("[^\u0000-\u007A]")

In [9]:
preprocess_filters = [lambda x: x.lower(), strip_punctuation, remove_stopwords]

In [10]:
for i in range(len(docs)):
    docs[i] = url_pattern.sub('', docs[i])
    docs[i] = non_ascii_pattern.sub('', docs[i])
    docs[i] = preprocess_string(docs[i], preprocess_filters)

In [11]:
docs[:5]

[['starting', 'desperate', 'shows', 'winning'],
 ['believe',
  'defeat',
  'donald',
  'trump',
  'candidate',
  'like',
  'joe',
  'biden',
  'supported',
  'iraq',
  'war'],
 ['indigenous',
  'rights',
  'climate',
  'justice',
  'wetsuwetenstrong',
  'keepitintheground'],
 ['stop',
  'running',
  'away',
  'problem',
  'run',
  'problem',
  'suck',
  'suck',
  'pop'],
 ['support',
  'wetsuweten',
  'nation',
  'pipeline',
  'protests',
  'happening',
  'canada',
  'wetsuwenstrong']]

In [12]:
add_stop_words = ['wetsuweten', 'wet', 'suwet', 'en',
                  'wetsuwetenstrong', 'wetsuwetensolidarity', 'shutdowncanada',
                  'bc', 'british', 'columbia', 'canada', 'indigenous', 'pipeline',
                  'hereditary', 'chiefs',
                 's', 'd', 'll', 've']

custom_stop_words = set(ENGLISH_STOP_WORDS).union(set(add_stop_words))

In [13]:
texts = [[word for word in document 
              if word not in custom_stop_words]
          for document in docs]

# remove words that appear infrequently
frequency = defaultdict(int)
for doc in texts:
    for t in doc:
        frequency[t] += 1

texts = [
    [t for t in doc if frequency[t] > 3]
    for doc in texts
]

pprint(texts[:5])

[['starting', 'shows'],
 ['believe', 'trump', 'like', 'joe', 'supported', 'war'],
 ['rights', 'climate', 'justice'],
 ['stop', 'running', 'away', 'problem', 'run', 'problem', 'suck', 'suck'],
 ['support', 'nation', 'protests', 'happening']]


In [14]:
dictionary = corpora.Dictionary(texts)
dictionary.save('tweets.dict')

In [15]:
corpus = [dictionary.doc2bow(text) for text in texts] # bag-of-words
corpora.MmCorpus.serialize('corpus.mm', corpus) # Matrix Market format

In [16]:
tfidf = models.TfidfModel(corpus)

In [17]:
corpus_tfidf = tfidf[corpus]

In [18]:
for i in range(2):
    print(corpus_tfidf[i])

[(0, 0.6655434613708038), (1, 0.746359096565835)]
[(2, 0.341641971009591), (3, 0.4556957719119699), (4, 0.2707662172220585), (5, 0.4159428397188536), (6, 0.47078731956761827), (7, 0.4556957719119699)]


In [19]:
NUM_TOPICS = 10

In [20]:
# https://radimrehurek.com/gensim/models/ldamodel.html#usage-examples
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=NUM_TOPICS)

In [21]:
lda.top_topics(corpus=corpus, dictionary=dictionary) # (Probability, word) Coherence score

[([(0.021002978, 'cdnpoli'),
   (0.01826894, 'c'),
   (0.017946737, 'people'),
   (0.01731633, 'rcmp'),
   (0.016082782, 'blockades'),
   (0.015850948, 'land'),
   (0.013375814, 'pro'),
   (0.012665336, 'bcpoli'),
   (0.011052729, 'support'),
   (0.0105227735, 't'),
   (0.010376567, 'media'),
   (0.009849693, 'let'),
   (0.009482365, 'letter'),
   (0.009178472, 'mikefarnworthbc'),
   (0.009146291, 'racist'),
   (0.00909171, 'know'),
   (0.009075, 'b'),
   (0.008859755, 'funding'),
   (0.0084663015, 'amp'),
   (0.008245632, 'news')],
  -10.655432813040731),
 ([(0.027480649, 'justintrudeau'),
   (0.026037702, 'giving'),
   (0.022257678, 'hey'),
   (0.02187131, 'millions'),
   (0.021389194, 'coastalgaslink'),
   (0.01797731, 'calling'),
   (0.017285813, 'exportdevcanada'),
   (0.01728576, 'mary'),
   (0.01728575, 'ng'),
   (0.017158432, '000'),
   (0.017030157, '34'),
   (0.017030135, 'reject'),
   (0.013765385, 'government'),
   (0.012276152, 'm'),
   (0.010345474, 'rcmp'),
   (0.0099234

In [22]:
dictionary.token2id['coronavirus']

37

In [23]:
lda.get_term_topics(dictionary.token2id['coronavirus'])

[(0, 0.011314897)]

In [24]:
lda.get_term_topics(dictionary.token2id['bcpoli'])

[(8, 0.01715834), (9, 0.012481387)]

In [26]:
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [27]:
pyLDAvis.show(vis)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [30/Mar/2020 15:49:27] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [30/Mar/2020 15:49:27] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [30/Mar/2020 15:49:27] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [30/Mar/2020 15:49:27] code 404, message Not Found
127.0.0.1 - - [30/Mar/2020 15:49:27] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [30/Mar/2020 15:49:27] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...
