In [1]:
import re
from collections import defaultdict
from pprint import pprint
from decouple import config
from pymongo import MongoClient
import numpy as np
import pandas as pd
import sklearn
import gensim
import pyLDAvis

In [21]:
# Tutorial: https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html#sphx-glr-auto-examples-core-run-corpora-and-vector-spaces-py

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from gensim import corpora
from gensim import models
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, remove_stopwords

from pyLDAvis import gensim

In [3]:
connection_string = config('MONGO_CONNECTION_STRING')
client = MongoClient(connection_string)
db = client.db
coll = db['twitter']

In [4]:
pipeline = [
    {
        "$project": {
            "_id": 0,
            "uid": "$user.id",
            "user": "$user.name",
            "screen_name": "$user.screen_name",
            "user_desc": "$user.description",
            "verified": "$user.verified",
            "followers": "$user.followers_count"
        }
    },
    {
        "$sort": {
            "user": 1
        }
    }
]

In [5]:
df = pd.DataFrame.from_dict(list(coll.aggregate(pipeline)))
df.drop_duplicates(subset=['uid'], keep='first', inplace=True)
df.head()

Unnamed: 0,uid,user,screen_name,user_desc,verified,followers
0,807034969902252032,"""MRH_1984"" 🇨🇦",MRH_1984,🇨🇦 Not Politically Correct. Libtards=Mute/Bloc...,False,2480
1,942083635808792576,#AnitasAffordableBookstore,AABookstore,https://t.co/ATupVUkbR4… Amazon https://t.co/m...,False,529
13,1207754603673972736,#BCPoliTalk,bcpolitalk,We talk politics in British Columbia. Hosted b...,False,206
15,2518034430,#FreeChelseaManning & #PrayForAmazonas,JJacobMarion,"Aging, pseudo-intellectual twink with abandonm...",False,163
16,38458897,#IndianStatus531,BlueCedarAngel,Gitxsan Nation; FireWeed tribe; House of Woo's...,False,1228


In [6]:
docs = list(df['user_desc'])

In [7]:
# Remove URL junk
url_pattern = re.compile("http[^\s]+", re.I)

  url_pattern = re.compile("http[^\s]+", re.I)


In [8]:
# More restrictive punctuation match
non_ascii_pattern = re.compile("[^\s\u0040-\u007A]")

  non_ascii_pattern = re.compile("[^\s\u0040-\u007A]")


In [9]:
# preprocess_filters = [lambda x: x.lower(), strip_punctuation, remove_stopwords]

In [10]:
for i in range(len(docs)):
    docs[i] = str(docs[i]).lower()
    docs[i] = url_pattern.sub('', docs[i])
    docs[i] = non_ascii_pattern.sub('', docs[i])

In [11]:
docs[:5]

[' not politically correct libtardsmuteblock\n orwells  is upon us',
 ' amazon  smashwords  @llewelynpritch   twitter',
 'we talk politics in british columbia hosted by @billtieleman and daniel fontaine bcpoli bcpolitalk\n\nwere a partnered program of @ctminbc',
 'aging pseudointellectual twink with abandonment issues failed youtuber author patrons get my books for free  hehimhis',
 'gitxsan nation fireweed tribe house of woosimlaha crest  baby white owls lawsofmatriach\nhuman in a womans body']

In [None]:
# add_stop_words = ['wetsuweten', 'wet', 'suwet', 'en',
#                   'wetsuwetenstrong', 'wetsuwetensolidarity', 'shutdowncanada',
#                   'bc', 'british', 'columbia', 'canada', 'indigenous', 'pipeline',
#                   'hereditary', 'chiefs',
#                  's', 'd', 'll', 've']

# custom_stop_words = set(ENGLISH_STOP_WORDS).union(set(add_stop_words))

In [12]:
texts = [[word for word in document.split() 
              if word not in ENGLISH_STOP_WORDS]
          for document in docs]

# remove words appear infrequently
frequency = defaultdict(int)
for doc in texts:
    for t in doc:
        frequency[t] += 1

texts = [
    [t for t in doc if frequency[t] > 4]
    for doc in texts
]

pprint(texts[:5])

[['politically', 'correct'],
 ['twitter'],
 ['talk', 'politics', 'british', 'columbia', 'hosted', 'bcpoli', 'program'],
 ['issues', 'author', 'books', 'free', 'hehimhis'],
 ['gitxsan', 'nation', 'house', 'white', 'human']]


In [13]:
dictionary = corpora.Dictionary(texts)
dictionary.save('users.dict')

In [14]:
corpus = [dictionary.doc2bow(text) for text in texts] # bag-of-words
corpora.MmCorpus.serialize('user_corpus.mm', corpus) # Matrix Market format

In [None]:
tfidf = models.TfidfModel(corpus)

In [None]:
corpus_tfidf = tfidf[corpus]

In [None]:
for i in range(2):
    print(corpus_tfidf[i])

In [None]:
# Find optimal number of topics
for n in range(2,11):
    print("%d topics" % n)
    lda_compare = models.LdaModel(corpus, id2word=dictionary, num_topics=n)
    cm = models.CoherenceModel(model=lda_compare, corpus=corpus, texts=texts)
    print(cm.get_coherence())
    print()

In [15]:
NUM_TOPICS = 10

In [16]:
# https://radimrehurek.com/gensim/models/ldamodel.html#usage-examples
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=NUM_TOPICS)

In [17]:
lda.top_topics(corpus=corpus, dictionary=dictionary) # [((Probability, word)], coherence score)

[([(0.03395843, 'justice'),
   (0.033756755, 'social'),
   (0.020900713, 'canada'),
   (0.011132948, 'love'),
   (0.00959508, 'indigenous'),
   (0.009364389, 'canadian'),
   (0.008480331, 'environmental'),
   (0.008373523, 'working'),
   (0.008373392, 'political'),
   (0.0074535217, 'climate'),
   (0.0074530914, 'work'),
   (0.0074529108, 'activist'),
   (0.0069958176, 'writer'),
   (0.006763515, 'public'),
   (0.0065330574, 'british'),
   (0.006532996, 'pm'),
   (0.006532724, 'science'),
   (0.0060505704, 'global'),
   (0.0056589814, 'issues'),
   (0.0056130816, 'policy')],
  -8.087980900644935),
 ([(0.02939238, 'climate'),
   (0.012592556, 'student'),
   (0.012153367, 'people'),
   (0.01215283, 'change'),
   (0.011698941, 'world'),
   (0.011291542, 'new'),
   (0.009567473, 'activist'),
   (0.0095671, 'tweets'),
   (0.008705674, 'university'),
   (0.008705639, 'father'),
   (0.008705631, 'environmental'),
   (0.008705495, 'mom'),
   (0.008705338, 'writer'),
   (0.008300925, 'justice')

In [18]:
dictionary.token2id['student']

113

In [19]:
lda.get_term_topics(dictionary.token2id['student'])

[(1, 0.012169319)]

In [23]:
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [24]:
pyLDAvis.show(vis)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [30/Mar/2020 15:58:46] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [30/Mar/2020 15:58:46] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [30/Mar/2020 15:58:46] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [30/Mar/2020 15:58:46] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...
