In [1]:
import re
from decouple import config
from pymongo import MongoClient
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF # Non-negative matrix factorization
from sklearn import metrics

In [2]:
connection_string = config('MONGO_CONNECTION_STRING')
client = MongoClient(connection_string)
db = client.db
coll = db['twitter']

In [3]:
pipeline = [
    {
        "$project": {
            "_id": 0,
            "uid": "$user.id",
            "user": "$user.name",
            "screen_name": "$user.screen_name",
            "user_desc": "$user.description",
            "verified": "$user.verified",
            "followers": "$user.followers_count"
        }
    },
    {
        "$sort": {
            "user": 1
        }
    }
]
# sorting by uid will implicitly filter by date joined

In [4]:
df = pd.DataFrame.from_dict(list(coll.aggregate(pipeline)))
df.drop_duplicates(subset=['uid'], keep='first', inplace=True)
df.head()

Unnamed: 0,uid,user,screen_name,user_desc,verified,followers
0,807034969902252032,"""MRH_1984"" 🇨🇦",MRH_1984,🇨🇦 Not Politically Correct. Libtards=Mute/Bloc...,False,2480
1,942083635808792576,#AnitasAffordableBookstore,AABookstore,https://t.co/ATupVUkbR4… Amazon https://t.co/m...,False,529
13,1207754603673972736,#BCPoliTalk,bcpolitalk,We talk politics in British Columbia. Hosted b...,False,206
15,2518034430,#FreeChelseaManning & #PrayForAmazonas,JJacobMarion,"Aging, pseudo-intellectual twink with abandonm...",False,163
16,38458897,#IndianStatus531,BlueCedarAngel,Gitxsan Nation; FireWeed tribe; House of Woo's...,False,1228


In [6]:
corpus = list(df['user_desc'])

In [7]:
# Remove URL junk
url_pattern = re.compile("http[^\s]+", re.I)

for i in range(len(corpus)):
    corpus[i] = url_pattern.sub('', corpus[i])

In [8]:
corpus[:5]

['🇨🇦 Not Politically Correct. Libtards=Mute/Block\n "Orwell\'s 1984 is Upon Us"....',
 ' Amazon  Smashwords  @LlewelynPritch   ...Twitter',
 "We talk politics in British Columbia. Hosted by @billtieleman and Daniel Fontaine. #bcpoli #BCPoliTalk\n\nWe're a Partnered Program of @CTMinBC.",
 'Aging, pseudo-intellectual twink with abandonment issues. Failed Youtuber. Author. Patrons get my books for free! 🏳️\u200d🌈🇨🇦 (He/Him/His)',
 "Gitxsan Nation; FireWeed tribe; House of Woo'sim'la’ha’; Crest 2 baby white owls; #LawsOfMatriach\nHuman in a woman's body"]

In [9]:
non_ascii_pattern = re.compile("[^\u0000-\u2300]") # source https://stackoverflow.com/questions/2124010/grep-regex-to-match-non-ascii-characters
# ellipsis: "\u2026"; currency symbols later

for i in range(len(corpus)):
    corpus[i] = non_ascii_pattern.sub('', corpus[i])

In [10]:
# from tutorial
def print_top_words(model, feature_names, n_top_words):
    for i, topic in enumerate(model.components_):
        message = "Topic #%d: " % i
        message += "   ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
        print()
    print()

In [11]:
N_TOPICS = 5

In [13]:
# Text frequency*Inverse document frequency matrix
# max_df: ignore terms with frequency higher than
# lowercase = True by default
tf_matrix = TfidfVectorizer(strip_accents='unicode',
                          analyzer='word',
                          stop_words='english',
                          ngram_range=(1,1))
M = tf_matrix.fit_transform(corpus)

In [14]:
n_grams = tf_matrix.get_feature_names()
print("TfIdf 1-grams:", len(n_grams))

TfIdf 1-grams: 7483


In [15]:
# Use raw counts (not TfIdf) for LDA
count_vect = CountVectorizer(strip_accents='unicode',
                             analyzer='word',
                             stop_words='english',
                             ngram_range=(1,1))
counts = count_vect.fit_transform(corpus)

In [16]:
raw_n_grams = count_vect.get_feature_names()

In [17]:
nmf = NMF(n_components=N_TOPICS, solver='mu')

In [18]:
W = nmf.fit_transform(M)

In [19]:
print_top_words(nmf, n_grams, 30)

Topic #0: justice   climate   social   environmental   activist   rights   advocate   change   writer   human   anti   director   settler   community   organizer   love   work   people   world   views   earth   based   environment   author   music   student   energy   fighting   policy   sovereignty

Topic #1: news   breaking   source   analysis   cbc   ca   vancouver   canada   ctv   stories   online   national   radio   local   city   reporter   account   politics   toronto   official   follow   latest   calgary   story   information   global   sports   email   world   cbcnews

Topic #2: canadian   proud   canada   political   mom   father   conservative   opinions   politics   wife   commentary   public   endorsements   husband   guy   junkie   party   hoser   canuck   fan   policy   life   media   time   bc   country   science   lover   member   cdnpoli

Topic #3: indigenous   people   relations   settler   issues   canada   resistance   student   island   territory   independent  

In [35]:
# Higher score, lower perplexity best https://cfss.uchicago.edu/notes/topic-modeling/#perplexity
components = [3,5,7]
decays = [0.5, 0.7, 0.9]
learning_offsets = [3,6,10]

In [36]:
for c in components:
    for d in decays:
        for lo in learning_offsets:
            print("Topics: %d\tDecay: %f\tOffset: %f" % (c, d, lo))
            lda = LatentDirichletAllocation(n_components=c,
                                            learning_decay=d,
                                            learning_method='online',
                                            random_state=0)
            L = lda.fit_transform(counts)
            print("Score: %d" % lda.score(counts))
            print("Perplexity: %d" % lda.perplexity(counts))
            print()

Topics: 3	Decay: 0.500000	Offset: 3.000000
Score: -167379
Perplexity: 6219

Topics: 3	Decay: 0.500000	Offset: 6.000000
Score: -167379
Perplexity: 6219

Topics: 3	Decay: 0.500000	Offset: 10.000000
Score: -167379
Perplexity: 6219

Topics: 3	Decay: 0.700000	Offset: 3.000000
Score: -166772
Perplexity: 6025

Topics: 3	Decay: 0.700000	Offset: 6.000000
Score: -166772
Perplexity: 6025

Topics: 3	Decay: 0.700000	Offset: 10.000000
Score: -166772
Perplexity: 6025

Topics: 3	Decay: 0.900000	Offset: 3.000000
Score: -166738
Perplexity: 6014

Topics: 3	Decay: 0.900000	Offset: 6.000000
Score: -166738
Perplexity: 6014

Topics: 3	Decay: 0.900000	Offset: 10.000000
Score: -166738
Perplexity: 6014

Topics: 5	Decay: 0.500000	Offset: 3.000000
Score: -170588
Perplexity: 7353

Topics: 5	Decay: 0.500000	Offset: 6.000000
Score: -170588
Perplexity: 7353

Topics: 5	Decay: 0.500000	Offset: 10.000000
Score: -170588
Perplexity: 7353

Topics: 5	Decay: 0.700000	Offset: 3.000000
Score: -169019
Perplexity: 6775

Topics: 

In [37]:
lda = LatentDirichletAllocation(n_components=3,
                                learning_decay=0.7,
                                learning_offset=10,
                                learning_method='online')

In [38]:
L = lda.fit_transform(counts)

In [40]:
print_top_words(lda, raw_n_grams, 40)

Topic #0: tweets   author   writer   university   like   live   bc   editor   life   teacher   research   father   poet   nation   things   prof   people   host   award   canadian   husband   junkie   retired   mother   british   consultant   working   proud   founder   chair   pronouns   member   matter   issues   views   freelance   lover   community   politics   metis

Topic #1: justice   climate   indigenous   social   activist   anti   love   rights   settler   environmental   canadian   people   feminist   student   proud   human   mom   advocate   writer   political   work   opinions   music   lover   make   good   world   change   canada   based   earth   public   non   health   policy   art   politics   artist   law   science

Topic #2: news   canada   media   com   reporter   world   stories   canadian   global   new   et   ca   politics   follow   independent   wetsuwetenstrong   cbc   en   free   truth   journalist   indigenous   account   email   gmail   online   ontario  

In [41]:
lda.perplexity(counts)

5939.292145553858