In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
from sklearn.pipeline import Pipeline

# Tutorials:
# https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF # Non-negative matrix factorization

In [3]:
pd.set_option("display.max_colwidth", 100)

In [4]:
df = pd.read_csv('aggregation.csv')
df.head()

Unnamed: 0,text,retweet_count,favorite_count,username,userdesc,verified,followers,created_at,has_urls,has_mentions,high_response
0,They are starting to get more and more desperate...\nThis shows that we’re winning. https://t.co...,24298,168648,GretaThunberg,17 year old climate and environmental activist with Asperger’s #climatestrike #fridaysforfuture...,True,4079169,2020-02-29 15:26:10,True,False,True
1,I do not believe we will defeat Donald Trump with a candidate like Joe Biden who supported the I...,18592,77895,BernieSanders,U.S. Senator from Vermont and candidate for President of the United States.,True,10951634,2020-03-02 20:30:56,False,False,True
2,Indigenous rights = Climate justice\n#WetsuwetenStrong #KeepItInTheGround https://t.co/1kYNumyoQT,4609,21488,GretaThunberg,17 year old climate and environmental activist with Asperger’s #climatestrike #fridaysforfuture...,True,4086646,2020-02-08 13:36:48,True,False,True
3,"Stop running away from your problem. Run into your problem. It will suck. Really suck. But, wh...",2739,16317,pulte,"The Philanthropist. Inventor of Twitter Philanthropy. Giving Money, Food, and Rent To People In ...",True,2059165,2020-02-29 21:19:22,True,False,True
4,Support the Wet’suwet’en Nation and the pipeline protests happening now in Canada! #WetsuwenStro...,2972,10035,GretaThunberg,17 year old climate and environmental activist with Asperger’s #climatestrike #fridaysforfuture...,True,4091979,2020-02-18 10:13:02,True,False,True


In [5]:
corpus = list(df['text'])

In [6]:
# Remove URL junk
url_pattern = re.compile("http[^\s]+", re.I)

for i in range(len(corpus)):
    corpus[i] = url_pattern.sub('', corpus[i])

In [7]:
non_ascii_pattern = re.compile("[^\u0000-\u2300]")

In [8]:
# from tutorial
def print_top_words(model, feature_names, n_top_words):
    for i, topic in enumerate(model.components_):
        message = "Topic #%d: " % i
        message += "   ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
        print()
    print()

In [9]:
N_TOPICS = 8

In [10]:
add_stop_words = ['wetsuweten', 'wet', 'suwet', 'en',
                  'wetsuwetenstrong', 'wetsuwetensolidarity', 'shutdowncanada',
                    'bc', 'british', 'columbia', 'canada', 'indigenous', 'pipeline',
                 'hereditary', 'chiefs']

custom_stop_words = set(ENGLISH_STOP_WORDS).union(set(add_stop_words))

In [11]:
# Text frequency*Inverse document frequency matrix
# max_df: ignore terms with frequency higher than
# lowercase = True by default
matrix = TfidfVectorizer(strip_accents='unicode',
                          analyzer='word',
                          stop_words=custom_stop_words,
                          ngram_range=(1,1))
M = matrix.fit_transform(corpus)

In [12]:
n_grams = matrix.get_feature_names()
print("TfIdf N-grams:", len(n_grams))

TfIdf N-grams: 8388


In [13]:
# Use raw counts (not TfIdf) for LDA
count_vect = CountVectorizer(strip_accents='unicode',
                             analyzer='word',
                             stop_words=custom_stop_words,
                             ngram_range=(1,1))
counts = count_vect.fit_transform(corpus)

In [14]:
raw_n_grams = count_vect.get_feature_names()

In [15]:
# n_components: if unset, all features used
# solver: multiplicative update optimization (used in topic modelling examples)
# https://mlexplained.com/2017/12/28/a-practical-introduction-to-nmf-nonnegative-matrix-factorization/
nmf = NMF(n_components=N_TOPICS, solver='mu')

In [16]:
W = nmf.fit_transform(M)

In [17]:
nmf.components_

array([[7.50729061e-159, 1.94264231e-100, 6.06853975e-004, ...,
        9.27147427e-050, 1.00633800e-163, 4.01916292e-032],
       [1.74806424e-020, 8.28171486e-001, 2.47926854e-021, ...,
        1.48859078e-043, 2.22010941e-023, 2.41524136e-025],
       [6.58788939e-004, 0.00000000e+000, 1.09713837e-259, ...,
        6.57399923e-004, 2.22995457e-033, 1.31127587e-052],
       ...,
       [0.00000000e+000, 0.00000000e+000, 1.66689250e-004, ...,
        3.72590178e-009, 0.00000000e+000, 4.82070546e-005],
       [0.00000000e+000, 0.00000000e+000, 1.01314384e-008, ...,
        5.03812491e-003, 5.42389995e-004, 3.07437590e-004],
       [0.00000000e+000, 1.38948067e-015, 0.00000000e+000, ...,
        5.90420232e-017, 0.00000000e+000, 1.53628254e-004]])

In [18]:
print_top_words(nmf, n_grams, 30)

Topic #0: pressure   tentative   gov   canadian   agreement   amp   concerns   title   reached   rights   resolve   doesn   governments   cdn   come   titl   nation   talks   government   milestone   thread   cgl   good   lan   ag   govt   watch   details   stand   media

Topic #1: exportdevcanada   mary_ng   millions   34   reject   giving   000   calling   hey   coastalgaslink   justintrudeau   jjhorgan   bcrcmp   globalbc   ho   smacdonald__   loan   happening   dollars   carolyn_bennett   taxpayer   economic   weeks   canadianpm   thread   survival   north   matters   sitec   grassroots

Topic #2: proposed   ministers   reach   agreement   dispute   news   breaking   cbc   reached   federal   arrangement   deal   title   governments   government   details   provincial   rights   draft   nation   talks   wets   uwet   days   milestone   chief   say   icymi   recognize   saying

Topic #3: solidarity   students   support   today   nation   blockade   walkout   student   amp   action  

In [19]:
lda = LatentDirichletAllocation(n_components=N_TOPICS, 
                                learning_method='online')

In [20]:
L = lda.fit_transform(counts)

In [21]:
print_top_words(lda, raw_n_grams, 30)

Topic #0: did   blockades   thread   yes   elected   led   got   band   rally   actions   getting   ongoing   ctvnews   thanks   facebook   media   struggle   water   trudeau   political   2020   ads   sure   just   title   companies   leadership   linked   fossil   undrip

Topic #1: solidarity   support   coastal   gaslink   students   need   read   deal   work   statement   don   today   march   right   youth   land   walkout   isn   governments   federal   gas   student   know   make   walk   trudeau   issues   great   university   legal

Topic #2: hey   calling   youth   000   34   colonial   exportdevcanada   legislature   rail   giving   going   way   justintrudeau   iy4wetsuweten   trudeau   cbcnews   chief   love   fight   says   vicpdcanada   won   lot   women   night   foreign   tonight   live   racism   maybe

Topic #3: amp   pressure   rights   ve   let   reconciliation   today   minister   climate   land   nation   territory   cgl   talks   blockades   watch   great   look

In [22]:
lda.score(counts)

-295634.17725081113

In [23]:
lda.perplexity(counts)

5496.083995797883

In [24]:
lda.get_params()

{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 8,
 'n_jobs': None,
 'perp_tol': 0.1,
 'random_state': None,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}