In [1]:
from __future__ import unicode_literals
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
import sklearn

In [2]:
from sklearn.pipeline import Pipeline

# Tutorials:
# https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF # Non-negative matrix factorization

In [3]:
# Let DataFrame previews show up to 100 characters per cell before truncating.
pd.set_option("display.max_colwidth", 100)

In [4]:
# Load the tweet table from a CSV in the working directory and preview the first rows.
# NOTE(review): the preview contains an "Unnamed: 0" column, which looks like a
# previously saved index — consider index_col=0 if nothing relies on that column.
df = pd.read_csv('aggregation.csv')
df.head()

Unnamed: 0,text,retweet_count,favorite_count,username,userdesc,verified,followers,created_at,has_urls,has_mentions,high_response
0,They are starting to get more and more desperate...\nThis shows that we’re winning. https://t.co...,24298,168648,GretaThunberg,17 year old climate and environmental activist with Asperger’s #climatestrike #fridaysforfuture...,True,4079169,2020-02-29 15:26:10,True,False,True
1,I do not believe we will defeat Donald Trump with a candidate like Joe Biden who supported the I...,18592,77895,BernieSanders,U.S. Senator from Vermont and candidate for President of the United States.,True,10951634,2020-03-02 20:30:56,False,False,True
2,Indigenous rights = Climate justice\n#WetsuwetenStrong #KeepItInTheGround https://t.co/1kYNumyoQT,4609,21488,GretaThunberg,17 year old climate and environmental activist with Asperger’s #climatestrike #fridaysforfuture...,True,4086646,2020-02-08 13:36:48,True,False,True
3,"Stop running away from your problem. Run into your problem. It will suck. Really suck. But, wh...",2739,16317,pulte,"The Philanthropist. Inventor of Twitter Philanthropy. Giving Money, Food, and Rent To People In ...",True,2059165,2020-02-29 21:19:22,True,False,True
4,Support the Wet’suwet’en Nation and the pipeline protests happening now in Canada! #WetsuwenStro...,2972,10035,GretaThunberg,17 year old climate and environmental activist with Asperger’s #climatestrike #fridaysforfuture...,True,4091979,2020-02-18 10:13:02,True,False,True


In [5]:
# One document per row: the 'text' column as a plain Python list.
corpus = df['text'].tolist()

In [6]:
# Remove URL junk
# Raw string literal: "\s" must reach the regex engine untouched. In a normal
# string it is an invalid escape sequence, which modern Python warns about.
url_pattern = re.compile(r"http[^\s]+", re.I)

# Drop every "http..." token (up to the next whitespace) from each document.
corpus = [url_pattern.sub('', text) for text in corpus]

In [7]:
# Matches any single character outside the range U+0000-U+2300. Despite the
# name, that range extends well beyond ASCII (which ends at U+007F).
# NOTE(review): the pattern is compiled here but not applied in the cells shown.
non_ascii_pattern = re.compile("[^\u0000-\u2300]")

In [32]:
# from tutorial
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute; each row is read as the
        term weights of one topic.
    feature_names : sequence mapping a column index of ``components_`` to its term.
    n_top_words : int
        How many terms to list per topic, ordered from largest weight down.

    Output is one "Topic #k: ..." line per topic, each followed by a blank
    line, and one extra blank line at the very end.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so walk the tail backwards to get the
        # n_top_words largest weights first.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[term_idx] for term_idx in top_indices]
        print("Topic #%d: " % topic_idx + "   ".join(top_terms))
        print()
    print()

In [27]:
# Number of topics to extract; passed as n_components to both the NMF and LDA models.
N_TOPICS = 8

In [8]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Extra, dataset-specific terms to exclude in addition to scikit-learn's
# built-in English stop-word list.
add_stop_words = [
    'wetsuweten', 'wet', 'suwet', 'en',
    'wetsuwetenstrong', 'wetsuwetensolidarity', 'shutdowncanada',
    'bc', 'british', 'columbia', 'canada', 'indigenous', 'pipeline',
    'hereditary', 'chiefs',
]

# Merge the built-in and custom terms into one set for the vectorizers.
custom_stop_words = set(ENGLISH_STOP_WORDS) | set(add_stop_words)

In [41]:
# TF-IDF (term frequency * inverse document frequency) document-term matrix.
# max_df (upper document-frequency cut-off) is not set here.
# lowercase=True by default.
# stop_words is passed as a sorted list: scikit-learn >= 1.2 validates this
# parameter and rejects a set, while older releases accept a list too.
matrix = TfidfVectorizer(strip_accents='unicode',
                         analyzer='word',
                         stop_words=sorted(custom_stop_words),
                         ngram_range=(1, 1))
M = matrix.fit_transform(corpus)

In [42]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
# The result is kept as a plain list so downstream indexing behaves the same.
if hasattr(matrix, "get_feature_names_out"):
    n_grams = list(matrix.get_feature_names_out())
else:
    n_grams = matrix.get_feature_names()
print("TfIdf N-grams:", len(n_grams))

TfIdf N-grams: 8388


In [43]:
# Use raw counts (not TfIdf) for LDA
# stop_words is passed as a sorted list: scikit-learn >= 1.2 validates this
# parameter and rejects a set, while older releases accept a list too.
count_vect = CountVectorizer(strip_accents='unicode',
                             analyzer='word',
                             stop_words=sorted(custom_stop_words),
                             ngram_range=(1, 1))
counts = count_vect.fit_transform(corpus)

In [44]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(count_vect, "get_feature_names_out"):
    raw_n_grams = list(count_vect.get_feature_names_out())
else:
    raw_n_grams = count_vect.get_feature_names()

In [45]:
# n_components: if unset, all features used
# solver: multiplicative update optimization (used in topic modelling examples)
# https://mlexplained.com/2017/12/28/a-practical-introduction-to-nmf-nonnegative-matrix-factorization/
# random_state: fixed seed so the factorization is reproducible across runs
nmf = NMF(n_components=N_TOPICS, solver='mu', random_state=0)

In [46]:
# Fit NMF on the TF-IDF matrix M; W is the transformed data returned by
# fit_transform (per-document topic weights, per the scikit-learn NMF docs).
W = nmf.fit_transform(M)

In [47]:
# Inspect the fitted components array; print_top_words reads each row as one
# topic and each column as the weight of one vocabulary term.
nmf.components_

array([[1.57774483e-160, 3.61599067e-100, 6.00856959e-004, ...,
        3.10675338e-050, 1.72849993e-163, 8.90580307e-033],
       [1.20499047e-021, 8.28161472e-001, 3.03715051e-023, ...,
        3.53695620e-043, 2.45675344e-023, 3.09979410e-024],
       [6.83220665e-004, 0.00000000e+000, 1.89165886e-256, ...,
        6.74570183e-004, 8.90065713e-033, 3.53358272e-050],
       ...,
       [0.00000000e+000, 0.00000000e+000, 3.39546630e-008, ...,
        4.85965275e-007, 0.00000000e+000, 5.02176876e-005],
       [0.00000000e+000, 0.00000000e+000, 5.32611923e-004, ...,
        5.03695731e-003, 5.19770659e-004, 3.12108577e-004],
       [0.00000000e+000, 3.32606976e-015, 0.00000000e+000, ...,
        0.00000000e+000, 0.00000000e+000, 1.52802276e-004]])

In [48]:
# Show the 30 highest-weighted TF-IDF terms for each NMF topic.
print_top_words(nmf, n_grams, 30)

Topic #0: pressure   tentative   gov   canadian   agreement   amp   concerns   title   reached   rights   resolve   doesn   governments   cdn   come   titl   nation   cgl   talks   government   milestone   thread   good   lan   ag   govt   watch   details   media   does

Topic #1: exportdevcanada   mary_ng   millions   34   reject   giving   000   calling   hey   coastalgaslink   justintrudeau   jjhorgan   bcrcmp   globalbc   ho   smacdonald__   loan   happening   dollars   carolyn_bennett   taxpayer   economic   weeks   thread   canadianpm   survival   north   matters   sitec   grassroots

Topic #2: proposed   ministers   reach   agreement   dispute   news   breaking   cbc   reached   federal   arrangement   deal   title   governments   government   details   provincial   rights   draft   nation   talks   wets   uwet   days   milestone   chief   say   icymi   recognize   saying

Topic #3: solidarity   students   today   support   nation   blockade   walkout   student   amp   action   

In [49]:
# Online variational Bayes LDA with the same number of topics as the NMF model.
# random_state: fixed seed so the stochastic fit is reproducible across runs.
lda = LatentDirichletAllocation(n_components=N_TOPICS,
                                learning_method='online',
                                random_state=0)

In [50]:
# Fit LDA on the raw count matrix; L is the transformed data returned by
# fit_transform (per-document topic distribution, per the scikit-learn LDA docs).
L = lda.fit_transform(counts)

In [51]:
# Show the 30 highest-weighted count-vectorizer terms for each LDA topic.
print_top_words(lda, raw_n_grams, 30)

Topic #0: agreement   amp   gov   tentative   pressure   deal   proposed   coastal   gaslink   gas   don   says   know   ministers   th   title   rights   government   people   news   land   federal   peoples   reached   consent   world   issue   dispute   reach   doesn

Topic #1: canadian   blockades   support   people   government   like   rail   amp   way   blockade   youth   legislature   right   new   work   solidarity   iy4wetsuweten   white   oil   today   did   nation   week   leaders   protests   know   truth   saying   just   big

Topic #2: cdnpoli   rcmp   bcpoli   trudeau   amp   reconciliation   minister   unistoten   actions   live   general   scottfraserndp   talks   breaking   ndp   solicitor   say   press   injunction   release   conference   set   fraser   cgl   injunctions   law   iy4wetsuweten   meeting   justin   pm

Topic #3: nations   need   march   don   really   toronto   got   sure   lands   lot   industry   leader   better   amp   didn   doing   liberal   rac