# IR Project 4
## Topic modeling

### Load libraries and data

In [7]:
#!pip install bertopic
#!pip install demoji
#!pip install pyLDAvis



In [1]:
# Import libraries
import json
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from bertopic import BERTopic
import demoji

[nltk_data] Downloading package stopwords to /Users/maga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load data
infile = open('tweets.json','rb')
tweets = json.load(infile)
infile.close()

In [3]:
tweets[0]

{'poi_name': 'POTUS',
 'username': 'POTUS',
 'out_links': '',
 'reply_to': None,
 'poi_id': 1349149096909668363,
 'user_id': 1349149096909668363,
 'retweet': '',
 'quote_tweet': '',
 'verified': True,
 'country': 'USA',
 'id': 1462438527577346050,
 'replied_to_tweet_id': None,
 'replied_to_user_id': None,
 'reply_text': None,
 'tweet_text': 'With the Bipartisan Infrastructure Law, we will once again have the best roads, bridges, ports, and airports.\n\nWe will lead the world into the 21st century with modern cars and trucks and transit systems.\n\nWe will be building and moving again.',
 'tweet_lang': 'en',
 'text_hi': '',
 'text_es': '',
 'text_en': 'With the Bipartisan Infrastructure Law, we will once again have the best roads, bridges, ports, and airports.\n\nWe will lead the world into the 21st century with modern cars and trucks and transit systems.\n\nWe will be building and moving again.',
 'hashtags': [],
 'mentions': [],
 'tweet_urls': [],
 'tweet_emoticons': [],
 'tweet_date'

In [18]:
tweets[0]['tweet_text']

'Congratulations to the Washington Spirit on winning your first-ever NWSL championship. Despite a year with no shortage of challenges, you’ve made the District and your country very proud.'

### Preprocess data

In [59]:
# Function to remove stopwords
def remove_stopwords(text, lang): 
    text_wo_stopwords = []
    # remove stopwords   
    if lang == 'en':
        stop_words = set(stopwords.words('english'))
    else:
        stop_words = set(stopwords.words('spanish'))
    text = re.split('\s',text)
    for t in text:
        if t not in stop_words:
            text_wo_stopwords.append(t)
    text_wo_stopwords = ' '.join(text_wo_stopwords)
    return text_wo_stopwords

  text = re.split('\s',text)


In [60]:
# Function to preprocess tweets
def preprocess(raw_tweet,lang):
    text = raw_tweet.lower() # convert to lowercase
    text = re.sub('\n',' ',text) # remove '\n'
    text = re.sub(r"http\S+", "",text) # remove urls
    text = re.sub('#',' ',text) # remove '#' but leave text from hashtag
    text = re.sub('@[a-zA-Z]+',' ', text) # remove mentions
    text = re.sub('^rt ',' ', text) # remove 'rt'
    text = re.sub('[,\.\:\!¡\?\¿\_–-\’\$%|]',' ',text) # remove punctuation
    text = re.sub('[0-9]+', ' ',text) # remove numbers
    # emojis = list(demoji.findall(text).keys()) # in case we want to store emojis
    text = demoji.replace(text, '')
    text = re.sub('\s+',' ',text) # remove extra whitespaces
    text = re.sub('^\s+','',text) # remove space(s) at start
    text = re.sub('\s+$','',text) # remove space(s) at end
    #text = re.split('\s',text) # tokenize in terms of white space
    if lang == 'en' or 'es': # no stopwrods for hindi
        text = remove_stopwords(text,lang)
    return text

  text = re.sub('[,\.\:\!¡\?\¿\_–-\’\$%|]',' ',text) # remove punctuation
  text = re.sub('\s+',' ',text) # remove extra whitespaces
  text = re.sub('^\s+','',text) # remove space(s) at start
  text = re.sub('\s+$','',text) # remove space(s) at end


In [36]:
# Extract tweets per language
def get_tweets(tweets_dic):
    
    data_en, data_es, data_hi = [], [], []
    
    for tweet in tweets:
        text = tweet['tweet_text']
        if tweet['tweet_lang'] == 'en':
            text = preprocess(text,'en')
            data_en.append(text)
        elif tweet['tweet_lang'] == 'es':
            text = preprocess(text,'es')
            data_es.append(text)
        elif tweet['tweet_lang'] == 'hi':
            text = preprocess(text,'hi')
            data_hi.append(text) 
            
    return data_en, data_es, data_hi

In [61]:
data_en, data_es, data_hi = get_tweets(tweets)

In [62]:
data_en[0]

'bipartisan infrastructure law best roads bridges ports airports lead world st century modern cars trucks transit systems building moving'

In [39]:
data_es[0]

['hoy',
 'comienza',
 'mes',
 'nacional',
 'herencia',
 'hispana',
 'mes',
 'importante',
 'recordatorio',
 'fuerza',
 'nace',
 'nues…']

In [40]:
data_hi[0]

['कोरोना',
 'के',
 'विरुद्ध',
 'जागरूकता',
 'अभियान',
 'में',
 'मानव',
 'सेवा',
 'संस्थान',
 'के',
 'कार्यकर्ताओं',
 'द्वारा',
 'शाहबाद',
 'ब्लॉक',
 'के',
 'ग्राम',
 'बडारा',
 'में',
 'टीकाकरण…']

### Fit models

TO-DO:
- Fine tune hyperparameters
- Train BERT embeddings on our own data?
- Currently using unsupervised movel. Might want to try:
    - Semi-supervised model: create some labels to guide BERTopic to the extraction of topics for those labels. The documents for which we do not have labels are assigned a -1.
    - Guided/Seeded model: set a number of seed topics to guide the model -> might be better than semi-supervised

#### Hyperparameters:
- language: english (default) or multilingual (supports Spanish and Hindi)
- top_n_words: number of words per topic (suggested below 30 and between 10-20)
- n_gram_range: (1,2) would feature 'New York' in the topic representation
- min_topic_size: what the minimum size of a topic can be. The lower this value the more topics are created. Deafult is 10
- nr_topics: specifies, after training the topic model, the number of topics that will be reduced to. Use "auto" to automatically reduce topics that have a similarity of at least 0.9, do not maps all others.
- low_memory
- calculate_probabilities

#### English

Custom Embeddings

In [52]:
from sentence_transformers import SentenceTransformer

In [63]:
# Prepare embeddings
docs = data_en
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Create topic model and use the custom embeddings
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs, embeddings)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  alg = KDTreeBoruvkaAlgorithm(tree, min_samples, metric=metric,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  condensed_tree = condense_tree(single_linkage_tree,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  labels, probabilities, stabilities = get_clusters(condensed_tree,


In [64]:
freq = topic_model.get_topic_info()
freq.head(10)

Unnamed: 0,Topic,Count,Name
0,-1,1371,-1_american_economy_jobs_better
1,0,203,0_meeting_meet_summit_rome
2,1,97,1_wildfires_fire_climate_hearing
3,2,91,2_vaccines_unvaccinated_hospitalization_likely
4,3,90,3_governor_california_cacomeback_rent
5,4,86,4_got_delhi_govt_sir
6,5,84,5_pavilion_assam_addressing_programme
7,6,75,6_vaccinecentury_mm_vaccination_indian
8,7,74,7_shri_ji_birthday_greetings
9,8,71,8_border_biden_crisis_southern


In [228]:
topic_model_en = BERTopic(verbose=True, min_topic_size=5,calculate_probabilities=True)
topics_en, probs_en = topic_model_en.fit_transform(data_en)

Batches:   0%|          | 0/139 [00:00<?, ?it/s]

2021-11-21 15:58:32,167 - BERTopic - Transformed documents to Embeddings
2021-11-21 15:58:36,496 - BERTopic - Reduced dimensionality with UMAP
2021-11-21 15:58:40,152 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [271]:
len(data_en)

4432

In [270]:
len(topics_en)

4432

In [272]:
len(probs_en)

4432

In [232]:
freq = topic_model_en.get_topic_info()
freq.head(10)
# -1 refers to all outliers and should typically be ignored 

Unnamed: 0,Topic,Count,Name
0,-1,1213,-1_congress_goa_workers_power
1,0,80,0_delhi_govt_school_arrangement
2,1,73,1_border_crisis_borders_migrants
3,2,52,2_ideals_mahatma_jayanti_bharati
4,3,49,3_wildfires_fire_firefighters_greenville
5,4,48,4_assam_addressing_paar_bengal
6,5,44,5_california_vaccinations_require_californians
7,6,41,6_vote_ballot_election_mail
8,7,38,7_ages_younger_acip_obesity
9,8,38,8_booster_pfizerbiontech_dose_shot


In [233]:
topic_nr = freq.iloc[9]["Topic"] 
topic_model_en.get_topic(topic_nr)

[('booster', 0.08325459814660419),
 ('pfizerbiontech', 0.054942695543649285),
 ('dose', 0.05129861624832379),
 ('shot', 0.050634485112047226),
 ('eligible', 0.026606738924967562),
 ('johnson', 0.019509091326252653),
 ('doses', 0.017419540925080648),
 ('fda', 0.017107826368214396),
 ('shots', 0.01638211227786373),
 ('immune', 0.016284529533173855)]

#### Spanish

In [225]:
topic_model_es = BERTopic(language="multilingual",verbose=True,min_topic_size=5,calculate_probabilities=True)
topics_es, probs_es = topic_model_es.fit_transform(data_es)

Batches:   0%|          | 0/82 [00:00<?, ?it/s]

2021-11-21 15:52:11,336 - BERTopic - Transformed documents to Embeddings
2021-11-21 15:52:18,386 - BERTopic - Reduced dimensionality with UMAP
2021-11-21 15:52:19,416 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [234]:
freq = topic_model_es.get_topic_info()
freq.head(10)

Unnamed: 0,Topic,Count,Name
0,-1,610,-1_gobierno_presidente_gracias_secretario
1,0,102,0_mexicanos_abogados_mexico_reunión
2,1,62,1_celebrandolavida_tradición_desfile_festejos
3,2,58,2_mujeres_género_mujer_mujeresrevolucionarias
4,3,54,3_abrazo_cariño_gracias_estimado
5,4,46,4_democracia_cambia_traicionaremos_nación
6,5,43,5_vacunas_biológicos_comunicadosalud_pfizer
7,6,41,6_participa_pública_saludsexual_saludreproductiva
8,7,38,7_dieta_alimentaciónsaludable_higiene_producto...
9,8,34,8_tócate_detecta_contraelcáncerdemamayoactúo_o...


In [236]:
topic_nr = freq.iloc[6]["Topic"]  # Select a frequent topic
topic_model_es.get_topic(topic_nr)

[('vacunas', 0.09738903069865736),
 ('biológicos', 0.036888807009199255),
 ('comunicadosalud', 0.03506884091459111),
 ('pfizer', 0.033013983906350026),
 ('vacunada', 0.029251722361036462),
 ('plannacionaldevacunación', 0.027387691654005484),
 ('biontech', 0.027387691654005484),
 ('pfizerbiontech', 0.025941837109852938),
 ('infórmateporsiteloperdiste', 0.021252617390186287),
 ('vacunadas', 0.019501148240690973)]

#### Hindi

In [231]:
topic_model_hi = BERTopic(language="multilingual",verbose=True,min_topic_size=5,calculate_probabilities=True)
topics_hi, probs_hi = topic_model_hi.fit_transform(data_hi)

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

2021-11-21 16:03:29,880 - BERTopic - Transformed documents to Embeddings
2021-11-21 16:03:32,298 - BERTopic - Reduced dimensionality with UMAP
2021-11-21 16:03:32,394 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [237]:
freq = topic_model_hi.get_topic_info()
freq.head(10)

Unnamed: 0,Topic,Count,Name
0,-1,260,-1_वन_मन_बन_सम
1,0,39,0_शल_नई_षण_आग
2,1,35,1_जप_जनत_कभ_यम
3,2,35,2_करण_harghardastak_vaccines_rs
4,3,30,3_press_conference_वप_बचकर
5,4,30,4_गर_सरक_पह_हमन
6,5,26,5_नमन_शत_जय_रण
7,6,23,6_दशम_नर_खड_एम
8,7,23,7_नम_षत_मल_सत
10,8,22,8_लग_बच_तक_लगव


### Visualize topics

#### English

In [238]:
fig = topic_model_en.visualize_topics()
fig

In [245]:
topic_model_en.visualize_distribution(probs_en[10], min_probability=0.015)

In [246]:
topic_model_en.visualize_hierarchy(top_n_topics=20)

In [247]:
topic_model_en.visualize_barchart(top_n_topics=5)

In [248]:
topic_model_en.visualize_heatmap(n_clusters=20, width=1000, height=1000)

In [249]:
topic_model_en.visualize_term_rank()

#### Spanish

In [250]:
fig = topic_model_es.visualize_topics()
fig

In [258]:
topic_model_es.visualize_hierarchy(top_n_topics=20)

In [259]:
topic_model_es.visualize_barchart(top_n_topics=5)

In [260]:
topic_model_es.visualize_heatmap(n_clusters=20, width=1000, height=1000)

In [261]:
topic_model_es.visualize_term_rank()

#### Hindi

In [262]:
fig = topic_model_hi.visualize_topics()
fig

In [265]:
topic_model_hi.visualize_barchart(top_n_topics=5)

In [266]:
topic_model_hi.visualize_heatmap(n_clusters=20, width=1000, height=1000)

In [267]:
topic_model_hi.visualize_term_rank()

#### LDA (Latent Dirichlet Allocation)
Code adapted from https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/topic_modeling_Gensim.ipynb

In [6]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [7]:
def prepare_text_for_lda(text):
    tokens = text
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [8]:
import random
text_data = []
for line in data_en:
    tokens = prepare_text_for_lda(line)
    if random.random() > .99:
        print(tokens)
        text_data.append(tokens)

['deliver', 'remark', 'september', 'report']
['never', 'raise', 'tax', 'penny', 'share', 'reward', 'country', 'wealth']
['going', 'protect', 'vaccinate', 'worker', 'unvaccinated', 'coworkers']
['economy', 'administration', 'building', 'instead', 'worker', 'compete', 'scarce', 'employer', 'compete', 'attract', 'worker', 'worker', 'power', 'essential', 'building', 'economy', 'better']
['covid-', 'vaccine', 'highly', 'effective', 'highly', 'effective', 'vaccine', 'experience', 'reduction']
['working', 'tackle', 'century', 'pressing', 'challenge', 'together']
['message', 'caregiver', 'caregiver', 'matter', 'watch', 'seven', 'caregiver', 'background', 'reason…']
['order', 'buildbackbetter', 'start', 'woman', 'big', 'challenge', 'pandemic', 'transformative', 'vision', 'enable', 'woman', 'without', 'worry', 'uncertainty', 'health', 'family']
['statement', 'twenty', 'years', 'since', 'september', 'attack']
['since', 'people', 'evacuate', 'afghanistan', 'assistance', 'military', '&amp;', 'ally'

In [9]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [12]:
# Convert document into the bag-of-words format = list of (token_id, token_count)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [17]:
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

LDA Parameters:
- num_topics: number of requested latent topics to be extracted from the training corpus.
- id2word: apping from word IDs to words. It is used to determine the vocabulary size, as well as for debugging and topic printing.
- passes: number of passes through the corpus during training.

In [20]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [26]:
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)
# print_topics(num_topics=20, num_words=10) # To select num of topics to print
# print_topic(topicno, topn=10) # To print one topic

(0, '0.015*"union" + 0.015*"home" + 0.008*"proud" + 0.008*"allies" + 0.008*"democrat"')
(1, '0.026*"home" + 0.014*"reason" + 0.014*"coverage" + 0.014*"slavery" + 0.014*"class"')
(2, '0.017*"amendment" + 0.017*"historic" + 0.012*"appointment" + 0.012*"around" + 0.012*"respect"')
(3, '0.023*"economist" + 0.012*"chance" + 0.012*"passing" + 0.012*"pressure" + 0.012*"nobel"')
(4, '0.020*"field" + 0.013*"reason" + 0.013*"union" + 0.013*"promise" + 0.013*"home"')


In [31]:
#from pprint import pprint
#top_topics = list(ldamodel.top_topics(corpus))
#pprint(top_topics)

In [41]:
# Infer topic distribution on new, unseen documents
new_doc = 'New Covid cases in the U.S. have increased by 25% in the past two weeks. In 14 states, cases have climbed by 40% or more.'
new_doc = preprocess(new_doc,'en')
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(254, 1), (295, 2), (428, 1)]
[(0, 0.5970222), (1, 0.040337805), (2, 0.28200385), (3, 0.0403021), (4, 0.040334057)]


In [42]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.029*"means" + 0.029*"major" + 0.015*"appointment" + 0.015*"allies" + 0.015*"announcement"')
(1, '0.021*"democrat" + 0.021*"union" + 0.021*"would" + 0.011*"appointment" + 0.011*"around"')
(2, '0.012*"value" + 0.012*"university" + 0.012*"support" + 0.012*"buildbackbetter" + 0.012*"celebrate"')
(3, '0.041*"field" + 0.028*"this…" + 0.015*"different" + 0.015*"right" + 0.015*"playing"')
(4, '0.016*"economist" + 0.016*"prisoner" + 0.016*"working" + 0.016*"underlie" + 0.016*"deserve"')
(5, '0.027*"amendment" + 0.018*"union" + 0.018*"home" + 0.018*"promise" + 0.018*"historic"')
(6, '0.015*"folks" + 0.015*"home" + 0.015*"reason" + 0.015*"slavery" + 0.015*"livelihood"')
(7, '0.032*"home" + 0.017*"class" + 0.017*"coverage" + 0.017*"update" + 0.017*"leadership"')
(8, '0.038*"economist" + 0.026*"nobel" + 0.026*"passing" + 0.026*"pressure" + 0.014*"future"')
(9, '0.033*"reason" + 0.022*"slavery" + 0.022*"decency" + 0.022*"country" + 0.022*"middle"')


#### Visualization with pyLDAvis

In [49]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [50]:
import pyLDAvis
import pyLDAvis.gensim_models
lda_display = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
