## 0. Install and load packages 

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')#, force_remount=True)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
!pip install sentence_transformers
!pip install umap-learn
!pip install hdbscan
!pip install bertopic[flair] #[all]



In [5]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from umap import UMAP
from hdbscan import HDBSCAN
import numpy as np
import pandas as pd
import pickle
from bertopic import BERTopic


## 1. Load and combine datasets

In [6]:
dataset_list = ['df_v3_1_200000','df_v3_200000_400000','df_v3_400000_600000','df_v3_600000_800000','df_v3_800000_850000','df_v3_850000_905573']
dfs = [pd.read_csv('gdrive/My Drive/nlp-tm-proj/PolUSA/PolUSA-data/{}.csv'.format(dataset)) for dataset in dataset_list]
dfs = pd.concat(dfs)
dfs.reset_index(drop = True, inplace = True)

  if self.run_code(code, result):


In [7]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905573 entries, 0 to 905572
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         905573 non-null  object 
 1   date_publish               905572 non-null  object 
 2   outlet                     905573 non-null  object 
 3   headline                   905547 non-null  object 
 4   lead                       840629 non-null  object 
 5   body                       905572 non-null  object 
 6   authors                    732635 non-null  object 
 7   domain                     905571 non-null  object 
 8   url                        905571 non-null  object 
 9   political_leaning          905571 non-null  object 
 10  headline_len               905545 non-null  float64
 11  body_len                   905571 non-null  float64
 12  headline_lang              905547 non-null  object 
 13  body_lang                  90

In [8]:
# only keep the rows with conditions: 
## 1. body is English 2. body is meaningful and not null
dfs = dfs[(dfs.if_eng_body==1)&(dfs.if_meaningful_body==1)&(dfs.body==dfs.body)]

In [9]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 898376 entries, 0 to 905572
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         898376 non-null  object 
 1   date_publish               898376 non-null  object 
 2   outlet                     898376 non-null  object 
 3   headline                   898350 non-null  object 
 4   lead                       833458 non-null  object 
 5   body                       898376 non-null  object 
 6   authors                    727020 non-null  object 
 7   domain                     898376 non-null  object 
 8   url                        898376 non-null  object 
 9   political_leaning          898376 non-null  object 
 10  headline_len               898350 non-null  float64
 11  body_len                   898376 non-null  float64
 12  headline_lang              898350 non-null  object 
 13  body_lang                  89

In [10]:
### use only a portion of data over 2017-2019 for testing ###
dfs.date_publish=pd.to_datetime(dfs.date_publish)

dfs_overtime= []
for year in range(2017,2020):
    for month in range(1,13): 
        dfs_overtime.append(dfs[dfs.date_publish.dt.year ==year][dfs.date_publish.dt.month ==month][:300])

dfs_overtime = pd.concat(dfs_overtime)
dfs_overtime.date_publish.describe()

count                    9600
unique                   3803
top       2017-03-01 00:00:00
freq                      300
first     2017-01-01 00:00:00
last      2019-08-01 04:43:57
Name: date_publish, dtype: object

In [11]:
dfs_overtime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9600 entries, 0 to 882175
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   id                         9600 non-null   object        
 1   date_publish               9600 non-null   datetime64[ns]
 2   outlet                     9600 non-null   object        
 3   headline                   9600 non-null   object        
 4   lead                       7748 non-null   object        
 5   body                       9600 non-null   object        
 6   authors                    8191 non-null   object        
 7   domain                     9600 non-null   object        
 8   url                        9600 non-null   object        
 9   political_leaning          9600 non-null   object        
 10  headline_len               9600 non-null   float64       
 11  body_len                   9600 non-null   float64       
 12  head

## 2. Fit BerTopic model

- prepare custom embeddings using tfidf, umap, hdbscan and vectorizor models with the paramters below
- use Bertopic to wrap them
- fit a Topic model

### 2.1 Prepare tfidf embeddings

In [None]:
vectorizer = TfidfVectorizer(min_df=5) #, ngram_range=(1, 2))
embeddings = vectorizer.fit_transform(dfs_overtime.body.tolist())

In [None]:
# save tfidf embeddings for body
with open('tfidf_embeddings_body.pkl', "wb") as fOut:
    pickle.dump({'bodies': dfs_overtime.body.tolist(), 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
!cp -r tfidf_embeddings_body.pkl "gdrive/My Drive/nlp-tm-proj/PolUSA/saved-models-and-embeddings/"

In [None]:
# load tfidf embeddings
'''
with open('tfidf_embeddings_body.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data['bodies']
    stored_embeddings = stored_data['embeddings']
'''    

### 2.2 Prepare umap, hdbscan and CountVectorizer as the base for c-tfidf

In [None]:
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='hellinger')
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)


In [None]:
# consider adding more words to stop_words list such as:
#added_stopwords = ['like', 'says', 'say', 'just', 'scource text', 'company coverage','really', "that's",'thats', 'im','going']
# consider lemma
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words= 'english') # text.ENGLISH_STOP_WORDS.union(added_stopwords) )


### 2.3 Fit topic model 

In [None]:
# Pass the custom models to BERTopic
topic_model = BERTopic(umap_model=umap_model, #non-linear dimension reduction
                       hdbscan_model=hdbscan_model, 
                       vectorizer_model=vectorizer_model, # CV is the base for c-tfidf
                       calculate_probabilities = False, #Whether to calculate the topic probabilities. This could slow down the extraction of topics if you have many documents (> 100_000).
                       verbose = True, #Changes the verbosity of the model, Set to True if you want to track the stages of the model.
                       nr_topics = None,
                       low_memory = True, ##If this is set to None, no reduction is applied. Use "auto" to automatically reduce topics that have a similarity of at least 0.9, do not maps all others.
                       top_n_words = 10) 

In [None]:
#topics: topic_id for each doc. (#docs, 1)
#probabilities: for each doc, prob of each topic. assign the doc to the topic with the highest prob (#docs, #topics)
topics, probabilities = topic_model.fit_transform(dfs_overtime.body.tolist(),embeddings)


2021-05-22 20:37:11,145 - BERTopic - Reduced dimensionality with UMAP
2021-05-22 20:37:11,714 - BERTopic - Clustered UMAP embeddings with HDBSCAN


## 3. Save and load the default TM model

The default model for body:
- using tfidf for embeddings 
- using the parameters above for umap, sdbscan and CV

In [None]:
### save the default model
topic_model.save("default_model_body", save_embedding_model=False)
!cp -r default_model_body "gdrive/My Drive/nlp-tm-proj/PolUSA/saved-models-and-embeddings/"

In [None]:
topic_model.get_params()

{'calculate_probabilities': False,
 'embedding_model': None,
 'hdbscan_model': HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
         approx_min_span_tree=True, cluster_selection_epsilon=0.0,
         cluster_selection_method='eom', core_dist_n_jobs=4,
         gen_min_span_tree=False, leaf_size=40,
         match_reference_implementation=False, memory=Memory(location=None),
         metric='euclidean', min_cluster_size=10, min_samples=None, p=None,
         prediction_data=True),
 'language': 'english',
 'low_memory': True,
 'min_topic_size': 10,
 'n_gram_range': (1, 1),
 'nr_topics': 10,
 'top_n_words': 10,
 'umap_model': UMAP(a=None, angular_rp_forest=True, b=None, dens_frac=0.0, dens_lambda=0.0,
      dens_var_shift=0.1, densmap=False, disconnection_distance=None,
      force_approximation_algorithm=False, init='spectral', learning_rate=1.0,
      local_connectivity=1.0, low_memory=True, metric='hellinger',
      metric_kwds=None, min_dist=0.1, n_components=5, n_

In [None]:
### load the default model
'''
# the model can't be loaded properly as below
topic_model = BERTopic.load("gdrive/My Drive/nlp-tm-proj/PolUSA/saved-models-and-embeddings/default_model_body") 
'''

TypingError: ignored

## 4. Visualize the default model

In [None]:
topic_freq = topic_model.get_topic_freq()
outliers = topic_freq['Count'][topic_freq['Topic']==-1].iloc[0]
print(f"{outliers} documents have not been classified")
print(f"The other {topic_freq['Count'].sum() - outliers} documents are {topic_freq['Topic'].shape[0]-1} topics")

4276 documents have not been classified
The other 5324 documents are 140 topics


### 4.1 Get topics and the number of docs within each topic

In [None]:
topic_freq[:10]

Unnamed: 0,Topic,Count
0,-1,4276
1,129,330
2,54,271
3,91,142
4,22,135
5,127,114
6,9,105
7,112,103
8,17,101
9,108,98


In [None]:
print(f"There are {topic_freq['Count'].iloc[1]} documents that are talking about topic ID {topic_freq['Topic'].iloc[1]}")

There are 330 documents that are talking about topic ID 129


### 4.2 Get keywords and their c-TF-IDF scores for a certain topic

- Return top n words for a specific topic and their c-TF-IDF scores

In [None]:
# keywords and their probs in Topic ID 54
topic_model.get_topic(topic_freq['Topic'].iloc[2])

[('north', 0.03408401362964749),
 ('korea', 0.030525282960766665),
 ('north korea', 0.023665150119960156),
 ('kim', 0.02339415898813008),
 ('korean', 0.02316172118130664),
 ('south', 0.015368217517333279),
 ('nuclear', 0.014588461690586292),
 ('north korean', 0.013963176753811937),
 ('koreas', 0.012008440962450172),
 ('south korea', 0.011169532905012094),
 ('jong', 0.00984935357677524),
 ('kim jong', 0.00974644992617186),
 ('north koreas', 0.00864480348674876),
 ('missile', 0.00828795736784587),
 ('pyongyang', 0.007618905223593037),
 ('south korean', 0.007446572428442533),
 ('summit', 0.006773013502126493),
 ('military', 0.005963258898251201),
 ('weapons', 0.005685602500364703),
 ('koreans', 0.00565084335678844),
 ('seoul', 0.005516927509518263),
 ('japan', 0.005387716432052546),
 ('united', 0.005077184711707496),
 ('sanctions', 0.004960082358293488),
 ('war', 0.004946693967933465),
 ('leader', 0.0049404147728636634),
 ('trump', 0.004918936613077215),
 ('leader kim', 0.0049166971760884

### 4.3 Get information about each topic including its id, frequency, and name



In [None]:
topic_model.get_topic_info()[:30]

Unnamed: 0,Topic,Count,Name
0,-1,4276,-1_people_police_new_trump
1,129,330,129_know_like_think_just
2,54,271,54_north_korea_north korea_kim
3,91,142,91_climate_climate change_paris_emissions
4,22,135,22_audio available_available later_later today...
5,127,114,127_biden_harris_debate_booker
6,9,105,9_israel_palestinian_palestinians_hamas
7,112,103,112_brexit_eu_uk_deal
8,17,101,17_bush_george_bushs_hw
9,108,98,108_hiv_health_disease_patients


### 4.4 Find topics most similar to a search_term

- Creates an embedding for search_term and compares that with the topic embeddings. The most similar topics are returned along with their similarity values.

- The search_term can be of any size but since it compares with the topic representation it is advised to keep it below 5 words.

- This method can only be used if you did not use custom embeddings.

In [None]:
topic_model.find_topics("brexit", top_n=5) 

### 4.5 Visualize topics

In [None]:
topic_model.visualize_topics()

In [None]:
# need to set: calculate_probabilities = True for visualize_distribution
topic_model.visualize_distribution(probabilities[0])

## 5. Update dataset:

- Add columns obtained from topic_model to the current df
- columns added: 'topic_id', 'top_30_keywords_with_score', 'top_30_keywords', 'Topic','Count', 'Name'

In [None]:
#topics, probabilities = topic_model.fit_transform(dfs.headline.tolist())

In [None]:
dfs_overtime['topic_id'] = topics
dfs_overtime['top_30_keywords_with_score'] = dfs_overtime.apply(lambda x: topic_model.get_topic(x.topic_id), axis = 1)

In [None]:
# extract all keywords from kw_score_column and tranfer them to an added column kw_column. Then, update the dataframe. 
def extract_keywords(df, kw_score_column, kw_column):
    df[kw_column] =''
    for i in df.index.tolist():
        df[kw_column][i] = [items[0] for items in df[kw_score_column][i]]
    return df

In [None]:
dfs_overtime = extract_keywords(dfs_overtime, 'top_30_keywords_with_score', 'top_30_keywords')
dfs_overtime = pd.merge(dfs_overtime,topic_model.get_topic_info(), left_on=['topic_id'], right_on=['Topic'], how ='left') #dfs' indexes get reset

In [None]:
dfs_overtime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9600 entries, 0 to 9599
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          9600 non-null   object        
 1   date_publish                9600 non-null   datetime64[ns]
 2   outlet                      9600 non-null   object        
 3   headline                    9600 non-null   object        
 4   lead                        7748 non-null   object        
 5   body                        9600 non-null   object        
 6   authors                     8191 non-null   object        
 7   domain                      9600 non-null   object        
 8   url                         9600 non-null   object        
 9   political_leaning           9600 non-null   object        
 10  headline_len                9600 non-null   float64       
 11  body_len                    9600 non-null   float64     

In [None]:
dfs_overtime.sample(3)

Unnamed: 0,id,date_publish,outlet,headline,lead,body,authors,domain,url,political_leaning,headline_len,body_len,headline_lang,body_lang,if_eng_body,if_eng_headline,if_eng_headline_nonstrict,preprocessed_headline,preprocessed_body,if_meaningful_headline,if_meaningful_body,topic_id,top_30_keywords_with_score,top_30_keywords,Topic,Count,Name
7413,3888368,2019-01-01 10:08:36,HuffPost,Trump: Ask Warren's Psychiatrist If She Thinks...,"In a New Year's Eve Q&A with Fox News, Trump m...",Sen. Elizabeth Warren (D-Mass.) may be aiming ...,Amy Russo;Trends Reporter,www.huffingtonpost.com,https://www.huffingtonpost.com/entry/trump-war...,LEFT,13.0,368.0,[en:0.9999961861552729],[en:0.999996115442866],1,1,1,Trump: Ask Warren's Psychiatrist If She Thinks...,Sen. Elizabeth Warren (D-Mass.) may be aiming ...,1,1,-1,"[(people, 0.0025383992842847366), (police, 0.0...","[people, police, new, trump, years, state, yea...",-1,4276,-1_people_police_new_trump
8060,38908213,2019-03-01 05:30:06,The Guardian,Shadow of Ulster in the Welsh valleys - archiv...,1 March 1989: The polarisation of Welsh nation...,The daffodils bloomed on time for St David’s D...,Tony Heath,www.theguardian.com,https://www.theguardian.com/politics/2019/mar/...,LEFT,13.0,894.0,[en:0.9999974863611485],[en:0.9999939521280423],1,1,1,Shadow of Ulster in the Welsh valleys - archiv...,The daffodils bloomed on time for St David’s D...,1,1,-1,"[(people, 0.0025383992842847366), (police, 0.0...","[people, police, new, trump, years, state, yea...",-1,4276,-1_people_police_new_trump
3985,115894146,2018-02-01 00:00:00,ABC News,Military looking at possible cellphone ban at ...,Military looking at possible cellphone ban at ...,The Defense Department is reviewing whether ce...,Abc News;More Luis,abcnews.go.com,http://abcnews.go.com/Politics/military-cellph...,CENTER,9.0,588.0,[en:0.9999963490015933],[en:0.9999959970278767],1,1,1,Military looking at possible cellphone ban at ...,The Defense Department is reviewing whether ce...,1,1,-1,"[(people, 0.0025383992842847366), (police, 0.0...","[people, police, new, trump, years, state, yea...",-1,4276,-1_people_police_new_trump


## 6. Topics per class

- see how certain topics are represented over documents with certain political leaning (center, left, right, undefined). 


In [None]:
# set verbose = true when fitting the model. 
# topics_per_class() doesn't work with the model after .reduce_topics()
topics_per_class = topic_model.topics_per_class(dfs_overtime.body.tolist(), topics, classes=dfs_overtime.political_leaning.tolist())

4it [00:50, 12.51s/it]


In [None]:
topics_per_class.sort_values(by = 'Topic')

Unnamed: 0,Topic,Words,Frequency,Class
0,-1,"police, county, image, people, years",1131,UNDEFINED
232,-1,"people, says, new, years, npr",1631,LEFT
362,-1,"trump, police, news, state, percent",1062,CENTER
126,-1,"police, news, people, trump, new",452,RIGHT
363,0,"notifications breaking, alerttag, news alertta...",11,CENTER
...,...,...,...,...
361,137,"marysville, banos, los banos, sheriffs, suspect",1,LEFT
231,137,"deputies, spurlock, riehl, sheriffs, suspect",5,RIGHT
487,137,"spurlock, riehl, deputies, sheriffs, sheriff",13,CENTER
124,138,"vehicle, lake, buenosanchez, police, sheriffs",13,UNDEFINED


In [None]:
topic_model.visualize_topics_per_class(topics_per_class, top_n=10)

## 7. Dynamic Topic Modeling

- see how a topic is represented across different times.

In [None]:
topics_over_time = topic_model.topics_over_time(dfs_overtime.body.tolist(), topics, dfs_overtime.date_publish.tolist(),nr_bins=20, global_tuning=True, evolution_tuning=True)

20it [16:30, 49.54s/it]


In [None]:
topic_model.visualize_topics_over_time(topics_over_time, top_n=10)


## 8. Update default model: reduce the number of topics to 10

In [None]:
new_topics, new_probs = topic_model.reduce_topics(dfs_overtime.headline.tolist(), topics, probabilities, nr_topics=10)

2021-05-22 21:51:56,015 - BERTopic - Reduced number of topics from 141 to 11


In [None]:
topic_model.get_topic_info()[:10]

Unnamed: 0,Topic,Count,Name
0,-1,7924,-1_trump_new_says_police
1,129,330,129_trump_2017_people_good
2,54,284,54_north_korea_north korea_kim
3,17,148,17_bush_george_george hw_hw
4,91,142,91_climate_paris_climate change_paris climate
5,34,138,34_iran_protests_death toll_toll
6,22,135,22_bush_george hw_hw_george
7,51,128,51_tariffs_trade_canada_china
8,72,126,72_russian_russia_putin_treaty
9,94,123,94_kavanaugh_supreme_supreme court_court


In [None]:
topic_model.visualize_topics()

In [None]:
topics_over_time = topic_model.topics_over_time(dfs_overtime.body.tolist(), new_topics, dfs_overtime.date_publish.tolist(),nr_bins=20, global_tuning=True, evolution_tuning=True)

20it [00:16,  1.24it/s]


In [None]:
topic_model.visualize_topics_over_time(topics_over_time, top_n=10)

## 9. Fit another bertopic model with pooled Glove embeddings from Flair

### 9.1 Fit the model

In [None]:
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
#use Flair to use word embeddings and pool them to create document embeddings. 
#Under the hood, Flair simply averages all word embeddings in a document
glove_embedding = WordEmbeddings('crawl')
document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])


2021-05-22 21:55:05,100 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-crawl-300d-1M.vectors.npy not found in cache, downloading to /tmp/tmpokszcazs


100%|██████████| 1200000128/1200000128 [00:30<00:00, 39043605.79B/s]

2021-05-22 21:55:35,936 copying /tmp/tmpokszcazs to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M.vectors.npy





2021-05-22 21:55:40,105 removing temp file /tmp/tmpokszcazs
2021-05-22 21:55:40,562 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-crawl-300d-1M not found in cache, downloading to /tmp/tmptamr1ni5


100%|██████████| 39323680/39323680 [00:01<00:00, 34483476.54B/s]

2021-05-22 21:55:41,799 copying /tmp/tmptamr1ni5 to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M
2021-05-22 21:55:41,852 removing temp file /tmp/tmptamr1ni5





In [None]:
document_glove_embeddings

DocumentPoolEmbeddings(
  fine_tune_mode=none, pooling=mean
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('crawl')
  )
)

In [None]:
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)


In [None]:
# consider adding more words to stop_words list such as:
#added_stopwords = ['like', 'says', 'say', 'just', 'scource text', 'company coverage','really', "that's", 'http'] ...
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words= 'english')#text.ENGLISH_STOP_WORDS.union(added_stopwords) )


In [None]:
# Pass the custom models to BERTopic
topic_model_dge = BERTopic(embedding_model=document_glove_embeddings,
                       umap_model=umap_model, #non-linear dimension reduction
                       hdbscan_model=hdbscan_model, 
                       vectorizer_model=vectorizer_model, # CV is the base for c-tfidf
                       calculate_probabilities = False, #Whether to calculate the topic probabilities. This could slow down the extraction of topics if you have many documents (> 100_000).
                       verbose = True, #Changes the verbosity of the model, Set to True if you want to track the stages of the model.
                       nr_topics = None,
                       low_memory = True,
                       top_n_words = 10) #If this is set to None, no reduction is applied. Use "auto" to automatically reduce topics that have a similarity of at least 0.9, do not maps all others.

In [19]:
#topics: topic_id for each doc. (#docs, 1)
#probabilities: for each doc, prob of each topic. assign the doc to the topic with the highest prob (#docs, #topics)
topics_dge, probabilities_dge = topic_model_dge.fit_transform(dfs_overtime.body.tolist())

9600it [10:17, 15.54it/s]


### 9.2. Save and load model

- using flair.embeddings.DocumentPoolEmbeddings with Glove
- using the parameters above for umap, sdbscan and CV

In [None]:
### save the default model
topic_model_dge.save("default_model_dge_body", save_embedding_model=True)
!cp -r default_model_dge_body "gdrive/My Drive/nlp-tm-proj/PolUSA/saved-models-and-embeddings/"

In [None]:
### load the default model
'''
topic_model_dge = BERTopic.load("gdrive/My Drive/nlp-tm-proj/PolUSA/saved-models-and-embeddings/default_model_dge_body") 
'''

In [13]:
topic_model_dge.get_params()

{'calculate_probabilities': False,
 'embedding_model': <bertopic.backend._flair.FlairBackend at 0x7f6f9f111f50>,
 'hdbscan_model': HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
         approx_min_span_tree=True, cluster_selection_epsilon=0.0,
         cluster_selection_method='eom', core_dist_n_jobs=4,
         gen_min_span_tree=False, leaf_size=40,
         match_reference_implementation=False, memory=Memory(location=None),
         metric='euclidean', min_cluster_size=10, min_samples=None, p=None,
         prediction_data=True),
 'language': None,
 'low_memory': True,
 'min_topic_size': 10,
 'n_gram_range': (1, 1),
 'nr_topics': None,
 'top_n_words': 10,
 'umap_model': UMAP(a=None, angular_rp_forest=True, b=None, dens_frac=0.0, dens_lambda=0.0,
      dens_var_shift=0.1, densmap=False, disconnection_distance=None,
      force_approximation_algorithm=False, init='spectral', learning_rate=1.0,
      local_connectivity=1.0, low_memory=True, metric='cosine', metric_kwd

### 9.3 Visualize

In [None]:
topic_freq = topic_model_dge.get_topic_freq()
outliers = topic_freq['Count'][topic_freq['Topic']==-1].iloc[0]
print(f"{outliers} documents have not been classified")
print(f"The other {topic_freq['Count'].sum() - outliers} documents are {topic_freq['Topic'].shape[0]-1} topics")

3416 documents have not been classified
The other 6184 documents are 146 topics


### 9.4 Get topics and the number of docs within each topic

In [None]:
topic_freq[:10]

Unnamed: 0,Topic,Count
0,-1,3416
1,70,497
2,89,231
3,132,203
4,22,193
5,82,181
6,94,153
7,123,152
8,51,122
9,145,115


In [None]:
print(f"There are {topic_freq['Count'].iloc[1]} documents that are talking about topic ID {topic_freq['Topic'].iloc[1]}")

There are 364 documents that are talking about topic ID 172


### 9.5 Get keywords and their c-TF-IDF scores for a certain topic

- Return top n words for a specific topic and their c-TF-IDF scores

In [None]:
# keywords and their probs in Topic ID 125
topic_model_dge.get_topic(topic_freq['Topic'].iloc[1])

[('police', 0.014258544798585434),
 ('officers', 0.00725119088966222),
 ('officer', 0.006294647132811852),
 ('man', 0.0060984223068627835),
 ('police said', 0.005742315893106336),
 ('sheriff', 0.005575654805635071),
 ('virginia beach', 0.0041118563461501785),
 ('authorities', 0.0038453715009101365),
 ('incident', 0.003716983054251137),
 ('woman', 0.00356650228354664)]

### 9.6 Get information about each topic including its id, frequency, and name



In [None]:
topic_model_dge.get_topic_info()[:30]

Unnamed: 0,Topic,Count,Name
0,-1,3416,-1_president_people_state_government
1,70,497,70_police_officers_officer_man
2,89,231,89_npr_think_martin_shapiro
3,132,203,132_biden_democrats_republican_presidential
4,22,193,22_korea_north korea_korean_south korea
5,82,181,82_city_village_county_park
6,94,153,94_hiv_patients_drug_disease
7,123,152,123_like_just_people_know
8,51,122,51_syria_syrian_mosul_iraqi
9,145,115,145_http_politi_http bit_http politi


### 9.7 Find topics most similar to a search_term

- Creates an embedding for search_term and compares that with the topic embeddings. The most similar topics are returned along with their similarity values.

- The search_term can be of any size but since it compares with the topic representation it is advised to keep it below 5 words.

- This method can only be used if you did not use custom embeddings.

In [None]:
topic_model_dge.find_topics("brexit", top_n=5) 

([101, 102, 53, 139, 37],
 [0.8479776562826695,
  0.7399642294158476,
  0.5369695236072096,
  0.5332558061779068,
  0.5301442807960397])

### 9.8 Visualize topics

In [14]:
topic_model_dge.visualize_topics()

In [None]:
# need to set: calculate_probabilities = True for visualize_distribution
topic_model.visualize_distribution(probabilities[0])

### 9.9. Topics per class

- see how certain topics are represented over documents with certain political leaning (center, left, right, undefined). 


In [None]:
# set verbose = true when fitting the model. 
# topics_per_class() doesn't work with the model after .reduce_topics()
topics_per_class = topic_model_dge.topics_per_class(dfs.body.tolist(), topics_dge, classes=dfs.political_leaning.tolist())

4it [01:03, 15.88s/it]


In [None]:
topics_per_class.sort_values(by = 'Topic')

Unnamed: 0,Topic,Words,Frequency,Class
0,-1,"president, house, state, government, office",764,UNDEFINED
245,-1,"people, president, government, state, country",1343,LEFT
386,-1,"trump, president, state, government, donald",948,CENTER
132,-1,"president, news, american, breitbart, national",361,RIGHT
387,0,"company coverage, source text, text eikon, eik...",15,CENTER
...,...,...,...,...
384,144,"early stories, early headlines, founder time, ...",14,LEFT
244,145,"fox, fox news, news, president trump, news run...",13,RIGHT
385,145,"npr, july 31, august 2019, pic twitter, david",78,LEFT
131,145,"http, politi, http bit, http politi, playbook ...",18,UNDEFINED


In [None]:
topic_model_dge.visualize_topics_per_class(topics_per_class, top_n=10)

### 9.10 Dynamic Topic Modeling

In [20]:
topics_over_time_dge = topic_model_dge.topics_over_time(dfs_overtime.body.tolist(), topics_dge, dfs_overtime.date_publish.tolist(),nr_bins=20, global_tuning=True, evolution_tuning=True)

20it [14:40, 44.04s/it]


In [28]:
topics_over_time_dge[topics_over_time_dge.Topic ==133].sort_values(by = 'Timestamp')

Unnamed: 0,Topic,Words,Frequency,Timestamp,Name
84,133,"schumer, democrats, voters, senator, biden",3,2016-12-31 01:23:14.163000064,133_biden_democrats_republican_president...
176,133,"sanders, democrats, election, clinton, bernie",5,2017-02-17 02:38:11.849999872,133_biden_democrats_republican_president...
240,133,"republican, gubernatorial, democrats, election...",5,2017-04-05 05:16:23.700000000,133_biden_democrats_republican_president...
322,133,"biden, democratic, democrats, clinton, vice pr...",6,2017-05-22 07:54:35.550000128,133_biden_democrats_republican_president...
387,133,"republicans, republican, democratic, gop, biden",9,2017-07-08 10:32:47.400000000,133_biden_democrats_republican_president...
482,133,"democratic, democrats, democratic party, socia...",5,2017-08-24 13:10:59.249999872,133_biden_democrats_republican_president...
546,133,"gillespie, virginia, ed gillespie, democrats, ...",4,2017-10-10 15:49:11.100000000,133_biden_democrats_republican_president...
633,133,"democratic, biden, democrats, republican, senate",5,2017-11-26 18:27:22.950000128,133_biden_democrats_republican_president...
708,133,"joe voted, republicans, republican, democrats,...",2,2018-01-12 21:05:34.800000000,133_biden_democrats_republican_president...
787,133,"mcdaniel, roger wicker, republican, chris mcda...",9,2018-02-28 23:43:46.649999872,133_biden_democrats_republican_president...


In [24]:
topic_model_dge.visualize_topics_over_time(topics_over_time_dge, top_n=10)

## 10. Fit a bertopic model with longformer embeddings

In [60]:
print(f'99% of the bodies are no more than {dfs.body_len.quantile(q = 0.99)} tokens.\n1% ({round(len(dfs)*0.01)} bodies) are more than {dfs.body_len.quantile(q = 0.99)}.\nWe will try Bertopic with longformer embeddings (max sequence length 4096)')
#round(len(dfs)*0.01)

99% of the bodies are no more than 2785.0 tokens.
1% (8984 bodies) are more than 2785.0.
We will try Bertopic with longformer embeddings (max sequence length 4096)


In [42]:
from flair.embeddings import TransformerDocumentEmbeddings
longformer_embedding = TransformerDocumentEmbeddings('allenai/longformer-base-4096')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=694.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=597257159.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [48]:
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words= 'english')

topic_model_lfe = BERTopic(embedding_model=longformer_embedding,
                       umap_model=umap_model, #non-linear dimension reduction
                       hdbscan_model=hdbscan_model, 
                       vectorizer_model=vectorizer_model, # CV is the base for c-tfidf
                       calculate_probabilities = False, #Whether to calculate the topic probabilities. This could slow down the extraction of topics if you have many documents (> 100_000).
                       verbose = True, #Changes the verbosity of the model, Set to True if you want to track the stages of the model.
                       nr_topics = None,
                       low_memory = True,
                       top_n_words = 10) #If this is set to None, no reduction is applied. Use "auto" to automatically reduce topics that have a similarity of at least 0.9, do not maps all others.

In [49]:
topics_lfe, _ = topic_model_lfe.fit_transform(dfs_overtime.body.tolist())

9600it [15:14, 10.50it/s]
2021-05-23 05:22:03,126 - BERTopic - Transformed documents to Embeddings
2021-05-23 05:22:21,307 - BERTopic - Reduced dimensionality with UMAP
2021-05-23 05:22:21,920 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [50]:
### save the default model
topic_model_lfe.save("default_model_lfe_body", save_embedding_model=True)
!cp -r default_model_lfe_body "gdrive/My Drive/nlp-tm-proj/PolUSA/saved-models-and-embeddings/"

In [61]:
topic_model_lfe = BERTopic.load("gdrive/My Drive/nlp-tm-proj/PolUSA/saved-models-and-embeddings/default_model_lfe_body") 

In [55]:
topic_freq = topic_model_lfe.get_topic_freq()
outliers = topic_freq['Count'][topic_freq['Topic']==-1].iloc[0]
print(f"{outliers} documents have not been classified")
print(f"The other {topic_freq['Count'].sum() - outliers} documents are {topic_freq['Topic'].shape[0]-1} topics")

3364 documents have not been classified
The other 6236 documents are 105 topics


In [52]:
topic_model_lfe.get_topic_info()[:30]

Unnamed: 0,Topic,Count,Name
0,-1,3364,-1_president_people_year_government
1,22,965,22_people_year_said_according
2,26,705,26_npr_audio available_later today_npr thanks
3,25,295,25_image caption_mr_people_says
4,70,255,70_says_minister_al_saturday
5,32,194,32_percent_index_growth_year
6,96,191,96_tax_senate_republicans_federal
7,29,191,29_district_chicago_city_county
8,28,169,28_county_police_according_chicago
9,84,131,84_police_officers_virginia beach_officer


In [53]:
topic_model_lfe.find_topics("brexit", top_n=5) 

([92, 31, 33, 50, 65],
 [0.9986931429432713,
  0.9984306303634206,
  0.9984165854196112,
  0.9983861012038544,
  0.9983833200825])

In [54]:
topic_model_lfe.visualize_topics()