## 0. install and load packages

In [1]:
from google.colab import drive
# Mount Google Drive at /content/gdrive. Later cells read and write files under the relative
# path 'gdrive/My Drive/...', which presumably relies on the working directory being /content.
# (Author's hint: force_remount=True can be passed as a second argument if needed.)
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install bertopic #[all]
!pip install sentence_transformers
!pip install umap-learn
!pip install hdbscan

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
import numpy as np
import pandas as pd
import pickle
from bertopic import BERTopic


## 1. load and combine datasets

In [None]:
# Names (without extension) of the CSV shards that together make up the corpus.
dataset_list = [
    'df_v3_1_200000',
    'df_v3_200000_400000',
    'df_v3_400000_600000',
    'df_v3_600000_800000',
    'df_v3_800000_850000',
    'df_v3_850000_905573',
]
# Read every shard from Drive, stack them, and renumber the rows 0..N-1 so the index is
# unique across shards.
dfs = pd.concat(
    [
        pd.read_csv('gdrive/My Drive/nlp-tm-proj/PolUSA/PolUSA-data/{}.csv'.format(shard_name))
        for shard_name in dataset_list
    ]
).reset_index(drop=True)

  if self.run_code(code, result):


In [None]:
# Column names, non-null counts and dtypes of the combined frame.
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905573 entries, 0 to 905572
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         905573 non-null  object 
 1   date_publish               905572 non-null  object 
 2   outlet                     905573 non-null  object 
 3   headline                   905547 non-null  object 
 4   lead                       840629 non-null  object 
 5   body                       905572 non-null  object 
 6   authors                    732635 non-null  object 
 7   domain                     905571 non-null  object 
 8   url                        905571 non-null  object 
 9   political_leaning          905571 non-null  object 
 10  headline_len               905545 non-null  float64
 11  body_len                   905571 non-null  float64
 12  headline_lang              905547 non-null  object 
 13  body_lang                  90

In [None]:
# Keep only rows whose headline is (1) flagged as English by the non-strict check,
# (2) flagged as meaningful, and (3) not null.
dfs = dfs.loc[
    (dfs['if_eng_headline_nonstrict'] == 1)
    & (dfs['if_meaningful_headline'] == 1)
    # NaN never equals itself, so this self-comparison is False exactly for missing headlines.
    & (dfs['headline'] == dfs['headline'])
]

In [None]:
# Re-inspect the frame after filtering; the headline column should now have no missing values.
dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 890940 entries, 0 to 905572
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         890940 non-null  object 
 1   date_publish               890940 non-null  object 
 2   outlet                     890940 non-null  object 
 3   headline                   890940 non-null  object 
 4   lead                       826858 non-null  object 
 5   body                       890939 non-null  object 
 6   authors                    722692 non-null  object 
 7   domain                     890939 non-null  object 
 8   url                        890939 non-null  object 
 9   political_leaning          890939 non-null  object 
 10  headline_len               890939 non-null  float64
 11  body_len                   890939 non-null  float64
 12  headline_lang              890940 non-null  object 
 13  body_lang                  89

In [None]:
# Row count after dropping exact duplicate rows; if it equals the row count reported by
# dfs.info() above, every row is unique.
dfs.drop_duplicates().shape[0]

890940

In [None]:
### use only first 10000 rows for testing ###
# .copy() turns the sample into an independent frame. Without it dfs is a slice of the
# filtered frame, and the column assignments made later (dfs['topic_id'] = ...) raise
# SettingWithCopyWarning and may not write where expected.
dfs = dfs.iloc[:10000].copy()

## 2. Fit BERTopic model for headlines

- prepare custom embedding, UMAP, HDBSCAN and vectorizer models with the parameters below
- use BERTopic to wrap them
- fit a Topic model

In [None]:
# Prepare custom models
# Sentence-BERT encoder used to embed each headline.
embedding_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# UMAP is stochastic: without a fixed random_state every fit yields different clusters, so
# topic ids and counts change between runs. Fixing the seed makes the notebook reproducible.
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
# Density-based clustering of the reduced embeddings; prediction_data=True is kept from the
# original configuration.
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Unigrams and bigrams with English stop words removed are the candidate topic keywords.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

HBox(children=(FloatProgress(value=0.0, max=244715968.0), HTML(value='')))




In [None]:
# Pass the custom models to BERTopic
topic_model = BERTopic(embedding_model = embedding_model,
                       umap_model=umap_model, 
                       hdbscan_model=hdbscan_model, 
                       vectorizer_model=vectorizer_model,
                       # Whether to calculate the topic probabilities. This can slow down topic
                       # extraction when there are many documents (> 100_000).
                       calculate_probabilities = False,
                       # Verbosity of the model; set to True to track the stages of the model.
                       verbose = False,
                       # None means no topic reduction is applied. "auto" automatically merges
                       # topics whose similarity is at least 0.9 and leaves all others unmapped.
                       nr_topics = None)

In [None]:
# Fit the topic model on the raw headlines; embeddings are computed by the model itself.
# (An earlier variant passed precomputed embeddings as a second argument:
#  topic_model.fit_transform(headlines[0:10000], embeddings[0:10000]).)
topics, probabilities = topic_model.fit_transform(dfs['headline'].tolist())


## 3. Save and load the default TM model for headline

The default model for headline:
- using embeddings by Sbert: distilbert-base-nli-stsb-mean-tokens
- using the parameters above for UMAP, HDBSCAN and CountVectorizer

In [None]:
### save the default model
# Write the fitted model, together with its embedding model, to the working directory under
# the name "default_model_headline" ...
topic_model.save("default_model_headline", save_embedding_model=True)
# ... then copy it into the project's saved-models folder on the mounted Drive.
!cp -r default_model_headline "gdrive/My Drive/nlp-tm-proj/PolUSA/saved-models-and-embeddings/"

In [None]:
### load the default model
# Rebinds topic_model to the copy stored on Drive by the save cell above.
# NOTE(review): `topics` and `probabilities` are not restored by this cell; they still come
# from whichever fit_transform ran last in this session.
topic_model = BERTopic.load("gdrive/My Drive/nlp-tm-proj/PolUSA/saved-models-and-embeddings/default_model_headline") #load saved model

## 4. Visualize the default model

In [None]:
# One row per topic with its document count; topic -1 collects the unclassified documents.
topic_freq = topic_model.get_topic_freq()
is_outlier = topic_freq['Topic'] == -1
# Summing the selection gives 0 when there is no -1 row; the previous .iloc[0] raised
# IndexError in that case.
outliers = topic_freq.loc[is_outlier, 'Count'].sum()
# Count the real topics directly instead of assuming exactly one outlier row exists.
n_topics = int((~is_outlier).sum())
print(f"{outliers} documents have not been classified")
print(f"The other {topic_freq['Count'].sum() - outliers} documents are {n_topics} topics")

4264 documents have not been classified
The other 5736 documents are 182 topics


### 4.1 Get topics and the number of docs within each topic

In [None]:
# First ten rows of the topic frequency table.
topic_freq.head(10)

Unnamed: 0,Topic,Count
0,-1,4264
1,136,203
2,125,186
3,138,149
4,179,141
5,129,137
6,2,131
7,77,120
8,141,108
9,172,96


In [None]:
# Report the first real topic in the table. Filtering out topic -1 explicitly avoids relying
# on the outlier row always being in position 0 (the previous .iloc[1] silently reported the
# wrong topic whenever it was not).
top_topic = topic_freq[topic_freq['Topic'] != -1].iloc[0]
print(f"There are {top_topic['Count']} documents that are talking about topic ID {top_topic['Topic']}")

There are 203 documents that are talking about topic ID 136


### 4.2 Get keywords and their c-TF-IDF scores for a certain topic

- Return top n words for a specific topic and their c-TF-IDF scores

In [None]:
# Keywords and their c-TF-IDF scores for the topic in the second row of topic_freq
# (the row right after topic -1 in the table above; the concrete topic id varies between runs).
topic_model.get_topic(topic_freq['Topic'].iloc[1])

[('brexit', 0.056884530441472535),
 ('uk', 0.031217653400060358),
 ('eu', 0.02845105942897282),
 ('ivan rogers', 0.017181544677012762),
 ('britain', 0.014501503323523691),
 ('northern ireland', 0.010997578293958976),
 ('hard brexit', 0.01071434873380965),
 ('ambassador eu', 0.010019450828152438),
 ('eu envoy', 0.008662702021503344),
 ('england', 0.008590772338506381)]

### 4.3 Get information about each topic including its id, frequency, and name



In [None]:
# Table with one row per topic: its id (Topic), document count (Count) and generated Name.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,4264,-1_house_russia_donald trump_donald
1,136,203,136_brexit_uk_eu_ivan rogers
2,125,186,125_mln brief_pct stake_sell_shares
3,138,149,138_repeal_obamacare repeal_repeal obamacare_r...
4,179,141,179_hacking_russian hacking_russia hacking_hacks
...,...,...,...
170,124,11,124_overhaul_considers overhaul_strategy remod...
169,51,11,51_president farewell_emotional president_fare...
168,147,11,147_unemployment_wage_inequality primary_worki...
181,117,10,117_fear safety_officers fear_safety pew_reluc...


### 4.4 Find topics most similar to a search_term

- Creates an embedding for search_term and compares that with the topic embeddings. The most similar topics are returned along with their similarity values.

- The search_term can be of any size but since it compares with the topic representation it is advised to keep it below 5 words.

- This method can only be used if you did not use custom embeddings.

In [None]:
# Ask the model for the 5 topics most similar to the query "brexit"; the result is a pair
# (topic ids, similarity scores), as shown in the output below.
topic_model.find_topics("brexit", top_n=5) 

([127, 126, 96, 64, 18],
 [0.7487480013966736,
  0.48288976267228834,
  0.41055808881343714,
  0.40183953772843384,
  0.39185822375897406])

### 4.5 Visualize topics

In [None]:
# Render BERTopic's built-in topic visualization for the current model.
topic_model.visualize_topics()

In [None]:
# visualize_distribution needs the per-topic probability vector of a single document, which
# is only available when the model was built with calculate_probabilities=True. With the
# configuration used in this notebook (False) `probabilities` is None or one-dimensional and
# probabilities[0] is not a usable vector, so guard the call instead of letting the cell fail.
if probabilities is not None and np.ndim(probabilities) == 2:
    display(topic_model.visualize_distribution(probabilities[0]))
else:
    print("Skipping visualize_distribution: re-fit the model with calculate_probabilities=True to get per-topic probabilities.")

## 5. Update dataset:

- Add columns obtained from topic_model to dfs
- columns added: 'topic_id', 'top_10_keywords_with_score', 'top_10_keywords', 'Topic','Count', 'Name'

In [None]:
# Fit the model on the headlines again to obtain one topic id per row of dfs.
# NOTE(review): this re-fits instead of reusing the earlier assignments, so topic ids can
# differ from the ones displayed in section 4 (the brexit topic is 136 in the tables above
# but 127 in the sample output below). Confirm that re-fitting is intended.
topics, probabilities = topic_model.fit_transform(dfs.headline.tolist())

In [None]:
# One topic id per headline, in the same order as the rows of dfs.
dfs['topic_id'] = topics
# Look up the (keyword, score) list once per distinct topic and map it onto the rows.
# The previous row-wise DataFrame.apply built a full row object and called get_topic()
# for every single document, although only a few hundred distinct topics exist.
keywords_by_topic = {topic_id: topic_model.get_topic(topic_id) for topic_id in set(topics)}
dfs['top_10_keywords_with_score'] = dfs['topic_id'].map(keywords_by_topic)

In [None]:
def extract_keywords(df, kw_score_column, kw_column):
    """Add a column holding only the keywords of each row's (keyword, score) pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; it is modified in place and also returned.
    kw_score_column : str
        Name of the column whose cells are sequences of (keyword, score) pairs.
    kw_column : str
        Name of the column to create (or overwrite) with the list of keywords.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now containing ``kw_column``.
    """
    # Assign the whole column at once. The previous per-cell chained assignment
    # (df[kw_column][i] = ...) wrote through an intermediate Series, which raises
    # SettingWithCopyWarning, has no effect under pandas copy-on-write, and breaks
    # when index labels are duplicated.
    df[kw_column] = df[kw_score_column].map(lambda pairs: [pair[0] for pair in pairs])
    return df

In [None]:
# Add the keyword-only column, then left-join each row with its topic's Topic/Count/Name
# metadata so that every document row is kept.
dfs = extract_keywords(dfs, 'top_10_keywords_with_score', 'top_10_keywords').merge(
    topic_model.get_topic_info(),
    how='left',
    left_on='topic_id',
    right_on='Topic',
)

In [None]:
# Confirm that the topic columns were added (27 columns expected, per the output below).
dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          10000 non-null  object 
 1   date_publish                10000 non-null  object 
 2   outlet                      10000 non-null  object 
 3   headline                    10000 non-null  object 
 4   lead                        9032 non-null   object 
 5   body                        10000 non-null  object 
 6   authors                     8674 non-null   object 
 7   domain                      10000 non-null  object 
 8   url                         10000 non-null  object 
 9   political_leaning           10000 non-null  object 
 10  headline_len                10000 non-null  float64
 11  body_len                    10000 non-null  float64
 12  headline_lang               10000 non-null  object 
 13  body_lang                   1000

In [None]:
# Spot-check three random rows; no seed is set, so a different sample appears on every run.
dfs.sample(3)

Unnamed: 0,id,date_publish,outlet,headline,lead,body,authors,domain,url,political_leaning,headline_len,body_len,headline_lang,body_lang,if_eng_body,if_eng_headline,if_eng_headline_nonstrict,preprocessed_headline,preprocessed_body,if_meaningful_headline,if_meaningful_body,topic_id,top_10_keywords_with_score,top_10_keywords,Topic,Count,Name
6723,3972022,2017-01-09 09:32:22,HuffPost,Twitter Predicts Donald Trump's Response To Me...,'What a loser. No way she could've played Chac...,Meryl Streep’s poised and headline-making take...,Rebecca Shapiro;Senior Editor;The Huffington Post,www.huffingtonpost.com,http://www.huffingtonpost.com/entry/yuge-disap...,LEFT,11.0,686.0,[en:0.9999963968838057],[en:0.9999953485212745],1,1,1,Twitter Predicts Donald Trump's Response To Me...,Meryl Streep’s poised and headline-making take...,1,1,-1,"[(donald trump, 0.0027125665680152585), (donal...","[donald trump, donald, russia, obama, 2016, wh...",-1,4067,-1_donald trump_donald_russia_obama
9320,38929560,2017-01-11 12:47:54,The Guardian,"Britain considering £1,000-a-year levy for ski...",Immigration minister tells peers post-Brexit l...,The government is seriously considering imposi...,Alan Travis,www.theguardian.com,https://www.theguardian.com/uk-news/2017/jan/1...,LEFT,8.0,710.0,[en:0.999995655419109],[en:0.9999969979130261],1,1,1,"Britain considering £1,000-a-year levy for ski...",The government is seriously considering imposi...,1,1,127,"[(brexit, 0.05830464787916251), (uk, 0.0340102...","[brexit, uk, eu, ivan rogers, britain, ambassa...",127,209,127_brexit_uk_eu_ivan rogers
8636,59612426,2017-01-11 00:00:00,Chicago Tribune,Compromising material appears frequently in Ru...,,Blurry video of highly placed men engaging in ...,Tribune News Services,www.chicagotribune.com,http://www.chicagotribune.com/news/nationworld...,UNDEFINED,7.0,642.0,[en:0.9999956406056567],[en:0.9999973371784068],1,1,1,Compromising material appears frequently in Ru...,Blurry video of highly placed men engaging in ...,1,1,163,"[(hacking, 0.059057184187211224), (russian hac...","[hacking, russian hacking, russia hacking, hac...",163,156,163_hacking_russian hacking_russia hacking_hacks


## 6. Update default model: reduce the number of topics to 30

In [None]:
# Reduce the fitted model to 30 topics, passing the headlines and the topic assignments and
# probabilities from the last fit.
# NOTE(review): new_topics/new_probs are not written back to dfs in the visible cells, so the
# topic columns added in section 5 presumably still describe the unreduced model — confirm.
new_topics, new_probs = topic_model.reduce_topics(dfs.headline.tolist(), topics, probabilities, nr_topics=30)

In [None]:
# First ten rows of the topic table after the reduction step.
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name
0,-1,6109,-1_trump_donald_donald trump_obama
1,164,271,164_hacking_russian hacking_trump_russia hacking
2,114,205,114_facebook_facebook live_hate crime_torture ...
3,124,199,124_china_taiwan_chinese_apple
4,131,196,131_repeal_obamacare repeal_replace_repeal oba...
5,112,187,112_orlando_killed_mexico_orlando police
6,56,184,56_nightclub_istanbul_istanbul nightclub_night...
7,123,179,123_mln brief_holdings_pct stake_shares
8,85,177,85_farewell_farewell address_obama farewell_fa...
9,117,158,117_brexit_uk_eu_ivan rogers


In [None]:
# Render the topic visualization again, now for the model after the reduction step.
topic_model.visualize_topics()