# **BERTopic - Tutorial**

In [6]:
!pip install bertopic



# **Imports**

In [2]:
import numpy as np
import pandas as pd
from copy import deepcopy
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# **Load data**

In [3]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [4]:
# Google Drive sharing link of the tweets CSV; the next cell takes the file id
# from the second-to-last "/"-separated segment of this URL.
link = 'https://drive.google.com/file/d/1Ys6MhCs7IcQNin6uaqQDPGcRCg0ZU2pX/view?usp=sharing'

In [5]:
# The Drive file id is the second-to-last path segment of the sharing link
# (".../d/<file id>/view?usp=sharing").
# It is stored as `file_id` so the built-in `id()` is not shadowed for the
# rest of the notebook session.
file_id = link.split("/")[-2]

# Fetch the Drive file into the local working directory as 'Tweets_Feb.csv'.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('Tweets_Feb.csv')

# Load the downloaded CSV into a DataFrame and print it.
df = pd.read_csv('Tweets_Feb.csv')
print(df)

                          id  \
0        1499172700924911621   
1        1499172692599181314   
2        1499172692158779392   
3        1499172692007735298   
4        1499172691026186246   
...                      ...   
1975542  1497109112966070277   
1975543  1497109112768938000   
1975544  1497109111389011977   
1975545  1497109111183491076   
1975546  1497109111124770858   

                                                      text  \
0        First Ukraine City Falls as Russia Strikes Mor...   
1        @JeanCASTEX STOP RUSSIAN AGGRESSION AGAINST #U...   
2        @JustinTrudeau @CyrilRamaphosa So the Donbass ...   
3        Ukraine's St. Javelin vs Russia's Improvised A...   
4        Russia, time to stand the fuck down! Prayers f...   
...                                                    ...   
1975542  @verdiKat @jelo2510 @TomTugendhat If you send ...   
1975543  Russian state media published pictures of Khan...   
1975544  @armano Signs/strat of war going on thru pande..

In [6]:
# Preview the first three rows to check which columns were loaded.
df.head(3)

Unnamed: 0,id,text,created_at,author_id,in_reply_to_user_id,lang,author,entities,referenced_tweets,media,geo
0,1499172700924911621,First Ukraine City Falls as Russia Strikes Mor...,2022-03-02T23:59:59.000Z,1001070136671985664,,,,,,,
1,1499172692599181314,@JeanCASTEX STOP RUSSIAN AGGRESSION AGAINST #U...,2022-03-02T23:59:57.000Z,1498700335191318533,,,,,,,
2,1499172692158779392,@JustinTrudeau @CyrilRamaphosa So the Donbass ...,2022-03-02T23:59:57.000Z,1011752751154413568,,,,,,,


In [9]:
# Draw a random 50% sample of the rows; random_state=1 seeds the sampling.
sample_df = df.sample(frac=0.5,random_state=1)

In [10]:
# Collect the "text" column of the sample as a plain Python list of documents.
docs = list(sample_df["text"].values)

In [11]:
# Show the first five documents as a quick check of the raw tweet text.
docs[:5]

['Footage of outgoing Russian MLRS rocket fire reportedly from #kherson region, nova kakhovka, firing west towards #Ukrainians \n\n#RussianArmy #Russia Nigerians In Russia #Ukraine #worldwar3 https://t.co/dwyqdhlDxt',
 'This is where it gets really messy. I bet Ukraine will make Russia pay for every inch here. Good luck to them https://t.co/ifmzcV8gA3',
 '😱😱😱\n\nBelarus preparing to join Russian invasion of Ukraine, U.S. official says\n\nhttps://t.co/xMdbld9Mhd',
 "@7rashing @BioEd2 @earlbrowncarguy @KyivIndependent The rules are clear, you don't execute surrendering soldiers. The point of this is so the other side also honour this, what do you think Russia will do when Ukraine starts killing POWs?",
 "@CyberEagle1989 I mean, the issue here is that it's partially happening because Putin still resents Ukraine's independence from Russia so this is part of the actual war effort."]

# **Creating Topics**

In [12]:
# Vectorizer used for the topic representations: unigrams and bigrams,
# English stop words removed, minimum document frequency of 10.
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=10,
)

# Topic model built around the custom vectorizer, with low-memory mode on,
# probability calculation off and verbose progress logging on.
model = BERTopic(
    language="english",
    vectorizer_model=vectorizer_model,
    low_memory=True,
    calculate_probabilities=False,
    verbose=True,
)

In [13]:
# Fit the topic model on the sampled documents and keep the two returned values.
# This is the expensive cell: the saved log below spans roughly 11.5 hours between
# the embedding step and the end of clustering.
topics, probs = model.fit_transform(docs)

Batches:   0%|          | 0/30868 [00:00<?, ?it/s]

2022-05-15 19:45:49,335 - BERTopic - Transformed documents to Embeddings
2022-05-16 07:10:08,200 - BERTopic - Reduced dimensionality
2022-05-16 07:16:34,163 - BERTopic - Clustered reduced embeddings


In [14]:
# Save the fitted model to disk under the name 'first_model_Feb'.
model.save('first_model_Feb')

  self._set_arrayXarray(i, j, x)


In [15]:
# Reload the saved model from 'first_model_Feb' into `my_model`.
# NOTE(review): the later cells visible in this notebook keep calling `model`,
# not `my_model` — confirm which object is meant to be used from here on.
my_model = BERTopic.load("first_model_Feb")

# We can then extract the most frequent topics:

In [16]:
# Table of topics and their document counts. In the saved output the largest
# group is topic -1 (presumably BERTopic's outlier label — confirm against the docs).
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,439145
1,0,25357
2,1,4038
3,2,3700
4,3,3193
...,...,...
10783,10905,10
10784,10906,10
10785,10907,10
10786,10908,10


# Get Individual Topics

In [17]:
# Terms and scores for topic 0; the saved output is dominated by NATO-membership terms.
model.get_topic(0)

[('join nato', 0.001382748339725985),
 ('ukraine join', 0.0010475878655331978),
 ('joining nato', 0.001020489188732399),
 ('ukraine joining', 0.0009942989718066983),
 ('nato nato', 0.0009273557487067001),
 ('member nato', 0.0008562134336251787),
 ('nato russia', 0.000851835354686734),
 ('nato member', 0.0008279579166498719),
 ('joining', 0.0008276801119682078),
 ('joined nato', 0.0008053112832194102)]

In [18]:
# Terms and scores for topic 2; the saved output centres on racism-related terms.
model.get_topic(2)

[('africans', 0.007907703312249116),
 ('racism', 0.0070654493992159985),
 ('racist', 0.00674199031360143),
 ('african', 0.005703427979331217),
 ('black people', 0.005295111291552287),
 ('africa', 0.0043395492210143586),
 ('black', 0.003800183619496223),
 ('blacks', 0.00337581771636592),
 ('white people', 0.003314410053968756),
 ('ukraine racist', 0.0030622110099799526)]

In [19]:
# Terms and scores for topic 14; the saved output centres on India-related terms.
model.get_topic(14)

[('india', 0.004833746408636559),
 ('india support', 0.004819420145583724),
 ('india russia', 0.0035373142765239922),
 ('supported india', 0.0031985264519276196),
 ('ukraine india', 0.003067690518528861),
 ('india india', 0.0030669027376819804),
 ('russia india', 0.0028911838398051545),
 ('indias', 0.0026327137168601087),
 ('india supporting', 0.0024976376254268505),
 ('zelenskyyua narendramodi', 0.0024276963958069844)]

# **Visualize Topics**

In [20]:
# Visualize the topics, limited to top_n_topics=20.
model.visualize_topics(top_n_topics=20)

In [23]:
# Bar charts of the terms for top_n_topics=20.
# NOTE(review): this cell's execution count (23) is higher than that of the
# reduce_topics cell further down (22), so it was last run after the reduction.
# Re-run the notebook top to bottom to confirm which model state this chart shows.
model.visualize_barchart(top_n_topics=20)

In [21]:
# Put the result of model.get_topics() into a DataFrame and write it to 'data.csv'.
dftopic = pd.DataFrame(model.get_topics())
dftopic.to_csv('data.csv')

In [22]:
# Reduce the fitted topics with nr_topics=20; the saved log reports going from
# 10912 to 21 topics. The cells after this one reuse the same `model` object.
# NOTE(review): `topics` is passed positionally here — check this call against the
# reduce_topics signature of the installed BERTopic version before re-running.
new_topics, new_probs = model.reduce_topics(docs, topics, nr_topics=20)

2022-05-16 08:18:23,308 - BERTopic - Reduced number of topics from 10912 to 21


In [24]:
# Bar charts of the terms for top_n_topics=20, run after the topic reduction above.
model.visualize_barchart(top_n_topics=20)

In [25]:
# Visualize the topics of the model, this time without a top_n_topics limit.
model.visualize_topics()

In [26]:
# Show the hierarchy view of the model's topics.
model.visualize_hierarchy()