# **BERTopic - Tutorial**

In [83]:
!pip install bertopic[visualization] --quiet



# **Imports**

In [84]:
import numpy as np
import pandas as pd
from copy import deepcopy
from bertopic import BERTopic
import re

# **Load data**

In [85]:
#@title Default title text
df = pd.read_csv("News.csv")

In [86]:
df.head(10)

Unnamed: 0,news_id,title,content,link,date
0,1,Do chatbots really help you stay productive?,GUEST: When Slack burst onto the workplace sce...,http://venturebeat.com/?p=2141494,1/1/2017
1,2,Spanish social advertising company Adsmurai ra...,Barcelona-based social advertising company Ads...,http://venturebeat.com/?p=2141069,1/1/2017
2,3,HTC: No Vive 2 at CES,I\u2019d wager most people who bought the HTC ...,http://venturebeat.com/?p=2141559,1/1/2017
3,4,Chinese firms reportedly ordered to pay Disney...,(Reuters) &#8212;\xa0A Shanghai court ordered ...,http://venturebeat.com/?p=2141698,1/1/2017
4,5,AWS sees growth in database migrations,Public cloud market leader Amazon Web Services...,http://venturebeat.com/?p=2141375,1/1/2017
5,6,My New Year’s resolution is to delete Twitter ...,OPINION: I can&#8217;t do this anymore. I don&...,http://venturebeat.com/?p=2140938,1/1/2017
6,7,Google’s AI assistant has 5 New Year’s resolut...,Google Assistant and the smart speaker Google ...,http://venturebeat.com/?p=2141103,1/1/2017
7,8,MacBook Pro review: Lovable despite the short ...,REVIEW: The 2016 MacBook Pro is scandalous. Re...,http://venturebeat.com/?p=2139459,1/1/2017
8,9,5 deep learning startups to follow in 2017,If artificial intelligence (AI) hadn&#8217;t h...,http://venturebeat.com/?p=2135001,1/1/2017
9,10,How a bot can help with airline compensation,GUEST: I was talking to Eric Bahn from 500 Sta...,http://venturebeat.com/?p=2141490,1/1/2017


In [108]:
df['title'].replace("[^a-zA-Z]|[0-9]"," ",regex=True, inplace=True)
df['content'].replace("[^a-zA-Z|[0-9]]"," ",regex=True, inplace=True)


In [109]:
df.head(5)

Unnamed: 0,news_id,title,content,link,date
0,1,dochatbotsreallyhelpyoustayproductive,guest when slack burst onto the workplace scen...,http://venturebeat.com/?p=2141494,1/1/2017
1,2,spanishsocialadvertisingcompanyadsmurairaisesm...,barcelona-based social advertising company ads...,http://venturebeat.com/?p=2141069,1/1/2017
2,3,htcnoviveatces,i\u2019d wager most people who bought the htc ...,http://venturebeat.com/?p=2141559,1/1/2017
3,4,chinesefirmsreportedlyorderedtopaydisneypixarf...,(reuters 8212;\xa0a shanghai court ordered two...,http://venturebeat.com/?p=2141698,1/1/2017
4,5,awsseesgrowthindatabasemigrations,public cloud market leader amazon web services...,http://venturebeat.com/?p=2141375,1/1/2017


In [110]:
docs = list(df.loc[:,'content'].values)

In [101]:
def normalize_text(s):
    s = s.lower()
    
    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W\s',' ',s)
    
    # make sure we didn't introduce any double spaces
    s = re.sub('\s+',' ',s)
    
    return s

df['title'] = [normalize_text(s) for s in df['title']]

In [102]:
df['content'] = [normalize_text(s) for s in df['content']]

In [120]:
docs[:5]

['guest when slack burst onto the workplace scene employees rejoiced finally there was a way to chat with one another without having to send a dreaded email or worse get up and actually go chat with your coworker face-to-face thanks to slack and a handful of other messaging platforms businesses could easily communicate across teams using&#160;[&#8230 n',
 'barcelona-based social advertising company adsmurai has received u20ac4 million 4.2 million in a second round of funding led by venture capital firm axon partners group with participation from banc sabadell through its program bstartup10 and enisa a spanish government-funded financing group launched in 2014 by marc elena otto w\\xfcst and juan antonio robles adsmurai specializes&#160;[&#8230 n',
 'i\\u2019d wager most people who bought the htc vive love the unit but wish a new version would bring key improvements a slimmer design and lighter cord a better fit\\xa0for\\xa0the face and more ergonomic controllers without hard-to-reach g

# **Creating Topics**

In [112]:
model = BERTopic(language="english")

In [113]:
topics, probs = model.fit_transform(docs)

# We can then extract most frequent topics:

In [114]:
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,3357
1,0,434
2,1,360
3,2,277
4,3,190
...,...,...
170,169,10
171,170,10
172,171,10
173,172,10


# Get Individual Topics

In [115]:
model.get_topic(10)

[('blizzard', 0.038769442807338836),
 ('teambased', 0.02267300884521457),
 ('blizzard8217s', 0.016665005468745917),
 ('overwatch8217s', 0.013600867575962454),
 ('moba', 0.01059979434331948),
 ('teams', 0.010375158822480317),
 ('esports', 0.008258677436706493),
 ('multiplayer', 0.007528133220858659),
 ('warcraft', 0.007178007415425032),
 ('team', 0.00645540549216663)]

In [116]:
model.get_topic(2)

[('car', 0.023232622893728377),
 ('cars', 0.020469450231181175),
 ('vehicles', 0.017975223235634673),
 ('tesla', 0.015479603034235463),
 ('vehicle', 0.011036423568695568),
 ('driving', 0.009690165968823761),
 ('automotive', 0.006722753939437723),
 ('automakers', 0.005841756848695148),
 ('drive', 0.005118852013730836),
 ('driverless', 0.004761218211259489)]

In [117]:
model.get_topic(5)

[('silicon', 0.026305914469081194),
 ('startups', 0.010872142055891494),
 ('cities', 0.010841896712322458),
 ('entrepreneurs', 0.009420125276177426),
 ('startup', 0.00836137291793107),
 ('jobs', 0.008060640107504343),
 ('entrepreneurship', 0.006874634824458434),
 ('francisco', 0.006047336659558896),
 ('hubs', 0.005384327320589175),
 ('hub', 0.004540273920665369)]

# **Visualize Topics**

In [118]:
model.visualize_topics()