**Install BERTopic**


In [1]:
!pip install bertopic
#!pip install -U sentence-transformers


Collecting bertopic
  Downloading bertopic-0.10.0-py2.py3-none-any.whl (58 kB)
[?25l[K     |█████▋                          | 10 kB 18.9 MB/s eta 0:00:01[K     |███████████▏                    | 20 kB 23.4 MB/s eta 0:00:01[K     |████████████████▊               | 30 kB 28.5 MB/s eta 0:00:01[K     |██████████████████████▍         | 40 kB 26.9 MB/s eta 0:00:01[K     |████████████████████████████    | 51 kB 26.6 MB/s eta 0:00:01[K     |████████████████████████████████| 58 kB 4.9 MB/s 
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 6.8 MB/s 
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 6.4 MB/s 
[?25hCollecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 37.3 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting re

**Get the cleaned data (all tweets)**

In [2]:
import pandas as pd

data = pd.read_csv('allData')  # load your data here!


def _joy_label(flag):
    """Translate one 'ijoy' flag into its class name.

    1 -> "joy", 0 -> "no joy". Any other value (for example a missing cell)
    becomes "unknown" instead of being dropped, so that the list of labels
    always has exactly one entry per row.
    """
    if flag == 1:
        return "joy"
    if flag == 0:
        return "no joy"
    return "unknown"


# One class label per row of `data`. The labels are later paired with `docs` by
# position, so both lists must have the same length and order.
targets = [_joy_label(flag) for flag in data['ijoy']]
classes = targets

# Example of an alternative data source:
#from sklearn.datasets import fetch_20newsgroups
#data = fetch_20newsgroups(subset='train',  remove=('headers', 'footers', 'quotes'))

# str() guarantees every document is a string; note that a missing (NaN) cell
# therefore becomes the literal text "nan".
tweets = [str(tweet) for tweet in data['clean_tweet']]
docs = tweets


**Create Vocabulary**

In [3]:
# Build the vectorizer vocabulary from the lexicon file: `terms` maps each word to a
# column index, while `va`, `ar` and `do` hold the valence, arousal and dominance
# scores stored at that same index.

file_name = 'NRC-VAD-Lexicon.txt'

terms = {}
va = []
ar = []
do = []
counter = 0

# Each usable line has the form: word <TAB> valence <TAB> arousal <TAB> dominance
with open(file_name, 'r') as f:
    for row in f:
        row_ = row.split("\t")

        # Parse the whole row before storing anything, so a bad row can never leave
        # a word in `terms` without scores or make the three score lists differ in length.
        try:
            word = row_[0]
            valence = float(row_[1])
            arousal = float(row_[2])
            dominance = float(row_[3])
        except (IndexError, ValueError):
            # Too few columns or a non-numeric score (e.g. a header or blank line):
            # report the row and skip it.
            print(row_)
            continue

        if word in terms:
            # Keep the first occurrence of a repeated word; overwriting its index
            # would leave a gap in the index sequence.
            print(row_)
            continue

        terms[word] = counter
        va.append(valence)
        ar.append(arousal)
        do.append(dominance)
        counter += 1
            

**Build the model here**

In [4]:
# This can take some time

from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

# UMAP's dimensionality reduction is stochastic; without a fixed seed the topics
# change on every run. NOTE(review): fixing random_state may make UMAP slower,
# since it can disable its parallel code path - confirm the cost is acceptable.
SEED = 42

# Unrestricted alternative:
#vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
# Restrict the words used to describe topics to the lexicon vocabulary in `terms`.
vectorizer_model = CountVectorizer(vocabulary=terms, ngram_range=(1,2), stop_words="english")
umap_model = UMAP(n_neighbors=15, n_components=15, min_dist=0.0, metric='cosine', random_state=SEED)
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
sentence_model = SentenceTransformer("all-mpnet-base-v2")

topic_model = BERTopic(language="english",
                       top_n_words=15,   # words reported per topic
                       nr_topics=14,     # target number of topics
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       vectorizer_model=vectorizer_model,
                       embedding_model=sentence_model
                       )

# Fit on all documents; `topics` holds one topic id per document.
topics, probabilities = topic_model.fit_transform(docs)

# Overview table of the topics found (displayed as the cell output).
topic_model.get_topic_info()



Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

  idf = np.log((avg_nr_samples / df)+1)
  idf = np.log((avg_nr_samples / df)+1)


Unnamed: 0,Topic,Count,Name
0,-1,15442,-1_joy_love_amp_like
1,0,357,0_game_football_team_season
2,1,278,1_baseball_game_series_joy
3,2,268,2_photo_district_block_yoga
4,3,222,3_jesus_lord_god_peace
5,4,192,4_crying_joy_cried_tattoo
6,5,187,5____
7,6,179,6_christmas_merry_holiday_family
8,7,179,7_garden_raw_chef_joy
9,8,168,8_avenue_north_west_street


In [5]:
# Take the topic id in the second row of the frequency table and list its words.
# NOTE(review): presumably row 0 is the -1 outlier bucket, as in the table above - confirm.
topic_frequencies = topic_model.get_topic_freq()
second_row_topic = topic_frequencies.iloc[1]["Topic"]
topic_model.get_topic(second_row_topic)

[('game', 0.10316486195741909),
 ('football', 0.05194184358389449),
 ('team', 0.04914216560115068),
 ('season', 0.049044308770325555),
 ('coach', 0.03673428675205574),
 ('play', 0.03444885222457116),
 ('defense', 0.030454179140507416),
 ('like', 0.030254838513254702),
 ('sports', 0.02984174390515697),
 ('joy', 0.029489255005027616),
 ('bowl', 0.028902110323458612),
 ('state', 0.028423842942302064),
 ('offense', 0.025247195104686652),
 ('super', 0.025154709913521672),
 ('good', 0.02385619527121871)]

**Visualize Topics**

In [6]:
# Build the topic overview figure and show it as the cell output.
topic_overview_fig = topic_model.visualize_topics()
topic_overview_fig

**Visualize Topics using a bar chart**

In [7]:
# Bar chart settings: 10 words per topic, up to 14 topics (the same number as
# nr_topics in the model cell), figure height 500.
barchart_options = dict(n_words=10, top_n_topics=14, height=500)
topic_model.visualize_barchart(**barchart_options)

**Visualize Topic Similarity**

In [8]:
# Build the topic heatmap figure and show it as the cell output.
similarity_heatmap_fig = topic_model.visualize_heatmap()
similarity_heatmap_fig

**Visualize Topics per Class**

In [9]:
# Compute the per-class topic table from the documents, their assigned topics and
# their class labels (`classes` must line up with `docs` position by position).
topics_per_class = topic_model.topics_per_class(
    docs,
    topics,
    classes=classes,
)

# Plot the per-class table; the figure is the cell output.
per_class_fig = topic_model.visualize_topics_per_class(topics_per_class)
per_class_fig