In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# notebook can read and write files stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the contents of the Drive notebook folder via the shell.
!ls "/content/drive/My Drive/Colab Notebooks"
# Path of the text file that the next cell opens and reads in full.
file_path = "/content/drive/My Drive/Colab Notebooks/all_text.txt"

'Advanced BERTopic.ipynb'   BERTopic_topic_list.txt
 all_text.txt		   'Copy of cataract_CNN_clf.ipynb'


In [None]:

# Load the entire input file into memory as one UTF-8 decoded string.
with open(file_path, encoding='utf-8') as corpus_file:
    all_text = corpus_file.read()

In [None]:
# Break the raw text into a list of pieces at every '+++' separator.
text_list = all_text.split('+++')
# Show the first five pieces as a quick sanity check.
text_list[:5]

['Some people belive that the so called "face" on mars was created by life on mars. This is not the case. The face on Mars is a naturally occuring land form called a mesa. It was not created by aliens, and there is no consiracy to hide alien lifeforms on mars. There is no evidence that NASA has found that even suggests that this face was created by aliens.\n\nA mesa is a naturally occuring rock formation, that is found on Mars and Earth. This "face" on mars only looks like a face because humans tend to see faces wherever we look, humans are obviously extremely social, which is why our brain is designed to recognize faces.\n\nMany conspiracy theorists believe that NASA is hiding life on Mars from the rest of the world. These people would be very wrong. If NASA found life on Mars, then they would get millions of people\'s attention. NASA\'s budget would increase drasticly, which means that their workers would get paid more. There is no good reason that NASA would hide life on Mars from t

In [None]:
# Number of pieces produced by the '+++' split.
len(text_list)

15595

In [None]:
!pip install bertopic



In [None]:
import nltk.corpus

In [None]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# Sentence embeddings: all-MiniLM-L6-v2 yields a 384-dimensional vector per text.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Dimensionality reduction: compress the embeddings into 3 components
# (UMAP's own default is 2). UMAP attempts to preserve each point's distances
# to its k nearest neighbors while lowering the dimension.
# UMAP is stochastic, so the seed is fixed; without it every run produces a
# different embedding and therefore different topics.
umap_model = UMAP(n_neighbors=10,
                  n_components=3,
                  min_dist=1,
                  random_state=42)

# Clustering: HDBSCAN (a hierarchical, density-based method) groups the
# lower-dimensional vectors.
hdbscan_model = HDBSCAN(min_cluster_size=10,
                        min_samples=5,
                        gen_min_span_tree=True,
                        prediction_data=True)

In [None]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Rebind `stopwords` from the corpus reader to a plain list of English stop words.
stopwords = [word for word in stopwords.words('english')]

# Count single words (unigrams) only, leaving the English stop words out.
vectorizer_model = CountVectorizer(
    stop_words=stopwords,
    ngram_range=(1, 1),
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from bertopic import BERTopic

# Assemble the topic model from the components configured in earlier cells,
# listed here in pipeline order: embed -> reduce -> cluster -> vectorize.
model = BERTopic(
    language='english',
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=5,  # number of words kept per topic
    calculate_probabilities=True,
    verbose=True,
)

# Fit the model on the text pieces; `topics` and `probs` receive the two
# values returned by fit_transform.
topics, probs = model.fit_transform(text_list)

Batches:   0%|          | 0/488 [00:00<?, ?it/s]

2023-11-22 22:24:39,096 - BERTopic - Transformed documents to Embeddings
2023-11-22 22:24:53,487 - BERTopic - Reduced dimensionality
2023-11-22 22:24:55,392 - BERTopic - Clustered reduced embeddings


In [None]:
# Show the fitted model's topics; the output maps each topic id to a list of
# (word, score) pairs.
model.get_topics()

{-1: [('generic_name', 0.17564663823074853),
  ('team', 0.10413410231351307),
  ('participate', 0.09861484177547639),
  ('activities', 0.06634945505237057),
  ('school', 0.055883060433921626)],
 0: [('electoral', 0.11353156206756654),
  ('vote', 0.10136076037482966),
  ('college', 0.09726761779370809),
  ('president', 0.07912646299620354),
  ('electors', 0.05793086137152826)],
 1: [('activities', 0.06318415426920167),
  ('school', 0.060605266522912545),
  ('students', 0.057389412804660124),
  ('extracurricular', 0.05610689325070386),
  ('sports', 0.056059640941108366)],
 2: [('advice', 0.09857189875524897),
  ('people', 0.06148963466833003),
  ('ask', 0.05709284293306092),
  ('multiple', 0.054486579822594056),
  ('opinions', 0.05441186696731158)],
 3: [('students', 0.07176651815160463),
  ('online', 0.06885781833667295),
  ('school', 0.061033056547639374),
  ('classes', 0.06039170159021019),
  ('home', 0.05711554509357248)],
 4: [('cars', 0.11381974108204956),
  ('driverless', 0.102783

In [None]:
# Keep the per-topic summary table; later cells look up each topic's
# Representation (its list of words) in it.
info_df = model.get_topic_info()
info_df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2,-1_generic_name_team_participate_activities,"[generic_name, team, participate, activities, ...",[Boom! Generic_Name kicks the soccer ball as h...
1,0,1820,0_electoral_vote_college_president,"[electoral, vote, college, president, electors]",[Electoral College is where we pick our state ...
2,1,1649,1_activities_school_students_extracurricular,"[activities, school, students, extracurricular...","[Dear, Principle\n\nI disagree with the new sc..."
3,2,1544,2_advice_people_ask_multiple,"[advice, people, ask, multiple, opinions]",[Why do people ask multiple people for their o...
4,3,1499,3_students_online_school_classes,"[students, online, school, classes, home]",[The idea of online school sounds like a dream...
5,4,1385,4_cars_driverless_car_driver,"[cars, driverless, car, driver, would]",[Driverless cars could have a very postive imp...
6,5,1102,5_emotions_technology_facial_computer,"[emotions, technology, facial, computer, could]",[The Facial Action Coding System is a new soft...
7,6,982,6_car_cars_usage_pollution,"[car, cars, usage, pollution, air]","[Ever since cars were invented, they have wide..."
8,7,928,7_venus_planet_earth_author,"[venus, planet, earth, author, surface]",[Venus is the closest planet to Earth. The pla...
9,8,873,8_summer_project_projects_students,"[summer, project, projects, students, designed]",[Summer break is all about fun and hanging out...


In [None]:
# Peek at the first ten entries of model.topics_.
model.topics_[:10]

[10, 4, 1, 6, 3, 13, 7, 3, 6, 12]

In [None]:
# Build one long string holding, for every entry of model.topics_, the words
# of that topic's Representation. Each word is followed by a space and each
# entry is terminated by '+++', e.g. 'face mars landform aliens natural +++'.
clusters = model.topics_

# Look rows up by the value in the 'Topic' column instead of by row position.
# Positional indexing (iloc[cluster + 1]) is only correct while the table
# begins with the -1 outlier row; if no outlier topic exists every lookup
# shifts by one.
representation_by_topic = dict(zip(info_df['Topic'], info_df['Representation']))

# Assemble the result with join rather than repeated `+=`, which re-copies the
# growing string on every step.
topic_str = ''.join(
    ''.join(word + ' ' for word in representation_by_topic[cluster]) + '+++'
    for cluster in clusters
)

topic_str[:200]

'face mars landform aliens natural +++cars driverless car driver would +++activities school students extracurricular sports +++car cars usage pollution air +++students online school classes home +++sea'

In [None]:
# Collect, for every entry of model.topics_, the Representation (word list)
# of the topic it was assigned to.
clusters = model.topics_

# Match rows on the 'Topic' column rather than by position. The table's first
# row is topic -1, so info_df.iloc[cluster] returned the words of the
# neighbouring topic (and the last row of the table for topic -1).
representation_by_topic = dict(zip(info_df['Topic'], info_df['Representation']))
rep_list = [representation_by_topic[cluster] for cluster in clusters]

rep_list

[['phones', 'cell', 'phone', 'policy', 'school'],
 ['students', 'online', 'school', 'classes', 'home'],
 ['electoral', 'vote', 'college', 'president', 'electors'],
 ['emotions', 'technology', 'facial', 'computer', 'could'],
 ['advice', 'people', 'ask', 'multiple', 'opinions'],
 ['driving', 'phone', 'phones', 'cell', 'texting'],
 ['car', 'cars', 'usage', 'pollution', 'air'],
 ['advice', 'people', 'ask', 'multiple', 'opinions'],
 ['emotions', 'technology', 'facial', 'computer', 'could'],
 ['community', 'service', 'help', 'think', 'students'],
 ['electoral', 'vote', 'college', 'president', 'electors'],
 ['electoral', 'vote', 'college', 'president', 'electors'],
 ['phones', 'cell', 'phone', 'policy', 'school'],
 ['generic_name', 'team', 'participate', 'activities', 'school'],
 ['electoral', 'vote', 'college', 'president', 'electors'],
 ['electoral', 'vote', 'college', 'president', 'electors'],
 ['face', 'mars', 'landform', 'aliens', 'natural'],
 ['emotions', 'technology', 'facial', 'comput

In [None]:
# Save the '+++'-delimited topic string to Drive. The encoding is spelled out
# so the file is always written as UTF-8, the same encoding the input text was
# read with, instead of depending on the platform default.
with open("/content/drive/My Drive/Colab Notebooks/BERTopic_topic_list.txt", "w", encoding="utf-8") as text_file:
    text_file.write(topic_str)

In [None]:
import pandas as pd

# Print the Representative_Docs entry of every row in the topic summary table,
# one row per printed line.
for representative_docs in model.get_topic_info()['Representative_Docs']:
    print(representative_docs)

['Boom! Generic_Name kicks the soccer ball as hard as she can towards the goal. The goaly is surprised but quick on her feet as she thrusts herself towards the ball and catches it before it reaches the net. The referee blows the whistle and that is the end of the game. Generic_Name has not only let herself down, but her team as well. Each team shake hands with each other and leave the field. You have an expression on your face that describes disappointment and anger. Our school\'s principal, Generic_Name then looks at you from the bleachers and calls you over. As you make your way across the field, you keep yourself questioning yourself. What did I do? Am i going to be kicked off the team? Am I not good enough? Generic_Name asks you whether or not you think everybody at school should do at least one activity or sport. Generic_Name disagrees with this statement because students have after school activities they do that include sports, private lessons, and even classes.\n\nSports are a g

In [None]:
# Call the fitted model's bar-chart visualization and display the result as
# the cell output.
model.visualize_barchart()

In [None]:
# Call the fitted model's topic visualization and display the result as the
# cell output.
model.visualize_topics()