In [2]:
import time 
import os

folder_path = 'data/train'

start_time = time.time()

file_list = [file for file in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, file))]

text_list =[]

for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        text_list.append(text)
        
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Reading in 15,000 files takes: {elapsed_time} seconds")

Reading in 15,000 files takes: 482.0945544242859 seconds


In [3]:
print(len(text_list))

15594


In [11]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

#Transformer...384-dimensional sentence embedding
embedding_model = SentenceTransformer('all-MiniLM-L6-v2') 

#Dimensionality reduction - compresses into 3 dimensions (default 2).
#UMAP attempts to preserves distances to kth nearest neightbor while lowering dimension
umap_model = UMAP(n_neighbors= 5, 
                  n_components = 3,
                  min_dist = 0.5)

#HDBSCAN(hierarachical, density-based method) clusters lower dimensional vectors
hdbscan_model = HDBSCAN(min_cluster_size = 15,               
                 min_samples = 20,
                 gen_min_span_tree = True,
                 prediction_data = True)

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

stopwords = list(stopwords.words('english')) 

#This removes the stopwords and vectorizes with 
vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words=stopwords)

In [19]:
from bertopic import BERTopic

model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=5, #number of words per topic
    language='english',
    calculate_probabilities=True,
    verbose=True
)

topics, probs = model.fit_transform(text_list[0:int(len(text_list)/10)])

Batches:   0%|          | 0/49 [00:00<?, ?it/s]

2023-11-17 19:09:53,844 - BERTopic - Transformed documents to Embeddings
2023-11-17 19:10:00,824 - BERTopic - Reduced dimensionality
2023-11-17 19:10:01,083 - BERTopic - Clustered reduced embeddings


In [20]:
model.get_topics()

{-1: [('generic_name', 0.19743353420334447),
  ('club', 0.17171906651562496),
  ('students', 0.054228742964319385),
  ('join', 0.049427970505093274),
  ('journalist', 0.045625183020389835)],
 0: [('electoral', 0.11491753410568423),
  ('college', 0.09857218767572809),
  ('vote', 0.09830426250457425),
  ('president', 0.08334545869516245),
  ('electors', 0.05646420492706264)],
 1: [('cars', 0.07448082601151547),
  ('driving', 0.07023998341438298),
  ('driverless', 0.06876105267820037),
  ('car', 0.06653232513931968),
  ('driver', 0.042479738090068914)],
 2: [('advice', 0.09940851635819362),
  ('people', 0.06541980625923335),
  ('ask', 0.059437040817544036),
  ('multiple', 0.05416046731741139),
  ('opinions', 0.05101686218187073)],
 3: [('online', 0.0749562659434095),
  ('students', 0.06729811392611),
  ('classes', 0.06006539937333354),
  ('school', 0.05868585418441691),
  ('home', 0.05570253566303238)],
 4: [('emotions', 0.07324914141209914),
  ('facial', 0.06073788576672948),
  ('technol

In [21]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6,-1_generic_name_club_students_join,"[generic_name, club, students, join, journalist]","[""I've waited a long time for this!"" exclaimed..."
1,0,190,0_electoral_college_vote_president,"[electoral, college, vote, president, electors]","[The Electoral College is not a place, but a p..."
2,1,174,1_cars_driving_driverless_car,"[cars, driving, driverless, car, driver]",[Driverless cars have positives which they hol...
3,2,174,2_advice_people_ask_multiple,"[advice, people, ask, multiple, opinions]",[When you seek advice do you ask multiple peop...
4,3,156,3_online_students_classes_school,"[online, students, classes, school, home]",[Some schools offer distance learning as a opt...
5,4,110,4_emotions_facial_technology_computer,"[emotions, facial, technology, computer, could]","[In Nick D'Alto's article, "" Making Mona Lisa ..."
6,5,100,5_car_cars_usage_air,"[car, cars, usage, air, pollution]",[Cars have been an issue to our community for ...
7,6,90,6_venus_author_planet_earth,"[venus, author, planet, earth, dangers]",[Although Venus has a reputation for being dif...
8,7,87,7_sports_average_grade_grades,"[sports, average, grade, grades, school]","[Dear, Principal\n\nIf u change the school pol..."
9,8,86,8_face_mars_landform_aliens,"[face, mars, landform, aliens, natural]",[The Face of Mars\n\nWhat if the Face on Mars ...


In [16]:
model.visualize_barchart()

In [22]:
model.visualize_topics()