## Initialisation

In [1]:
import os
# Runtime knobs for the numeric backends, set before any heavy library is imported:
#   KMP_DUPLICATE_LIB_OK - presumably tolerates more than one OpenMP runtime being
#                          loaded in the same process (TODO confirm this is still needed)
#   OMP_NUM_THREADS / MKL_NUM_THREADS - restrict OpenMP and MKL to a single thread
os.environ.update({
    "KMP_DUPLICATE_LIB_OK": "TRUE",
    "OMP_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
})


In [2]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to two newsgroups.
categories = ['comp.graphics', 'sci.med']
print(f"Loading dataset for categories: {categories}")

# Training split only; the loader is asked to strip headers, footers and quotes.
data = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)

# One row per post, with the integer target translated to its newsgroup name.
df = pd.DataFrame({"text": data.data})
df['topic_label'] = [data.target_names[idx] for idx in data.target]

# Keep only posts longer than 30 characters once surrounding whitespace is
# trimmed, then renumber the rows from 0.
original_count = len(df)
df = df.loc[df['text'].str.strip().str.len().gt(30)].reset_index(drop=True)

print("\nSETUP SUCCESSFUL!")
print(f"Original rows: {original_count}")
print(f"Cleaned rows:  {len(df)}")
print(30 * "-")

display(df.head())

Loading dataset for categories: ['comp.graphics', 'sci.med']

SETUP SUCCESSFUL!
Original rows: 1178
Cleaned rows:  1138
------------------------------


Unnamed: 0,text,topic_label
0,\n\tIt depends on what kind of the polygons. \...,comp.graphics
1,ML> From: libman@hsc.usc.edu (Marlena Libman)\...,sci.med
2,I have posted a DOS MPEG decoder/player to alt...,comp.graphics
3,"\nGee, what do I do? My LDL is only 50-60. (a...",sci.med
4,Are complex bio-medical images available ...,comp.graphics


In [3]:
import re

def clean_text(text):
    """Return a lowercased, whitespace-normalised version of ``text``.

    The substitutions run in a fixed order, each replacing its match with a
    single space:

    1. anything shaped like an e-mail address (non-space run containing ``@``),
    2. anything starting with ``http`` up to the next whitespace,
    3. every character that is not an ASCII letter, a digit or one of ``. , ? !``
       (so basic sentence punctuation is kept, everything else is dropped),
    4. any run of whitespace.

    Leading and trailing spaces are stripped from the result.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; may be empty.
    """
    # (pattern, replacement) pairs; order matters because e-mails and links
    # must be removed before their punctuation is blanked out.
    substitutions = (
        (r'\S+@\S+', ' '),
        (r'http\S+', ' '),
        (r'[^a-zA-Z0-9.,?!]', ' '),
        (r'\s+', ' '),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

print("Cleaning data...")

# Store the normalised text next to the raw text so both stay available.
df['clean_text'] = df['text'].apply(clean_text)

print("Cleaning Complete!")
print(50 * "-")

# Show the first 100 characters of row 0 before and after cleaning.
print(f"Before cleaning - {df.loc[0, 'text'][:100]}")
print(f"\nAfter cleaning - {df.loc[0, 'clean_text'][:100]} ")



Cleaning data...
Cleaning Complete!
--------------------------------------------------
Before cleaning - 
	It depends on what kind of the polygons. 
	Convex - simple, concave - trouble, concave with loop(s

After cleaning - it depends on what kind of the polygons. convex simple, concave trouble, concave with loop s inside  


In [4]:
from bertopic import BERTopic

print("Initialisation..")

# Topic model using the "all-MiniLM-L6-v2" embedding model; min_topic_size=15
# is passed as the minimum topic size.
# NOTE(review): no random seed is fixed here, so topic assignments may differ
# between runs - consider supplying a seeded dimensionality-reduction model.
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", min_topic_size=15)

print("Fitting the model...")
# fit_transform is documented to take a list of strings, so hand it a plain
# list rather than the pandas Series.
topics, probs = topic_model.fit_transform(df['clean_text'].tolist())

print("Fitting Completed!")

freq = topic_model.get_topic_info()

# Count real topics by excluding the outlier row (Topic == -1) explicitly.
# Subtracting 1 from len(freq) under-counts whenever no outliers were found.
n_topics = int((freq["Topic"] != -1).sum())
print(f"Found {n_topics} topics (Topic -1 is outliers)")
display(freq.head(10))

  from .autonotebook import tqdm as notebook_tqdm


Initialisation..
Fitting the model...


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


: 