In [None]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups


categories = ['comp.graphics', 'sci.med']

print(f"Loading dataset for categories: {categories}")
data = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))


df = pd.DataFrame(data.data, columns=["text"])
df['topic_label'] = [data.target_names[t] for t in data.target] 


original_count = len(df)
df = df[df['text'].str.strip().str.len() > 30].reset_index(drop=True)


print(f"\nSETUP SUCCESSFUL!")
print(f"Original rows: {original_count}")
print(f"Cleaned rows:  {len(df)}")
print("-" * 30)

display(df.head())




Loading dataset for categories: ['comp.graphics', 'sci.med']

SETUP SUCCESSFUL!
Original rows: 1178
Cleaned rows:  1138
------------------------------


Unnamed: 0,text,topic_label
0,\n\tIt depends on what kind of the polygons. \...,comp.graphics
1,ML> From: libman@hsc.usc.edu (Marlena Libman)\...,sci.med
2,I have posted a DOS MPEG decoder/player to alt...,comp.graphics
3,"\nGee, what do I do? My LDL is only 50-60. (a...",sci.med
4,Are complex bio-medical images available ...,comp.graphics


In [4]:
import re

def clean_text(text):

    text = text.lower()

    text = re.sub(r'\S+@\S+', ' ', text) # remove emails
    text = re.sub (r'http\S+',' ' ,text) # remove hyper links
    text = re.sub(r'[^a-zA-Z0-9.,?!]', ' ', text) # remove punctuations
    text = re.sub(r'\s+', ' ', text).strip() # remove redundant spaces and formatting mistakes*

    return text

print("Cleaning data...")

df['clean_text'] = df['text'].apply(clean_text)

# compare results

print("Cleaning Complete!")
print("-"*50)
print(f"Before cleaning - {df['text'][0][:100]}")
print(f"\nAfter cleaning - {df['clean_text'][0][:100]} ")



Cleaning data...
Cleaning Complete!
--------------------------------------------------
Before cleaning - 
	It depends on what kind of the polygons. 
	Convex - simple, concave - trouble, concave with loop(s

After cleaning - it depends on what kind of the polygons. convex simple, concave trouble, concave with loop s inside  


In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

print ("Initialisation..")

vectorizer_model = CountVectorizer(stop_words="english")

topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2",vectorizer_model=vectorizer_model, min_topic_size=15)

print("Fitting the model...")
topics, probs = topic_model.fit_transform(df['clean_text'])

print("Fitting Completed!")

freq = topic_model.get_topic_info()
print(f"Found {len(freq)-1} topic(s) (Topic -1 = outliers)")
display(freq.head(10))

Initialisation..
Fitting the model...
Fitting Completed!
Found1 topics (Topic -1 = outliers)


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,576,0_image_graphics_jpeg_file,"[image, graphics, jpeg, file, data, files, ima...",[i have posted disp135.zip to alt.binaries.pic...
1,1,562,1_health_don_people_medical,"[health, don, people, medical, like, use, just...","[cut here volume 6, number 10 april 20, 1993 !..."


In [15]:
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

top_topics = topic_model.get_topic_info().head(3)

for index, row in top_topics.iterrows():
    topic_id = row['Topic']

    if topic_id == -1:
        continue

    topic_name = row['Name']

    example_post = row['Representative_Docs'][0]

    result = sentiment_pipeline(example_post[:512])[0] # 512 BERT limit

    print(f"\nTopic:{topic_name}")
    print(f"\nExample Post : {example_post[:100]}...")
    print(f"Sentiment: {result['label']} (Confidence: {result['score']})")




Device set to use cpu



Topic:0_image_graphics_jpeg_file

Example Post : i have posted disp135.zip to alt.binaries.pictures.utilities you may distribute this program freely ...
Sentiment: NEGATIVE (Confidence: 0.9979368448257446)

Topic:1_health_don_people_medical

Example Post : cut here volume 6, number 10 april 20, 1993 ! ! ! health info com network ! ! medical newsletter ! e...
Sentiment: NEGATIVE (Confidence: 0.9900362491607666)


In [17]:

fig = topic_model.visualize_barchart(top_n_topics=10)

fig.write_html("my_topic_map.html")
print(" Map saved as 'my_topic_map.html'")

 Map saved as 'my_topic_map.html'
