In [None]:
from IPython.display import clear_output
!pip install bertopic


Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [None]:
!pip install colorama
!pip install comm

Collecting comm
  Downloading comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Downloading comm-0.2.2-py3-none-any.whl (7.2 kB)
Installing collected packages: comm
Successfully installed comm-0.2.2


In [None]:
# Importing dependencies

import os
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import  stopwords
import string
import spacy
import pprint
from colorama import Fore, Back, Style

from bertopic import BERTopic
from bertopic.representation import ZeroShotClassification
from comm import create_comm



# <span style="color:#e74c3c;"> Reading </span> Data

In [None]:
# Mount Google Drive under /content/drive.
# NOTE(review): the CSV files loaded below are read from /content, not from
# the mounted drive — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load one CSV per news category.
# education_data was referenced in the concat but never read, and the
# entertainment path was missing the /content prefix used by every other file.
# NOTE(review): education_data.csv is assumed to sit next to the other files.
business_data = pd.read_csv('/content/business_data.csv')
education_data = pd.read_csv('/content/education_data.csv')
entertainment_data = pd.read_csv('/content/entertainment_data.csv')
sports_data = pd.read_csv('/content/sports_data.csv')
technology_data = pd.read_csv('/content/technology_data.csv')

# Merge all categories, shuffle the rows and drop the columns that are not
# used downstream. Built as one chained expression (no inplace mutation) so the
# cell can be re-run safely; the fixed seed makes the shuffle repeatable.
full_data = (
    pd.concat(
        [business_data, education_data, entertainment_data, sports_data, technology_data],
        axis=0,
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .drop(['headlines', 'description', 'url'], axis=1)
)
full_data.head()

Unnamed: 0,content,category
0,The Chromebook lineup of notebooks powered by ...,technology
1,Schools in Madhya Pradesh will carry out their...,education
2,You are probably acutely aware of how climate ...,technology
3,The Indian Institute of Management (IIM) Kozhi...,education
4,Artificial intelligence is set to transform th...,technology


# <span style="color:#e74c3c;"> Preprocessing </span>

In [None]:
%%time
# Load the shared resources used by the preprocessing helpers defined below:
# the NLTK English stop-word list (as a set) and the spaCy pipeline `nlp`.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


CPU times: user 717 ms, sys: 77.9 ms, total: 795 ms
Wall time: 804 ms


In [None]:
def text_preprocessing(text):
    """Normalise raw article text ahead of stop-word removal and lemmatization.

    Steps, in order: coerce to ``str`` and lowercase; remove ``[...]`` spans;
    remove URLs; remove ``<...>`` tags; delete ASCII punctuation; turn
    newlines into spaces; remove every token that contains a digit.

    Parameters
    ----------
    text : Any
        Raw text. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned, lowercased text. Runs of spaces can remain where a span
        was removed.
    """
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    # No non-word-character substitution at this point: stripping symbols
    # before the URL and tag patterns run would stop them from matching.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Newlines become spaces (not empty strings) so the last word of one line
    # is not glued to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:

def drop_stopwords(text):
    """Remove every whitespace-separated token of ``text`` found in ``stop_words``.

    Tokens are compared exactly as written; the surviving tokens are joined
    back together with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
def lemmatization(text):
    """Return the space-joined lemmas of all tokens the ``nlp`` pipeline finds in ``text``."""
    return ' '.join(tok.lemma_ for tok in nlp(text))

In [None]:

def delete_one_characters(text):
    """Drop every whitespace-separated token of ``text`` that is one character long.

    Parameters
    ----------
    text : str
        Space-separated tokens.

    Returns
    -------
    str
        The remaining tokens joined with single spaces. Short tokens are
        filtered out entirely rather than replaced by empty strings, so no
        leading or doubled spaces are left behind.
    """
    return ' '.join(word for word in text.split() if len(word) > 1)

In [None]:
def full_preprocessing_pipeline(text):
    """Apply all preprocessing stages to ``text`` and return the result.

    Stage order: text_preprocessing -> drop_stopwords -> lemmatization ->
    delete_one_characters.
    """
    stages = (text_preprocessing, drop_stopwords, lemmatization, delete_one_characters)
    for stage in stages:
        text = stage(text)
    return text

# Build the modelling frame from the loaded articles. `data` was not defined
# anywhere before this cell, so it is derived here from `full_data`.
# `assign` returns a new frame: `full_data` keeps only its raw columns and
# re-running this cell always gives the same result.
data = full_data.assign(
    preprocessed_content=full_data['content'].apply(full_preprocessing_pipeline)
)

In [None]:
# Preview the frame, including the `preprocessed_content` column.
data.head()

Unnamed: 0,content,category,preprocessed_content
0,The Chromebook lineup of notebooks powered by ...,technology,chromebook lineup notebook power google empowe...
1,Schools in Madhya Pradesh will carry out their...,education,school madhya pradesh carry academic function ...
2,You are probably acutely aware of how climate ...,technology,probably acutely aware climate change wreak ha...
3,The Indian Institute of Management (IIM) Kozhi...,education,indian institute management iim kozhikode soon...
4,Artificial intelligence is set to transform th...,technology,artificial intelligence set transform workplac...


# <span style="color:#e74c3c;"> BERTopic </span> Model

In [None]:
# Topic-model configuration.
# TOPIC_NUM is handed to BERTopic as nr_topics and reused by the plots below.
TOPIC_NUM = 10
# Candidate labels are the category names of the original dataset.
candidate_topics = ['sports', 'technology','entertainment','education','business']
representation_model = ZeroShotClassification(
    candidate_topics,
    model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
)

# NOTE(review): zeroshot_min_similarity is passed without zeroshot_topic_list;
# it presumably has no effect in that configuration — confirm against the
# BERTopic documentation.
topic_model = BERTopic(
    nr_topics=TOPIC_NUM,
    zeroshot_min_similarity=0.7,
    representation_model=representation_model,
    verbose=True,
)

Device set to use cpu


# <span style="color:#e74c3c;"> Training </span>

In [None]:
# Fit the topic model on the preprocessed articles and keep the values it
# returns in `topics` and `probs`.
topics, probs = topic_model.fit_transform(data['preprocessed_content'].values)

2025-05-05 17:22:55,687 - BERTopic - Embedding - Transforming documents to embeddings.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2025-05-05 17:35:48,435 - BERTopic - Embedding - Completed ✓
2025-05-05 17:35:48,439 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-05 17:36:40,366 - BERTopic - Dimensionality - Completed ✓
2025-05-05 17:36:40,369 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-05 17:36:40,863 - BERTopic - Cluster - Completed ✓
2025-05-05 17:36:40,864 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-05 17:36:42,673 - BERTopic - Representation - Completed ✓
2025-05-05 17:36:42,676 - BERTopic - Topic reduction - Reducing number of topics
2025-05-05 17:36:42,711 - BERTopic - Representation - Fine-tuning topics using representation models.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
2025-05-05 17:37:09,769 - BERTopic - Representation - Completed ✓
2025-05-05 17:37:09,777 - BERTopic - Topic reduction - Reduced

# <span style="color:#e74c3c;"> Topic </span> Results

In [None]:
# Per-topic summary table.
# Topic -1 is the outlier topic.
# Count is the number of samples clustered into each topic.

freq = topic_model.get_topic_info()
freq

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3273,-1_say_india_also_film,"[say, india, also, film, indian, sign, year, l...",[end day one series promise plenty thrill twis...
1,0,1535,0_business___,"[business, , , , , , , , , ]",[equity benchmark index sensex nifty close hig...
2,1,1439,1_education___,"[education, , , , , , , , , ]",[student allegedly die suicide kota year city ...
3,2,1152,2_entertainment___,"[entertainment, , , , , , , , , ]",[steady run box office shah rukh khan‘s dunki ...
4,3,1022,3_technology___,"[technology, , , , , , , , , ]",[artificial intelligence make stride across in...
5,4,970,4_sports___,"[sports, , , , , , , , , ]",[former skipper mithali raj feel india ’s chan...
6,5,382,5_sports___,"[sports, , , , , , , , , ]",[tariq panja andrew dasthe premier league accu...
7,6,189,6_space_mission_moon_isro,"[space, mission, moon, isro, crew, launch, lun...",[big year space exploration momentum busy year...
8,7,27,7_sports___,"[sports, , , , , , , , , ]",[russia ’s path send team paris olympics next ...
9,8,11,8_abduljabbar_nba_james_laker,"[abduljabbar, nba, james, laker, lebron, recor...",[lebron james inches close scale summit nba ’s...


In [None]:
# Bar chart for up to TOPIC_NUM topics.
topic_model.visualize_barchart(top_n_topics=TOPIC_NUM, height=200)

# <span style="color:#e74c3c;"> Intertopic Distance </span> Map

In [None]:
# Intertopic distance map of the fitted topics.
topic_model.visualize_topics()

# <span style="color:#e74c3c;"> Topic Similarity </span> Matrix

In [None]:
# Topic similarity heatmap for up to TOPIC_NUM topics, grouped into 5 clusters.
topic_model.visualize_heatmap(n_clusters=5, top_n_topics=TOPIC_NUM)

# <span style="color:#e74c3c;"> Test Topic Modelling</span> Pipeline

In [None]:
def create_end2end_topic_prediction(topic_model, df, similarity_threshold:float, top_n_topic:int, random_state=None) -> None:
    """
    Sample one article from ``df``, preprocess it and print its closest topics.

    @topic_model : fitted BERTopic model (its find_topics / get_topic are used)
    @df : raw dataframe; the 'content' and 'category' columns are read
    @similarity_threshold : minimum similarity a topic needs to be printed
    @top_n_topic : number of candidate topics requested from the model
    @random_state : optional seed forwarded to ``df.sample`` for a repeatable pick

    Prints the preprocessed text, the original category and every topic other
    than the outlier topic (-1) whose similarity reaches the threshold.
    Returns None.
    """

    sampled_row = df.sample(1, random_state=random_state).iloc[0]
    # Use the same pipeline that produced the training text so the text sent
    # to the model cannot drift from what the model was fitted on.
    preprocessed_text = full_preprocessing_pipeline(sampled_row['content'])

    print('Content :\n'+Fore.BLUE + preprocessed_text + Style.RESET_ALL)
    print('\nOriginal Category : '+Fore.GREEN + str(sampled_row['category']) + Style.RESET_ALL)

    # calculating topics and similarities
    similar_topics, similarity = topic_model.find_topics(preprocessed_text, top_n=top_n_topic)

    # printing topics
    # Iterate over what was actually returned: indexing up to top_n_topic would
    # raise IndexError if the model returns fewer candidates than requested.
    for topic_id, score in zip(similar_topics, similarity):
        if score >= similarity_threshold and topic_id != -1:
            print(Fore.RED + '\nTopic No: {0} with topic similarity is {1:.5f}'.format(topic_id, score) + Style.RESET_ALL)
            print(topic_model.get_topic(topic_id))
            print("--"*30)

In [None]:
# Topic prediction for one randomly sampled article
# (similarity threshold 0.3, top 3 candidate topics).
create_end2end_topic_prediction(topic_model, full_data, 0.3, 3)

Content :
[34myou ’re look new tablet budget great deal happen india moment we ’ve round good tablet discount across xiaomi realme oneplus device read detailsredmi padxiaomi ’s affordable redmi pad start rs right micom hdfc bank card user score rs icici customer get rs instant discount via net banking[0m

Original Category : [32mtechnology[0m
[31m
Topic No: 3 with topic similarity is 0.55974[0m
[('technology', 0.905058741569519), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0)]
------------------------------------------------------------
[31m
Topic No: 0 with topic similarity is 0.44698[0m
[('business', 0.9860215783119202), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0)]
------------------------------------------------------------


In [None]:
# Topic prediction for one randomly sampled article
# (similarity threshold 0.4, top 3 candidate topics).
create_end2end_topic_prediction(topic_model, full_data, 0.4, 3)

Content :
[34mkia india tuesday unveil update version midsized sport utility vehicle selto aim garner per cent market share domestic passenger vehicle pv segmentthe automaker currently market share around per cent domestic market also aim double exist sale network plus outlet  selto product enter india since journey kia india selto almost identical segment disruptor segment winner new selto confident take forward strong legacy brand selto lead premium rv market  kia india managing director ceo taejin park saida new spirit rise among uscome forward mesmerize inspire presencethe badass rebornknow selto badassbydesign premiere thenextfromkia thenewseltos thebadassreborn movementthatinspire  kia india kiaind july strategic launch key company ’s ambition attain per cent market share soon addedadvertisement“we feel lot growth potential midsuv segment new selto grow premium end it  park notedalso read kia electric vehicle launch india here ’s much costswith refresh look powerful engine segme

In [None]:
# Topic prediction for one randomly sampled article
# (similarity threshold 0.2, top 3 candidate topics).
create_end2end_topic_prediction(topic_model, full_data, 0.2, 3)

Content :
[34mindia ’s oil import set fall eightmonth low july due combination monsoonle fall domestic fuel petroleum product demand low refinery run unit maintenance accord estimate commodity market analytic firm kplerso far july india ’s crude import average million barrel per day bpd around bpd compare june low since november kpler data showedthis story subscriber onlynow subscribe special republic day offer rs nowalready subscriber sign[0m

Original Category : [32mbusiness[0m
[31m
Topic No: 0 with topic similarity is 0.60961[0m
[('business', 0.9860215783119202), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0)]
------------------------------------------------------------
[31m
Topic No: 1 with topic similarity is 0.41478[0m
[('education', 0.9643800258636475), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0), ('', 0)]
------------------------------------------------------------
