In [2]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [3]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the complete 20 Newsgroups corpus with headers, footers and quoted
# replies removed, then keep only the list of raw document texts.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
docs = newsgroups['data']

### Define all the sub-models to plug into BERTopic
- embeddings
- dimensionality reduction (PCA, UMAP, etc.)
- clustering
- vectorizer representation
- fine-tuning of the representations


In [11]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans



# Sub-models, chosen one after the other in pipeline order.
# Variable names mirror BERTopic's constructor keywords (umap_model, hdbscan_model),
# even though PCA and KMeans are the estimators actually plugged in.

# Single seed for the stochastic estimators so that a fresh
# "Restart & Run All" yields the same clusters and topics.
RANDOM_STATE = 42

# embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# dimensionality reduction
# UMAP alternative (seeded to prevent stochastic behavior):
#umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=1)
umap_model = PCA(n_components=5, random_state=RANDOM_STATE)

# clustering
# HDBSCAN alternative:
#hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# KMeans initialisation is random; without a fixed random_state the topic
# assignments change from run to run.
hdbscan_model = KMeans(n_clusters=10, random_state=RANDOM_STATE)

# vectorizer
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
ctfidf_model = ClassTfidfTransformer()

# representation models
# MMR alternative, to maximize the diversity of the topic words:
#representation_model = MaximalMarginalRelevance(diversity=0.3)
representation_model = KeyBERTInspired()

# **Hyperparameters**
In this section, we will go through the most important hyperparameters in BERTopic:
* language
* top_n_words = number of words to extract for each topic
* n_gram_range = whether multi-word expressions (e.g. "New York") should be treated as a single term
* min_topic_size = minimum number of documents a topic must contain in order to be created
* nr_topics = number of topics to create, fixed in advance
* calculate_probabilities = probability of each topic for each document

In [12]:
# Assemble the BERTopic pipeline from the sub-models configured earlier.
# Hyperparameters deliberately left at their defaults (previously tried values):
#   top_n_words=10, min_topic_size=10, nr_topics=10, calculate_probabilities=True
topic_model = BERTopic(
    language="english",
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    verbose=True,
)

# Fit on the first 400 documents only and keep both values fit_transform returns.
topics, probs = topic_model.fit_transform(docs[:400])

2025-06-07 05:18:53,073 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

2025-06-07 05:19:34,608 - BERTopic - Embedding - Completed ✓
2025-06-07 05:19:34,609 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-07 05:19:34,653 - BERTopic - Dimensionality - Completed ✓
2025-06-07 05:19:34,657 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-07 05:19:34,677 - BERTopic - Cluster - Completed ✓
2025-06-07 05:19:34,690 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-07 05:19:48,428 - BERTopic - Representation - Completed ✓


In [13]:
# Generated topics: one row per topic with its document count, name,
# keyword representation and representative documents.

freq = topic_model.get_topic_info()
freq

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,61,0_key xterm_xterm map_xterm_define key,"[key xterm, xterm map, xterm, define key, gif ...",[Hi.\n\nI use Emacs and I want to customize my...
1,1,59,1_energy conservation_conservation_energy_cost,"[energy conservation, conservation, energy, co...",[Government-Mandated Energy Conservation is Un...
2,2,51,2_amp_battery_decibel_tzeghagrons,"[amp, battery, decibel, tzeghagrons, power, cu...","[\n >\n >: Thus, a deciBell (deci-, l., tenth ..."
3,3,50,3_armenians_amendment_armenian_nato,"[armenians, amendment, armenian, nato, constit...",[\nThe Supreme Court seems to disagree with yo...
4,4,48,4_x11r5_xprinter_ini file_graphics,"[x11r5, xprinter, ini file, graphics, startup ...",[I am working with Visual Basic v2.0 for windo...
5,5,44,5_video card_vga_drivers_driver,"[video card, vga, drivers, driver, processor, ...","[Hello all,\n\nYou, the Net, are my last resor..."
6,6,39,6_scripture_worship_nestorius_christians,"[scripture, worship, nestorius, christians, sa...","[\n\nBrian K., I am pleased with your honesty...."
7,7,27,7_nhl_hockey_playoffs_hitter,"[nhl, hockey, playoffs, hitter, rangers, seaso...","[[Deletions]\n\nGeez, Dal must have slipped so..."
8,8,14,8____,"[, , , , , , , , , ]","[, , \n\n\n]"
9,9,7,9_flyers_nordiques bruins_million flyers_montr...,"[flyers, nordiques bruins, million flyers, mon...","[Yes, I could look it up but I prefer to post ..."


# Save and load the trained model so it can be reused later:

In [24]:
# Make the project's notebooks folder the working directory for the cells below.
# NOTE(review): absolute path, presumably a mounted Google Drive in Colab — it will not exist on other machines.
%cd /content/drive/MyDrive/Colab Notebooks/vintage_ai/notebooks

/content/drive/MyDrive/Colab Notebooks/vintage_ai/notebooks


In [25]:
import os

# Directory that will hold the serialized topic model.
folder_path = "/content/drive/MyDrive/Colab Notebooks/vintage_ai/notebooks/models"

# Report whether the directory was already present; create it (including
# any missing parents) only when it is not.
if os.path.exists(folder_path):
    print(f"Cartella '{folder_path}' già esistente.")
else:
    os.makedirs(folder_path)
    print(f"Cartella '{folder_path}' creata.")

Cartella '/content/drive/MyDrive/Colab Notebooks/vintage_ai/notebooks/models' creata.


In [27]:
# Hugging Face id of the embedding model, handed to save() via save_embedding_model.
# It gets its own name so the SentenceTransformer instance bound to
# `embedding_model` is not overwritten by a plain string (which would silently
# change what the pipeline cell receives if it is re-run afterwards).
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Serialize the fitted model (safetensors format, c-TF-IDF included) into folder_path.
topic_model.save(
    folder_path,
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model=embedding_model_name,
)



In [28]:
# Load the model back from the directory it was saved to (folder_path).
# NOTE(review): loaded_model is not used by any later cell visible here; the cells below keep using topic_model.
loaded_model = BERTopic.load(folder_path)

# Visualizations and final classification of the documents:

In [18]:
# Each of the 400 fitted documents together with the topic it was assigned to.
# NOTE(review): the table returned here has no probability column, although the
# original comment promised per-document topic probabilities — confirm whether
# probabilities are expected with the current clustering model.

topic_model.get_document_info(docs[:400])

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Representative_document
0,\n\nI am sure some bashers of Pens fans are pr...,7,7_nhl_hockey_playoffs_hitter,"[nhl, hockey, playoffs, hitter, rangers, seaso...","[[Deletions]\n\nGeez, Dal must have slipped so...",nhl - hockey - playoffs - hitter - rangers - s...,False
1,My brother is in the market for a high-perform...,5,5_video card_vga_drivers_driver,"[video card, vga, drivers, driver, processor, ...","[Hello all,\n\nYou, the Net, are my last resor...",video card - vga - drivers - driver - processo...,False
2,\n\n\n\n\tFinally you said what you dream abou...,3,3_armenians_amendment_armenian_nato,"[armenians, amendment, armenian, nato, constit...",[\nThe Supreme Court seems to disagree with yo...,armenians - amendment - armenian - nato - cons...,False
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,5,5_video card_vga_drivers_driver,"[video card, vga, drivers, driver, processor, ...","[Hello all,\n\nYou, the Net, are my last resor...",video card - vga - drivers - driver - processo...,False
4,1) I have an old Jasmine drive which I cann...,4,4_x11r5_xprinter_ini file_graphics,"[x11r5, xprinter, ini file, graphics, startup ...",[I am working with Visual Basic v2.0 for windo...,x11r5 - xprinter - ini file - graphics - start...,False
...,...,...,...,...,...,...,...
395,\n\nEither the government has force available ...,3,3_armenians_amendment_armenian_nato,"[armenians, amendment, armenian, nato, constit...",[\nThe Supreme Court seems to disagree with yo...,armenians - amendment - armenian - nato - cons...,False
396,I'm new to the hardware and with a mandate to ...,4,4_x11r5_xprinter_ini file_graphics,"[x11r5, xprinter, ini file, graphics, startup ...",[I am working with Visual Basic v2.0 for windo...,x11r5 - xprinter - ini file - graphics - start...,False
397,"hi all,\n\nIN SHORT: looking for very fast ass...",4,4_x11r5_xprinter_ini file_graphics,"[x11r5, xprinter, ini file, graphics, startup ...",[I am working with Visual Basic v2.0 for windo...,x11r5 - xprinter - ini file - graphics - start...,False
398,"\nThe ""so sacred it's secret"" explanation is a...",6,6_scripture_worship_nestorius_christians,"[scripture, worship, nestorius, christians, sa...","[\n\nBrian K., I am pleased with your honesty....",scripture - worship - nestorius - christians -...,False


In [16]:
# Reduce n_clusters to a value smaller than the number of topics

# Useful for checking whether some topics should be merged

topic_model.visualize_heatmap(n_clusters=9, width=1000, height=1000)

In [17]:
# Plot the fitted topics with BERTopic's built-in topic visualization.
topic_model.visualize_topics()

# TOPIC NAMES FINE-TUNING:

# If I want to include fine-tuning with an LLM to get a better representation of the topics (= giving one word for the topic),
# I pass it the keywords and the representative docs.
# It can be passed directly to BERTopic or applied afterwards.



In [None]:



from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Options for loading the quantized Zephyr checkpoint.
# gpu_layers is the number of layers to offload to the GPU; set it to 0 if no
# GPU acceleration is available on your system.
llm_load_options = dict(
    model_file="zephyr-7b-alpha.Q4_K_M.gguf",
    model_type="mistral",
    gpu_layers=50,
    hf=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/zephyr-7B-alpha-GGUF", **llm_load_options
)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")

# Text-generation pipeline with a capped output length and a repetition penalty.
generation_options = dict(
    task='text-generation',
    max_new_tokens=50,
    repetition_penalty=1.1,
)
generator = pipeline(model=model, tokenizer=tokenizer, **generation_options)



In [None]:
# Chat-style prompt (<|system|> / <|user|> / <|assistant|> turns) asking the LLM
# for a short topic label. [DOCUMENTS] and [KEYWORDS] are placeholders —
# presumably substituted by a BERTopic text-generation representation model;
# the prompt is not yet passed to anything in the cells visible here (TODO confirm).
prompt = """<|system|>You are a helpful, respectful and honest assistant for labeling topics..</s>
<|user|>
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.</s>
<|assistant|>"""

In [35]:
# Show the working-tree state of the repository containing the current directory.
!git status

On branch michelle_branch
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   BERTopic_prova.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mscraping_agent.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [30]:
# Stage only this notebook for the next commit.
!git add BERTopic_prova.ipynb

In [32]:
 # Set the git author identity used for the commits made from this notebook.
 # NOTE(review): --global changes the identity for every repository of this user,
 # and a personal e-mail address is committed with the notebook — consider
 # configuring this outside the notebook instead.
 !git config --global user.email "paganimichelle0499@gmail.com"
 !git config --global user.name "michelle2399"

In [33]:
# Commit the staged notebook on the current branch.
!git commit -m "add notebook Bertopic"

[michelle_branch abd73a3] add notebook Bertopic
 1 file changed, 1 insertion(+)
 create mode 100644 notebooks/BERTopic_prova.ipynb


In [36]:
# Make the current local branch track origin/michelle_branch.
!git branch --set-upstream-to=origin/michelle_branch


Branch 'michelle_branch' set up to track remote branch 'michelle_branch' from 'origin'.


In [41]:
# Stage every change under the current directory.
# NOTE(review): this also stages untracked files — confirm nothing unintended
# (e.g. saved model artifacts) ends up in the commit.
!git add .

In [42]:
# Checkpoint commit so that pending changes are saved before the rebase.
!git commit -m "Salvo modifiche prima del rebase"

[michelle_branch 69efb8d] Salvo modifiche prima del rebase
 1 file changed, 1 insertion(+), 1 deletion(-)


In [None]:
git pull --rebase origin michelle_branch