## Imports

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
# Not sure why, but importing BERTopic without first importing something from torch ends with an error.
# It is probably related to broken dependencies with dask in the datamapplot library, or to
# internal BERTopic problems, because BERTopic depends on datamapplot too.
import torch
from bertopic import BERTopic
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import OpenAI, MaximalMarginalRelevance

import openai
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

import os
import gc
import pickle
import warnings

Error that occurs when PyTorch is not imported first:
```
ImportError: /home/denisalpino/.local/lib/python3.10/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12
```

## Configure environment

In [3]:
# Silence every warning category for the rest of the session
warnings.simplefilter("ignore", Warning)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("{} device is available".format(device))

cuda device is available


Load the API key for using GPT-4o-mini as a labeling assistant and configure the prompts

In [4]:
# Read the OpenAI key from the environment so that it is never stored in the
# notebook; os.getenv yields None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# System-level instructions for the labeling assistant: forbid entity names in
# labels, pin the output format to "topic: <label>", and give good/bad examples.
SYSTEM_PROMPT = """
You are a helpful, respectful and honest expert labeling assistant for financial topics. Under NO CIRCUMSTANCES should your topic labels include:
- Names of companies, persons, currencies, or news sources
- Any possessive forms (’s)
- Abbreviations/initialisms that resolve to a specific person or organization

Your output MUST:
- Be exactly in the format “topic: <label>”
- Use only conceptual, sectoral, geographic or thematic terms
- If topic relates to an industry/sector: use “topic: <Industry>”
- If topic relates to a region use: “topic: <Region>”
- If topic relates to a financial or analytical theme use: “topic: <Theme>”
- If the key point of the topic is not industry or geography but a financial/analytical pattern (e.g. SWOT, Dividend Policy, Conference Call), always choose a method/topic, even if the papers describe specific sectors.
- Adhere strictly to the examples below

# Good examples:
- topic: Dividend Policy
- topic: Latin America
- topic: Hedge Funds
- topic: SWOT Analysis
- topic: Monetary Policy
- topic: Semiconductors
- topic: Mining Exploration
- topic: Investor Conference
- topic: Return on Equity

# Bad examples:
- topic: Nvidia Earnings (contains the company name)
- topic: Trump Tariffs (contains the person's name)
- topic: Zacks Industry Analysis (contains a news source)
- topic: Buffet Strategy (contains the person's name)
- topic: Financial Performance (overgeneralization)
"""

# Per-topic user message. It ends with the [KEYWORDS] placeholder, which is
# presumably filled in with the topic's key terms by the BERTopic OpenAI
# representation model before the request is sent (verify against BERTopic docs).
USER_PROMPT = """
# Instructions:
1. Focus solely on the conceptual content of the excerpts, not on specific entities.
2. Produce a short single topic label in the exact format:
   topic: <your label here>
3. The label must be one of:
   - An industry or sector (e.g., Aviation, Semiconductors)
   - A geographic market or region (e.g., Eastern Europe, Asia)
   - A specialized financial theme (e.g., Shareholder Ownership, Conference Call)
   - A general macro topic (e.g., ESG, Cybersecurity, Monetary Policy)
4. Follow system rules on qualifiers and theme-over-sector priority.
5. Topic names cannot be repeated. If the topic is similar to the previous one, find and reflect in the title its distinctive characteristic
6. Do NOT include any names of companies, people, currencies, or news outlets.

In general, the current topic is described by the following key terms extracted from a group of financial news: [KEYWORDS]
"""

Set the random state so that the same items are extracted from the corpus on every run

In [5]:
# Single seed value used to make the notebook's random draws repeatable
RANDOM_STATE = 42
# Seed NumPy's global (legacy) random generator with it
np.random.seed(seed=RANDOM_STATE)

Extend the basic English stop words with the names of some frequently occurring news sources

In [6]:
# scikit-learn's English stop words followed by tokens of frequent news-source names
STOP_WORDS = [
    *ENGLISH_STOP_WORDS,
    "zacks",
    "simply",
    "wall",
    "st",
    "motley",
    "fool",
    "researchandmarkets",
    "gurufocus",
    "gf",
]

## Data Loading

Load the full texts, timestamps and precalculated embeddings

In [None]:
# Articles and their timestamps: only the two needed columns are read,
# and the index is reset to a fresh 0..n-1 range
df = pd.read_parquet(
    path="../../data/sample/articles.parquet",
    columns=["text", "datetime"],
).reset_index(drop=True)

docs = df.loc[:, "text"]
timestamps = df.loc[:, "datetime"]

# Precalculated 768-dim embeddings of the articles
embeddings = np.load('../../data/sample/embeddings_l2.npy')

Load the trained intermediate and 2D reduced embeddings and the obtained cluster labels

In [11]:
def _load_pickled(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE: pickle executes arbitrary code on load; only use trusted, locally produced files
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Only the fitted attributes of the stored models are kept
reduced_embeddings = _load_pickled("models/v2/umap.pkl").embedding_
reduced_embeddings_2d = _load_pickled("models/v2/mapper.pkl").embedding_
labels = _load_pickled("models/v2/hdbscan.pkl").labels_

## Topic Model Training

Define a placeholder class that stands in for UMAP in the BERTopic pipeline. The reduced embeddings are precalculated, so there is no need to retrain UMAP, which would also give a different result because of UMAP's stochastic behavior

In [12]:
class Dimensionality:
    """Stand-in for a dimensionality-reduction model that serves pre-calculated reduced embeddings.

    The object exposes the ``fit``/``transform`` pair of a reducer but performs
    no computation: ``transform`` always returns the array given at construction.

    Parameters
    ----------
    reduced_embeddings
        Already reduced embeddings to hand back from ``transform``.
    """

    def __init__(self, reduced_embeddings):
        self.reduced_embeddings = reduced_embeddings

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        ``y`` is accepted (and ignored) so that callers doing a supervised
        ``fit(X, y=labels)`` do not hit a ``TypeError`` on this placeholder.
        """
        return self

    def transform(self, X):
        """Return the stored reduced embeddings; ``X`` is ignored."""
        # NOTE: the result does not depend on X, so it is only meaningful for
        # the same documents the stored embeddings were computed for.
        return self.reduced_embeddings

In [13]:
# Embedding model besed on ModernBERT and fine-tuned on STS task
embedding_model = SentenceTransformer(
    "Alibaba-NLP/gte-modernbert-base",
    device=device,
    model_kwargs=dict(attn_implementation="flash_attention_2")
)
# Basic vectorizer to create bag-of-words, that will be consist of
# terms from 1 to 2 words appeared 25 times at least
vectorizer_model = CountVectorizer(
    stop_words=STOP_WORDS,
    ngram_range=(1, 2),
    min_df=25
)
# c-TF-IDF algorithm with tf-normaization by sqare and BM25 weighting
ctfidf_model = ClassTfidfTransformer(
    bm25_weighting=True,
    reduce_frequent_words=True
)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Configure the representation fine-tuning pipeline: MaximalMarginalRelevance extracts the key terms, which are then passed to GPT-4o-mini

In [27]:
# First stage: MMR keeps 20 terms per topic with a diversity of 0.35
mmr = MaximalMarginalRelevance(top_n_words=20, diversity=0.35)

# Second stage: chat-based labeling by gpt-4o-mini with the prompts defined above
gpt_settings = dict(
    model="gpt-4o-mini",
    system_prompt=SYSTEM_PROMPT,
    prompt=USER_PROMPT,
    chat=True,
)
gpt = OpenAI(openai.OpenAI(api_key=OPENAI_API_KEY), **gpt_settings)

# The two stages run as a chain, stored under the "GPT4o-mini" aspect name
representation_pipe = {"GPT4o-mini": [mmr, gpt]}

Initialize placeholders for the main models, because we have already trained them and obtained the embeddings and labels

In [17]:
# Clustering placeholder: the cluster labels are supplied separately at fit time
hdbscan_model = BaseCluster()
# Dimensionality-reduction placeholder serving the already reduced embeddings
umap_model = Dimensionality(reduced_embeddings=reduced_embeddings)

Now let's train our topic model

In [28]:
# Components of the pipeline, keyed by the BERTopic argument they fill
pipeline_components = dict(
    embedding_model=embedding_model,           # Step 1 - Extract embeddings (precalculated)
    umap_model=umap_model,                     # Step 2 - Reduce dimensionality (precalculated)
    hdbscan_model=hdbscan_model,               # Step 3 - Cluster reduced embeddings (precalculated)
    vectorizer_model=vectorizer_model,         # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                 # Step 5 - Extract topic words
    representation_model=representation_pipe,  # Step 6 - Fine-tune topic representations by MMR + GPT4o-mini
)
topic_model = BERTopic(
    verbose=True,
    top_n_words=30,                            # Important! We take that many terms for better MMR results
    **pipeline_components
)
# Precalculated embeddings and cluster labels are passed in alongside the documents
topic_model = topic_model.fit(docs, embeddings=embeddings, y=labels)

2025-06-25 18:54:27,011 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-25 18:54:27,023 - BERTopic - Dimensionality - Completed ✓
2025-06-25 18:54:27,200 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-25 18:54:27,216 - BERTopic - Cluster - Completed ✓
2025-06-25 18:54:27,250 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 74/74 [01:03<00:00,  1.17it/s]
2025-06-25 18:59:31,863 - BERTopic - Representation - Completed ✓


Set the labels generated by GPT-4o-mini as the main custom name of each topic

In [41]:
# Every "GPT4o-mini" entry is a list whose first element is the generated label
gpt_representations = topic_model.get_topic_info()["GPT4o-mini"]
topics = [representation[0] for representation in gpt_representations]
topic_model.set_topic_labels(topics)

Check that everything looks right in general

In [196]:
# Show the per-topic summary table as the cell output
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,GPT4o-mini,Representative_Docs
0,-1,50114,-1_stocks_buy_health_000,Technology Stocks,"[stocks, buy, health, 000, world, companies, s...",[Technology Stocks],[TRX Gold Reports Third Quarter 2024 Results\n...
1,0,3824,0_thank_gaap_think_operator,Non-GAAP Measures,"[thank, gaap, think, operator, adjusted, quest...",[Non-GAAP Measures],[Participants\nGina Gunning; Chief Legal Offic...
2,1,3475,1_cagr_usd_market research_asia pacific,Asia-Pacific Packaging Industry,"[cagr, usd, market research, asia pacific, mar...",[Asia-Pacific Packaging Industry],[The Fluorosurfactants Market is driven by sev...
3,2,3430,2_ai_security_cloud_solution,Cybersecurity Solutions,"[ai, security, cloud, solution, solutions, pla...",[Cybersecurity Solutions],[Prisma SASE 3.0 extends Zero Trust to unmanag...
4,3,2991,3_patients_clinical_treatment_phase,Oncology Therapeutics,"[patients, clinical, treatment, phase, trial, ...",[Oncology Therapeutics],"[Inovio Pharmaceuticals, Inc. (NASDAQ:INO) Q3 ..."
...,...,...,...,...,...,...,...
69,68,388,68_dividend_dividend yield_payout_payout ratio,Dividend Analysis,"[dividend, dividend yield, payout, payout rati...",[Dividend Analysis],[As global markets navigate a period of cautio...
70,69,385,69_bankruptcy_stores_chapter 11_locations,Retail Bankruptcy,"[bankruptcy, stores, chapter 11, locations, lo...",[Retail Bankruptcy],[Two retail giants and a department store chai...
71,70,383,70_insiders_ebit_reportable_watchlist,Earnings Report Analysis,"[insiders, ebit, reportable, watchlist, inside...",[Earnings Report Analysis],[Investors are often guided by the idea of dis...
72,71,372,71_analysts_collective_assessment_analysts for...,Analysts Forecasts and Projections,"[analysts, collective, assessment, analysts fo...",[Analysts Forecasts and Projections],"[In its upcoming report, American Tower (AMT) ..."


## Saving model

Save the topic model together with the embedding model and the c-TF-IDF matrix

In [251]:
# Persist the model in the "pytorch" serialization format, keeping both the
# c-TF-IDF matrix and the embedding model alongside it
topic_model.save(
    "models/v2/topic_model",
    serialization="pytorch",
    save_ctfidf=True,
    save_embedding_model=True,
)

Load it again to check that everything is all right

In [4]:
# Round-trip check: restore the model from the directory it was saved to
topic_model = BERTopic.load(path="models/v2/topic_model")

