BERTopic Demo
https://maartengr.github.io/BERTopic/

In [None]:
%pip install bertopic
%pip install bertopic[flair, gensim, spacy, use]
%pip install openai

Fetch texts from db and compute topics with BERTopic

In [None]:
from bertopic import BERTopic
from embedding import fetch_entries
db_name = "law_database.db"
texts, uuids = fetch_entries(db_name)

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(texts)

In [1]:
from bertopic import BERTopic
from embedding import fetch_entries_with_embeddings
db_name = "law_database.db"
emedding_model = "all-MiniLM-L6-v2"
texts, uuids, embeddings = fetch_entries_with_embeddings(db_name)
topic_model = BERTopic(embedding_model=emedding_model)
topics, probs = topic_model.fit_transform(texts, embeddings=embeddings)

[nltk_data] Downloading package punkt to /home/luc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


hi dad
hi mom


In [15]:
# topic_model.save("./topic_model", serialization="safetensors", save_ctfidf=True, save_embedding_model=emedding_model)
topic_model.save("temp", serialization="pickle")



In [2]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
topic_model = BERTopic.load("temp")

Insert the topics into the db

In [4]:
from embedding import insert_label
f = topic_model.get_topic_info()
t = f['Name']
topic_names = t.tolist()
print(topic_names)
for topic_name in topic_names:
    insert_label(db_name, topic_name)

['-1_optometry_optometrist_optometric_practitioner', '0_nursing_degree_nurses_baccalaureate', '1_supervision_naturopathic_supervising_supervise', '2_misconduct_unprofessional_discretion_license', '3_residential_residence_facilities_facility', '4_intervention_disciplinary_registered_termination', '5_nursing_requirements_academic_postsecondary', '6_nursing_nurse_committees_committee', '7_optometry_optometrist_ophthalmologist_lenses', '8_compliance_certification_licensees_nursing', '9_renewal_renew_license_renewed', '10_residential_residence_facilities_facility', '11_midwife_midwives_nurse_standardized', '12_committee_registered_appointment_appointed', '13_psychopharmacology_pharmacology_psychologists_psychologist', '14_abortion_midwifery_competency_procedure', '15_optometry_regulations_optometric_optometrists', '16_optometry_optometric_license_optometrist', '17_license_unprofessional_licensure_drug', '18_nursing_enroll_degree_institution', '19_licensure_fee_fees_nursing', '20_postpartum_

In [5]:
topic_model.get_topic(0)

[['nursing', 0.4826834201812744],
 ['degree', 0.3612825870513916],
 ['nurses', 0.3525093197822571],
 ['baccalaureate', 0.3432251811027527],
 ['nurse', 0.329161673784256],
 ['institution', 0.315751314163208],
 ['academic', 0.307522714138031],
 ['postsecondary', 0.2963041067123413],
 ['education', 0.2874205708503723],
 ['associate', 0.2682255506515503]]

Insert the links between the topics and the text into the db

In [7]:
from embedding import store_cluster_link_entry
d = topic_model.get_document_info(texts)
d_doc = d['Document']
d_name = d['Name']
print(len(d_doc))
print(len(d_name))

s_doc = d_doc.tolist()
s_name = d_name.tolist()
for i, doc in enumerate(s_doc):
    store_cluster_link_entry(db_name, doc, s_name[i])


ValueError: All arrays must be of the same length

In [4]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4179,-1_or_of_the_any,"[or, of, the, any, to, in, and, section, this,...",[The board may take action against any license...
1,0,290,0_racing_horse_wagering_meeting,"[racing, horse, wagering, meeting, races, thor...",[The total percentage deducted from wagers at ...
2,1,243,1_pharmacy_pharmacist_drug_drugs,"[pharmacy, pharmacist, drug, drugs, prescripti...",[(a) A pharmacy located in the state may provi...
3,2,200,2_bar_state_court_attorney,"[bar, state, court, attorney, legal, supreme, ...",[(a) In addition to any criminal penalties pur...
4,3,174,3_lien_owner_vehicle_lienholder,"[lien, owner, vehicle, lienholder, claim, work...",[A petition for a release order shall be verif...
...,...,...,...,...,...
270,269,11,269_solar_energy_system_installation,"[solar, energy, system, installation, systems,...",[(a) When reviewing a request to install a sol...
271,270,11,270_loan_rate_reverse_mortgage,"[loan, rate, reverse, mortgage, you, lender, i...",[(a) No increase in interest provided for in a...
272,271,11,271_hemodialysis_dialysis_training_technician,"[hemodialysis, dialysis, training, technician,...",[(a) Except during training under immediate su...
273,272,11,272_viticultural_appellation_wine_origin,"[viticultural, appellation, wine, origin, napa...",[(a) (1) The Legislature finds and declares th...


Chunking Dataset

In [8]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from embedding import fetch_entries_with_embeddings_specific_chunk, store_cluster_link_entry, insert_label
db_name = "law_database.db"
emedding_model = "all-MiniLM-L6-v2"
chunk_size = 5000
chunk, num_chunks = fetch_entries_with_embeddings_specific_chunk(db_name, chunk_size, 1)
print(num_chunks)
representation_model = KeyBERTInspired()
base_model = BERTopic(representation_model=representation_model, min_topic_size=15, embedding_model=emedding_model).fit(chunk[0], embeddings=chunk[2])

f = base_model.get_topic_info()
t = f['Name']
topic_names = t.tolist()
print(topic_names)
for topic_name in topic_names:
    insert_label(db_name, topic_name)

d = base_model.get_document_info(chunk[0])
d_doc = d['Document']
d_name = d['Name']

s_doc = d_doc.tolist()
s_name = d_name.tolist()
for i, doc in enumerate(s_doc):
    store_cluster_link_entry(db_name, doc, s_name[i])

for i in range(2, num_chunks + 1):
    chunk_data, s = fetch_entries_with_embeddings_specific_chunk(db_name, chunk_size, i)
    new_model = BERTopic(representation_model=representation_model, min_topic_size=15, embedding_model=emedding_model).fit(chunk_data[0], embeddings=chunk_data[2])
    updated_model = BERTopic.merge_models([base_model, new_model])

    # Let's print the newly discover topics
    nr_new_topics = len(set(updated_model.topics_)) - len(set(base_model.topics_))
    new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
    print("The following topics are newly found:")
    print(f"{new_topics}\n")

    f = new_model.get_topic_info()
    t = f['Name']
    topic_names = t.tolist()
    print(topic_names)
    for topic_name in topic_names:
        insert_label(db_name, topic_name)

    d = new_model.get_document_info(chunk_data[0])
    d_doc = d['Document']
    d_name = d['Name']

    s_doc = d_doc.tolist()
    s_name = d_name.tolist()
    for i, doc in enumerate(s_doc):
        store_cluster_link_entry(db_name, doc, s_name[i])
    
    base_model = updated_model

topic_model = base_model

127
['-1_optometry_optometric_optometrist_license', '0_psychologist_psychologists_psychology_doctoral', '1_nursing_degree_nurses_nurse', '2_residential_residence_facilities_facility', '3_postpartum_midwifery_neonatal_births', '4_supervision_naturopathic_supervising_supervise', '5_optometrist_ophthalmologist_ocular_lens', '6_misconduct_unprofessional_license_discretion', '7_intervention_disciplinary_registered_termination', '8_optometry_optometrist_ophthalmologist_lenses', '9_prescription_prescribing_physician_physicians', '10_residential_residence_facilities_facility', '11_midwife_midwives_nurse_standardized', '12_misdemeanor_conviction_misconduct_alleging', '13_abortion_midwifery_competency_procedure', '14_nursing_enrollment_degree_accreditation', '15_licensure_fee_fees_license', '16_optometry_optometric_optometrist_optometrists', '17_optometry_optometrist_ophthalmologist_ocular', '18_nursing_preceptor_clinical_consultant', '19_optometry_optometrist_ophthalmologist_ocular', '20_renewa

Create a hierarchical map of topics

In [None]:
from bertopic import BERTopic
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model_cluster = BERTopic(hdbscan_model=hdbscan_model)

from embedding import fetch_entries
db_name = "law_database.db"
texts, uuids = fetch_entries(db_name)
topics_cluster, probs_cluster = topic_model_cluster.fit_transform(texts)

topic_model_cluster.get_topic_info()

Create UMAP projection form hierarchical map of topics

In [None]:
from embedding import fetch_entries
db_name = "law_database.db"
docs, uuids = fetch_entries(db_name)
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# Prepare embeddings
# docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Train BERTopic
topic_model = BERTopic().fit(docs, embeddings)

# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
print(reduced_embeddings)


Encode the UMAP projection into a JSON file 
- umap_x
- umap_y
- topic
- document

In [None]:
import json
print(len(topic_model.get_document_info(docs)))
print(len(reduced_embeddings))

document_info = topic_model.get_document_info(docs)
topics = document_info['Name']
documents = document_info['Document']

# Assuming `reduced_embeddings` is your UMAP data and it's a list of lists or a NumPy array
data = [{"umap_x": float(point[0]), "umap_y": float(point[1]), "topic": topic, "document": document} for point, topic, document in zip(reduced_embeddings, topics, documents)]

# Save the data as a JSON file
with open("umap-data.json", "w") as f:
    json.dump(data, f)

Encode the UMAP projection into JSON file with linked text_uuid
- umap_x
- umap_y
- item_id

In [None]:
import json

# Get the document information from BERTopic
d = topic_model.get_document_info(texts)
d_doc = d['Document']
d_name = d['Name']

# Convert to list if necessary
s_doc = d_doc.tolist() if hasattr(d_doc, 'tolist') else d_doc
s_name = d_name.tolist() if hasattr(d_name, 'tolist') else d_name

# Create a list of dictionaries for JSON output
data = []
for i, doc in enumerate(s_doc):
    # Use the UUID corresponding to the document's text
    law_entry_uuid = uuids[i]
    data.append({
        "umap_x": float(reduced_embeddings[i][0]),
        "umap_y": float(reduced_embeddings[i][1]),
        "item_id": law_entry_uuid  # Add the law_entry_uuid here
    })

# Save the data as a JSON file
with open("umap-data.json", "w") as f:
    json.dump(data, f)

Future Work:
Update Topic Representation after Training
https://maartengr.github.io/BERTopic/getting_started/topicrepresentation/topicrepresentation.html

Online Topic Modeling with River:
https://maartengr.github.io/BERTopic/getting_started/online/online.html

Multi Aspect Representations:
https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html

Zero Shot Topic Modeling
https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html

Partial Labels:
https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html#partial-labels

In [8]:
from bertopic import BERTopic
from embedding import fetch_entries_with_user_labels
db_name = "law_database.db"
texts, labels, label_bert_id, uuids = fetch_entries_with_user_labels(db_name)

# Print all the values of labels and their uuids where the label is not equal to -1
for label, uuid in zip(labels, uuids):
    if label_bert_id != -1:
        print(f"Label: {label}, UUID: {uuid}")

if all(label_bert_id == -1 for label in labels):
    print("error no user labels")
else:
    topic_model = BERTopic()
    topics, probs = topic_model.fit_transform(texts, y=label_bert_id)

Label: damages, UUID: 279ea29d-f551-424b-9015-563a3f2d8f4c
Label: damages, UUID: 2881e76d-59b3-4397-99ff-e249b789a1d6
Label: damages, UUID: 2ed722c7-85d9-402e-89b0-e2fc3cb5a316


ValueError: dtype='numeric' is not compatible with arrays of bytes/strings.Convert your data to numeric values explicitly instead.

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from embedding import fetch_entries_with_user_lables_and_embeddings_chunk, store_cluster_link_entry, insert_label
db_name = "law_database.db"
emedding_model = "all-MiniLM-L6-v2"
chunk_size = 5000
### chunk := texts, labels, uuids, label_bert_id, embeddings
chunk, num_chunks = fetch_entries_with_user_lables_and_embeddings_chunk(db_name, chunk_size, 1)
print(num_chunks)
representation_model = KeyBERTInspired()
base_model = BERTopic(representation_model=representation_model, min_topic_size=15, embedding_model=emedding_model).fit_transform(chunk[0], embeddings=chunk[4], y=chunk[3])

f = base_model.get_topic_info()
t = f['Name']
topic_names = t.tolist()
print(topic_names)
for topic_name in topic_names:
    insert_label(db_name, topic_name)

d = base_model.get_document_info(chunk[0])
d_doc = d['Document']
d_name = d['Name']

s_doc = d_doc.tolist()
s_name = d_name.tolist()
for i, doc in enumerate(s_doc):
    store_cluster_link_entry(db_name, doc, s_name[i])

for i in range(2, num_chunks + 1):
    chunk_data, s = fetch_entries_with_embeddings_specific_chunk(db_name, chunk_size, i)
    new_model = BERTopic(representation_model=representation_model, min_topic_size=15, embedding_model=emedding_model).fit_transform(chunk_data[0], embeddings=chunk_data[4], y=chunk[3])
    updated_model = BERTopic.merge_models([base_model, new_model])

    # Let's print the newly discover topics
    nr_new_topics = len(set(updated_model.topics_)) - len(set(base_model.topics_))
    new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
    print("The following topics are newly found:")
    print(f"{new_topics}\n")

    f = new_model.get_topic_info()
    t = f['Name']
    topic_names = t.tolist()
    print(topic_names)
    for topic_name in topic_names:
        insert_label(db_name, topic_name)

    d = new_model.get_document_info(chunk_data[0])
    d_doc = d['Document']
    d_name = d['Name']

    s_doc = d_doc.tolist()
    s_name = d_name.tolist()
    for i, doc in enumerate(s_doc):
        store_cluster_link_entry(db_name, doc, s_name[i])
    
    base_model = updated_model

topic_model = base_model

In [7]:
topic_model.get_topic_info()
topic_model.get_topics()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4130,-1_or_of_the_to,"[or, of, the, to, any, in, section, and, that,...",[In addition to the remedy provided for in Sec...
1,0,292,0_racing_horse_wagering_meeting,"[racing, horse, wagering, meeting, races, thor...",[The total percentage deducted from wagers at ...
2,1,206,1_bar_state_attorney_court,"[bar, state, attorney, court, legal, supreme, ...",[(a) In addition to any criminal penalties pur...
3,2,179,2_lien_owner_vehicle_lienholder,"[lien, owner, vehicle, lienholder, claim, work...",[The lien created by the last preceding sectio...
4,3,176,3_medical_physician_surgeon_physicians,"[medical, physician, surgeon, physicians, surg...",[(a) Any person who does not immediately quali...
...,...,...,...,...,...
276,275,11,275_direct_withheld_retention_contractor,"[direct, withheld, retention, contractor, desi...",[(a) If a direct contractor has withheld a ret...
277,276,10,276_competition_induce_subsidy_unlawful,"[competition, induce, subsidy, unlawful, discr...",[It is unlawful for any manufacturer to pay or...
278,277,10,277_child_internet_licensees_disclose,"[child, internet, licensees, disclose, informa...",[(a) (1) In addition to publishing the summary...
279,278,10,278_grant_grantee_delivered_interpreted,"[grant, grantee, delivered, interpreted, deliv...",[A grant cannot be delivered to the grantee co...


LLM generated labels:
https://maartengr.github.io/BERTopic/getting_started/representation/llm#zephyr-mistral-7b

In [None]:
%pip install ctransformers
%pip install --upgrade git+https://github.com/huggingface/transformers

In [None]:
from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline
from bertopic.representation import TextGeneration
from bertopic import BERTopic

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/zephyr-7B-alpha-GGUF",
    model_file="zephyr-7b-alpha.Q4_K_M.gguf",
    model_type="mistral",
    gpu_layers=50,
    hf=True
)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")

# Pipeline
generator = pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    max_new_tokens=50,
    repetition_penalty=1.1
)

prompt = """<|system|>You are a helpful, respectful and honest assistant for labeling topics..</s>
<|user|>
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.</s>
<|assistant|>"""

# Text generation with Zephyr
zephyr = TextGeneration(generator, prompt=prompt)
representation_model = {"Zephyr": zephyr}

# Topic Modeling
topic_model = BERTopic(representation_model=representation_model, verbose=True)

In [None]:
from embedding import fetch_entries
db_name = "law_database.db"
texts, uuids = fetch_entries(db_name)

topics, probs = topic_model.fit_transform(texts)

In [None]:
topic_model.get_topic_info()
topic_model.get_topics()