# Example using BERTopic

## Data processing

Let's import the required libraries.

In [1]:
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import polars as pl
from sentence_transformers import SentenceTransformer
from sklearn.datasets import fetch_20newsgroups
from umap import UMAP
from bertopic import BERTopic

In [3]:
# Load the training split of the 20 Newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train')
docs = np.array(newsgroups_train.data)
# Subset the data to reduce the size of docs.
# A seeded generator keeps the sample identical across kernel restarts, and
# replace=False prevents the same document from being drawn more than once
# (np.random.randint samples with replacement, so duplicates were possible).
rng = np.random.default_rng(42)
idxs = rng.choice(len(docs), size=min(200, len(docs)), replace=False)
docs = docs[idxs]

In [6]:
def initial_clean(docs, idx=57, collapse_whitespace=False):
    """
    Apply a collection of regexes that clean raw text documents.

    Each pattern below (HTML tags/entities, truncated HTML fragments, email
    addresses, multi-digit numbers, UK-style postcodes) is replaced with a
    single space, in that order.

    Parameters
    ----------
    docs : sequence of str
        Documents to clean; converted to a polars Series.
    idx : int, default 57
        Position of the document printed before and after cleaning as a
        sanity check. The preview is skipped when `idx` does not address an
        existing document, instead of failing on short inputs.
    collapse_whitespace : bool, default False
        When True, runs of two or more whitespace characters (including
        newlines) are collapsed into one space and the result is re-stripped.
        Off by default so the returned text matches the earlier behaviour.

    Returns
    -------
    pl.Series
        The cleaned documents, in the same order as `docs`.
    """
    # some text cleaning example
    html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
    html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
    email_pattern_regex = r'\S*@\S*\s?'
    nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
    postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
    multiple_spaces_regex = r'\s{2,}'

    text = pl.Series(docs).str.strip_chars()

    # Guard the preview: indexing past the end of the Series would otherwise
    # abort the whole cleaning run for small inputs.
    show_preview = -len(text) <= idx < len(text)
    if show_preview:
        print("-"*10 +"Raw"+"-"*10 )
        print(text[idx])

    # Order matters: full HTML tags are removed before the truncated-tag
    # pattern, and emails before the number pattern.
    for pattern in (
        html_pattern_regex,
        html_start_pattern_end_dots_regex,
        email_pattern_regex,
        nums_two_more_regex,
        postcode_pattern_regex,
    ):
        text = text.str.replace_all(pattern, ' ')

    if collapse_whitespace:
        # Every substitution above inserts a space, so runs of whitespace
        # accumulate; tidy them up only when the caller asks for it.
        text = text.str.replace_all(multiple_spaces_regex, ' ').str.strip_chars()

    if show_preview:
        print("-"*10 +"After"+"-"*10 )
        print(text[idx])
    return text

In [7]:
# Run the regex clean-up; initial_clean also prints one document before and after cleaning
clean_text = initial_clean(docs)
# Rebind `docs` to the cleaned text as a plain Python list for the cells below
docs = clean_text.to_list()

----------Raw----------
From: franceschi@pasadena-dc.bofa.com
Subject: Re: Gov't break-ins (Re: 60 minutes)
Organization: Bank America Systems Engineering, Pasadena, CA
Lines: 20

On a Los Angeles radio station last weekend, the lawyers for the
family of the MURDERED rancher said that the Los Angeles Sheriff's
Department had an assessment done of the rancher's property before
the raid.

This strongly implies that the sheriff's department wanted the property;
any drugs (which were not found) were only an excuse.

In Viet Nam, Lt Calley was tried and convicted of murder because his
troops, in a war setting, deliberately killed innocent people. It is time
that the domestic law enforcement agencies in this country adhere to
standards at least as moral as the military's.

Greed killed the rancher, possibly greed killed the Davidian children.
Government greed.

It is time to prosecute the leaders who perform these invasions.


Fred Franceschi   (These are my own opinions!)
----------After---

## Embeddings

In [3]:
#| hide
# BERTopic??
# SentenceTransformer??

In [11]:
# Sentence-embedding model handed to BERTopic further down
embeddings_name = "BAAI/bge-small-en-v1.5"
embedding_model = SentenceTransformer(embeddings_name)
# Encode every document up front, in batches of 32, with a progress bar
embeddings_out = embedding_model.encode(
    docs,
    batch_size=32,
    show_progress_bar=True,
)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

### Reduce Document embeddings matrix size

In [12]:
# Dimensionality reduction applied to the document embeddings (fixed seed)
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    low_memory=False,
    random_state=42,
)
# Bag-of-words model (unigrams and bigrams, English stop words removed)
vectoriser_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=0.1,
    max_df=0.95,
)
min_docs_slider = 5  # min docs required to make topic
max_topics_slider = 50  # max number of topics

## Create a Topic Model

In [13]:
# Assemble the topic model from the components configured in the earlier cells
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    vectorizer_model=vectoriser_model,
    min_topic_size=min_docs_slider,
    nr_topics=max_topics_slider,
    calculate_probabilities=True,
    verbose=True,
)
# Fit on the documents together with their pre-computed embeddings
assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)

2024-05-30 16:01:07,426 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-30 16:01:11,572 - BERTopic - Dimensionality - Completed ✓
2024-05-30 16:01:11,573 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-30 16:01:11,592 - BERTopic - Cluster - Completed ✓
2024-05-30 16:01:11,593 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-30 16:01:11,721 - BERTopic - Representation - Completed ✓
2024-05-30 16:01:11,722 - BERTopic - Topic reduction - Reducing number of topics
2024-05-30 16:01:11,724 - BERTopic - Topic reduction - Reduced number of topics from 10 to 10


In [14]:
# Shape of the probability vector returned for the first document
# (presumably one entry per non-outlier topic — confirm against the BERTopic docs)
probs[0].shape

(9,)

In [15]:
# Topic assigned to each of the first ten documents (-1 appears to mark outliers)
assigned_topics[:10]

[6, -1, 5, 7, 7, 0, 6, 5, 5, 5]

### Reduce Outliers

In [16]:
# Reassign outlier documents (topic -1) with the "embeddings" strategy: each outlier's
# document embedding is compared with the topic embeddings using cosine similarity and
# the best-matching topic is chosen.
# The embeddings computed earlier are passed in so the documents are not encoded again.
assigned_topics = topic_model.reduce_outliers(
    docs,
    assigned_topics,
    strategy="embeddings",
    embeddings=embeddings_out,
)

In [17]:
# Refresh the topic representations so they reflect the reassigned documents
topic_model.update_topics(
    docs,
    topics=assigned_topics,
    vectorizer_model=vectoriser_model,
)
# Build three-word labels joined by ", " and register them as the topic labels
new_topic_labels = topic_model.generate_topic_labels(
    nr_words=3,
    separator=", ",
)
topic_model.set_topic_labels(new_topic_labels)



In [18]:
# Display the generated topic labels
new_topic_labels

['0, play, shots, team',
 '1, god, jesus, say',
 '2, armenian, armenians, said',
 '3, water, riding, writes article',
 '4, clipper, chip, government',
 '5, ax, ax ax, max',
 '6, dc, development, towers',
 '7, militia, gun, government',
 '8, blood, gordon banks, banks']