In [None]:
from bertopic import BERTopic


In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

## Getting data from Kaggle

In [None]:
# Name of the CSV file inside the Kaggle dataset; passed to the dataset loader.
file_path = "spam_mail_classifier.csv"


In [None]:
# Load `file_path` from the "sahideseker/spam-mail-classifier-dataset" Kaggle
# dataset through the pandas adapter and bind the result to `df`.
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "sahideseker/spam-mail-classifier-dataset",
  file_path,
  # Optional extra arguments such as sql_query or pandas_kwargs can be
  # supplied here. See the documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

## Head of data

In [None]:
# Preview the first five records. The frame is left as the cell's last
# expression so Jupyter renders it as a table; print()-ing it after a label
# string emits plain text with the column header pushed onto the label's line,
# which misaligns the columns.
df.head()

## Data distribution 

In [None]:
# Histogram of the `label` column: two bins, a small 2x2 figure, and bars drawn
# at 90% of the bin width so they are visually separated.
# NOTE(review): bins=2 presumably matches a two-class label — confirm.
df['label'].hist(figsize=(2, 2), bins=2, rwidth=0.9)

# Step 1

## Training BERTopic

BERTopic works by converting documents into numerical values, called embeddings. This process can be very costly, especially if we want to iterate over parameters. Instead, we can calculate those embeddings once and feed them to BERTopic to skip calculating embeddings each time.

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the same instance is later handed to BERTopic.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every email once so the vectors can be reused across experiments.
# encode() receives a plain list rather than the pandas Series: it is
# documented for str / list inputs and indexes its input by integer, which on
# a Series is a label lookup that only lines up for a default 0..n-1 index.
embeddings = embedding_model.encode(df['email_text'].tolist(), show_progress_bar=True)


<b> Preventing Stochastic Behavior </b>

In BERTopic, we generally use a dimensionality reduction algorithm to reduce the size of the embeddings. This is done to prevent the curse of dimensionality to a certain degree.
As a default, this is done with UMAP which is an incredible algorithm for reducing dimensional space. However, by default, it shows stochastic behavior which creates different results each time you run it. To prevent that, we will need to set a random_state of the model before passing it to BERTopic.
As a result, we can now fully reproduce the results each time we run the model.

In [None]:
from umap import UMAP

# Dimensionality-reduction step for BERTopic. Pinning random_state makes the
# UMAP projection repeatable from one run to the next.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

<b>Controlling Number of Topics</b>

There is a parameter to control the number of topics, namely nr_topics. This parameter, however, merges topics after they have been created. It is a parameter that supports creating a fixed number of topics.
However, it is advised to control the number of topics through the cluster model which is by default HDBSCAN. HDBSCAN has a parameter, namely min_cluster_size that indirectly controls the number of topics that will be created.

In [None]:
from hdbscan import HDBSCAN

# Clustering step for BERTopic. min_cluster_size indirectly controls how many
# topics are produced.
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

In [None]:
# Assemble BERTopic from the explicitly configured embedding, UMAP and HDBSCAN
# components.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
)

# Hand over the embeddings that were already computed with `embedding_model`
# so BERTopic does not re-encode every email during fitting. The embedding
# model is still attached to the topic model for later steps that need it.
topics, probs = topic_model.fit_transform(df['email_text'], embeddings=embeddings)


In [None]:
# Bar chart of the top-scoring words per topic for the freshly fitted model.
topic_model.visualize_barchart()

# Step 2

## Improving topics representation

<b>Improving Default Representation</b>

The default representation of topics is calculated through c-TF-IDF. However, c-TF-IDF is powered by the CountVectorizer, which converts text into tokens. Using the CountVectorizer, we can do a number of things:
- Remove stopwords
- Ignore infrequent words
- Increase the n-gram range

In other words, we can preprocess the topic representations <b>after</b> documents are assigned to topics. <b>This will not influence the clustering process in any way.</b>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Tokenizer used to rebuild the topic representations: drops English stop
# words, keeps 1- to 3-grams, and filters terms by document frequency.
# NOTE(review): fractional min_df / max_df are proportions of the documents
# the vectorizer is fitted on; BERTopic presumably fits it on one merged
# document per topic, so these thresholds scale with the topic count — confirm.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 3),
    stop_words="english",
    min_df=0.1,
    max_df=0.9,
)

In [None]:
# Recompute the topic representations with the custom vectorizer.
topic_model.update_topics(
    df['email_text'],
    vectorizer_model=vectorizer_model,
)


In [None]:
# Redraw the per-topic word bar chart after update_topics() was called with
# the custom CountVectorizer.
topic_model.visualize_barchart()

In [None]:
# Render BERTopic's topic-level overview plot for the current model.
topic_model.visualize_topics()

In [None]:
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech


In [None]:
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech


In [None]:
# Build three additional representation models that describe each topic.

# KeyBERT-inspired representation
keybert_model = KeyBERTInspired()

# Part-of-speech representation using the "en_core_web_trf" model
# ("en_core_web_sm" is the alternative that was tried before).
pos_model = PartOfSpeech("en_core_web_trf")

# Maximal Marginal Relevance representation with diversity set to 0.9
mmr_model = MaximalMarginalRelevance(diversity=0.9)

# Aspect name -> representation model, passed to BERTopic as one mapping.
representation_model = dict(
    KeyBERT=keybert_model,
    MMR=mmr_model,
    POS=pos_model,
)

In [None]:
# Rebuild the topic representations once more, this time with the custom
# vectorizer plus the dictionary of representation models, keeping ten words
# per topic.
topic_model.update_topics(
    df['email_text'],
    top_n_words=10,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

In [None]:
# Redraw the per-topic word bar chart after update_topics() was called with
# the representation models.
topic_model.visualize_barchart()

In [None]:
# Display the topic info table for the updated model.
topic_model.get_topic_info()


In [None]:
# Print every representation of a single topic as right-aligned word / score
# rows, with a separator line after each representation.
topic_id = 1
tpcs = topic_model.get_topic(topic_id, full=True)

# get_topic() returns False instead of a dict when the topic id is unknown
# (e.g. clustering produced fewer topics); fail with a clear message rather
# than an opaque TypeError from iterating a bool.
if not tpcs:
    raise ValueError(f"Topic {topic_id} does not exist in the fitted model")

for aspect, word_scores in tpcs.items():
    print(aspect)
    for word, score in word_scores:
        print(f" {word:>30}    {score:.2f}")
    print('*'*80)




In [None]:
# Sum the `Count` column of the topic info table.
# NOTE(review): presumably this should equal the number of documents — confirm.
topic_model.get_topic_info()['Count'].sum()


In [None]:
# Number of entries in the object returned by get_topic(1, full=True).
len(tpcs)


In [None]:
# Inspect the Python type of the get_topic(..., full=True) result.
type(tpcs)

In [None]:
# List the keys available in `tpcs`; the cells that follow index into it by
# "Main", "KeyBERT", "MMR" and "POS".
tpcs.keys()

In [None]:
# Entries stored under the "Main" key for the selected topic.
tpcs['Main']

In [None]:
# Entries stored under the "KeyBERT" key (the KeyBERTInspired model).
tpcs['KeyBERT']

In [None]:
# Entries stored under the "MMR" key (the MaximalMarginalRelevance model).
tpcs['MMR']

In [None]:
# Entries stored under the "POS" key (the PartOfSpeech model).
tpcs['POS']