# Modelling
### Jumbo data

<hr>

In [None]:
# Mount Google Drive at /content/drive; every data path used below lives
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import the libraries

In [None]:
!pip install bertopic

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

### Load the data

In [None]:
# Load the JUMBO training sentences per intent from Excel.
# NOTE(review): `data` is rebound by the CSV loads in the next two cells, so in
# a top-to-bottom run this frame is replaced before it is used.
data = pd.read_excel("/content/drive/MyDrive/test-data/JUMBO-trainingszinnen per intent.xlsx")

In [None]:
# Load the deep-learning questions test set.
# NOTE(review): `data` is rebound again by the next cell, so in a top-to-bottom
# run this frame is replaced before it is used.
data = pd.read_csv("/content/drive/MyDrive/test-data/deeplearning_questions.csv")

In [None]:
# Load the lemmatized corpus; its `lemmatized` column supplies the sentences
# that are embedded and topic-modelled below.
data = pd.read_csv("/content/drive/MyDrive/BERTopic+embeddings/lemmatized_compl.csv")

## Sentence Transformers

### Install the sentence transformers

In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
# Sentence transformer embedding used
# Alternatives that were tried are kept commented out; only one model is active.
# sentence_model = SentenceTransformer("all-distilroberta-v1")
# sentence_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
# sentence_model = SentenceTransformer("all-mpnet-base-v2")

# Active model: this object is also passed to BERTopic as `embedding_model`.
sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

In [None]:
# Documents to embed, as a plain Python list. A list is indexed by position
# (a pandas Series is indexed by label) and matches the list form this
# notebook later builds for BERTopic.
sentences = data["lemmatized"].tolist()

In [None]:
# Encode the sentences
# Produces the embedding array that is saved, reduced with UMAP and plotted below.
sentence_embedding = sentence_model.encode(sentences)

In [None]:
# Save the embedding
# Cache the array on Drive; the file name records the model that produced it.
np.save("/content/drive/MyDrive/BERTopic+embeddings/compl-ST-paraphrase-multilingual-MiniLM-L12-v2-with-dlp.npy", sentence_embedding)

In [None]:
# Load the embedding
# Read back the file written by the save cell above, so the cached vectors come
# from the same model as the active `sentence_model`. The previous path pointed
# at the distiluse embeddings, which silently replaced the freshly computed
# vectors with those of a different model.
sentence_embedding = np.load("/content/drive/MyDrive/BERTopic+embeddings/compl-ST-paraphrase-multilingual-MiniLM-L12-v2-with-dlp.npy")

In [None]:
# Sanity check on the embedding array's dimensions.
sentence_embedding.shape

(20972, 768)

## Dimensionality Reduction (UMAP)

> UMAP (Uniform Manifold Approximation and Projection) is a dimensionality reduction technique that can be used to visualize high-dimensional data in lower-dimensional space. It is particularly well-suited for preserving local structure in the data, which can be important for identifying clusters or groups of similar data points.

In [None]:
import locale

# Colab workaround: force UTF-8 as the preferred encoding.
# The replacement keeps the optional `do_setlocale` parameter of the real
# `locale.getpreferredencoding`, so callers that pass it (positionally or by
# keyword) keep working instead of raising TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
!pip install umap-learn

In [None]:
# Global plot styling: white background, notebook-sized fonts and a 10x6 inch
# default figure size for the matplotlib plots below.
sns.set(style="white", context="notebook", rc={"figure.figsize":(10,6)})

In [None]:
import umap

# Configure UMAP for the sentence embeddings. The same object is passed to
# BERTopic as `umap_model`, so these settings also drive the topic clustering.
umap_model = umap.UMAP(n_neighbors=15,
                       min_dist=0.3,  # 0.6 was also tried
                      #  n_components=3, # 3 so it can be plotted in 3D space
                       metric="cosine",
                       low_memory=True,
                       # UMAP is stochastic: without a fixed seed the projection
                       # (and every topic derived from it) changes on each run.
                       # Uses the same seed value as the LDA test below.
                       random_state=42)

In [None]:
# Fit UMAP on the sentence embeddings; `umap_` holds the reduced coordinates
# (one row per sentence) used for the plots below.
umap_ = umap_model.fit_transform(sentence_embedding)

In [None]:
# Visualize the results
# Plot the first two dimensions of the reduced embeddings. The coordinates
# live in `umap_` (the output of fit_transform); `umap_model` is the estimator
# object and cannot be sliced.
plt.scatter(umap_[:, 0], umap_[:, 1])
plt.show()

In [None]:
import plotly.express as px

# Interactive version of the 2D scatter. The coordinates come from `umap_`
# (the fit_transform output), not from the `umap_model` estimator object.
# umap_[:, 0] represents the x-coordinate
# umap_[:, 1] represents the y-coordinate
fig = px.scatter(x=umap_[:, 0], y=umap_[:, 1])

# Display the plot
fig.show()

In [None]:
import plotly.graph_objects as go

# A 3D scatter needs three UMAP components, but `umap_model` is configured
# without `n_components` and therefore produces only two. Fit a dedicated 3D
# projection that reuses the hyperparameters of `umap_model`, so the reducer
# handed to BERTopic is left untouched.
umap_3d_params = {**umap_model.get_params(deep=False), "n_components": 3}
umap_3d = umap.UMAP(**umap_3d_params).fit_transform(sentence_embedding)

fig = go.Figure(data=[go.Scatter3d(
    x=umap_3d[:, 0],
    y=umap_3d[:, 1],
    z=umap_3d[:, 2],
    mode='markers',
    marker=dict(
        size=5,
        opacity=0.8
    )
)])

# Set the layout of the plot
fig.update_layout(scene=dict(
    xaxis=dict(title='X'),
    yaxis=dict(title='Y'),
    zaxis=dict(title='Z')
))

# Display the plot
fig.show()

## Clustering (HDBSCAN/k-Means)

> HDBSCAN is a density-based clustering algorithm that can discover clusters of varying shapes and sizes. It builds a minimum spanning tree over the mutual-reachability distances between points, turns that tree into a hierarchy of clusters, and keeps the most stable clusters; points in low-density regions are labelled as noise (`-1`).

In [None]:
!pip install hdbscan

In [None]:
import hdbscan
# Density-based clusterer that is passed to BERTopic as `hdbscan_model`.
# The metric and cluster_selection_method lines are left commented out, so the
# library defaults apply for both.
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=9, # was 15
                                min_samples=5, # was 5
                                # metric="euclidean",
                                # cluster_selection_method="eom",
                                prediction_data=True)

### k-Means

In [None]:
from sklearn.cluster import KMeans

# Alternative clusterer with a fixed number of 60 clusters.
# NOTE(review): `cluster_model` is not referenced by the BERTopic constructor
# in this notebook, which receives `hdbscan_model` - confirm whether this cell
# is still needed.
cluster_model = KMeans(n_clusters=60)

## CountVectorizer

In [None]:
!pip install bertopic

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

from bertopic import BERTopic

In [None]:
# Hand-picked, corpus-specific stop words (greetings, names, chat artefacts).
# Entries and their order are unchanged, including the repeated ones.
stopw = [
    "opop", "good", "mama", "jumbocomdoemee", "evening",
    "maag", "wish", "jerry", "test", "hsllo",
    "widhia", "hema", "bolcom", "ericht", "octa",
    "moeder", "please", "would", "heyhoi", "thank",
    "nice", "addaccbcb", "message", "furg", "goed",
    "waarmee", "gaan", "openbestellinggaanpassenbestelling", "goedenavond", "sanne",
    "goed", "vanavond", "mogelijk", "gaan", "timeout",
    "message", "gabrin", "unsupported", "text", "werkdag",
    "nieuwjaar", "jaar", "dag", "waarmee", "attachment",
    "type", "chayenne",
]

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Extend the hand-picked stop words from the previous cell with NLTK's Dutch
# list instead of replacing them (a plain reassignment discarded the custom
# words). Going through a set removes duplicates and makes the cell safe to
# re-run; sorting keeps the resulting order deterministic.
stopw = sorted(set(stopw) | set(stopwords.words("dutch")))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Tokenize topics (min_df=3 or min_df=10)
# Unigram bag-of-words vectorizer that filters the words in `stopw`; no min_df
# is set here, the values above are the ones that were experimented with.
vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words=stopw)

### LDA (TEST)

Latent Dirichlet Allocation (LDA) is a probabilistic topic-modelling technique, tried here as a test alongside BERTopic. It models each document as a mixture of topics and each topic as a distribution over words, with Dirichlet priors on both distributions.

In [None]:
# Unigram + bigram vectorizer for the LDA test. This rebinds
# `vectorizer_model`, which is also the object handed to BERTopic further
# down, so it keeps the stop-word list of the vectorizer defined above instead
# of silently dropping it.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopw)

In [None]:
# Fit the vectorizer on the sentences and build the document-term count matrix
# that LDA is trained on.
vectorized_data = vectorizer_model.fit_transform(sentences)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Fit a 10-topic LDA model with a fixed seed for reproducibility.
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
# `topics` is the fit_transform output that the next cell iterates per document.
# NOTE(review): the name `topics` is rebound later by BERTopic's fit_transform.
topics = lda_model.fit_transform(vectorized_data)

In [None]:
# Interpret the results: preview the dominant topics of the first few documents.
# `topics` is the document-topic matrix returned by `lda_model.fit_transform`,
# so each row holds one document's weight for every topic.
n_preview_docs = 10  # printing every document floods the cell output
n_top_topics = 2     # same number of topics as the previous `[:-3:-1]` slice

for doc_idx, topic_dist in enumerate(topics[:n_preview_docs]):
    top_topics = topic_dist.argsort()[::-1][:n_top_topics]  # highest weight first
    print(f"Document {doc_idx + 1}:")
    for topic_idx in top_topics:
        # Report this document's weight for the topic. The previous version
        # printed `lda_model.components_[topic_idx]`, the topic's full
        # vocabulary-sized word-weight vector, which is identical for every
        # document and says nothing about the document itself.
        print(f"Topic {topic_idx}: {topic_dist[topic_idx]:.3f}")
    print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 27.63298215]

Document 45994:
Topic 2: [0.10024625 1.73140167 0.1        ... 0.10000094 0.10001208 0.10001328]
Topic 7: [1.00000000e-01 1.00021163e-01 1.00002305e-01 ... 1.00009829e-01
 1.22342391e+02 1.57115994e+01]

Document 45995:
Topic 5: [ 0.10007406  0.10001678 28.09996948 ...  1.31090092  0.10003913
  6.23228109]
Topic 4: [ 0.10001186  0.10004162  0.10000131 ...  0.10001419 18.02096433
 35.62935201]

Document 45996:
Topic 1: [ 0.10000061  0.10001519  0.10000794 ...  0.10003897 89.94020819
 16.90809085]
Topic 5: [ 0.10007406  0.10001678 28.09996948 ...  1.31090092  0.10003913
  6.23228109]

Document 45997:
Topic 4: [ 0.10001186  0.10004162  0.10000131 ...  0.10001419 18.02096433
 35.62935201]
Topic 7: [1.00000000e-01 1.00021163e-01 1.00002305e-01 ... 1.00009829e-01
 1.22342391e+02 1.57115994e+01]

Document 45998:
Topic 8: [ 0.10010775  0.10000614  0.10000186 ...  0.10000262  3.63698591
 10.93756392]
Topic 1: [ 0.10

In [None]:
# List the highest-weighted vocabulary terms of every LDA topic.

feature_names = vectorizer_model.get_feature_names_out()
num_top_words = 6  # how many terms to show per topic

for topic_idx, topic in enumerate(lda_model.components_):
    # Sort term indices by descending weight and keep the leading ones.
    ranked = topic.argsort()[::-1][:num_top_words]
    top_words = [feature_names[i] for i in ranked]
    print("Topic {}: {}".format(topic_idx, ", ".join(top_words)))

Topic 0: order, sen, customer, store, sent, delivery
Topic 1: bestelling, betalen, krijgen, ontvangen, code, mail
Topic 2: punt, online, krijgen, staan, extra, winkel
Topic 3: zegel, ontvangen, digitaal, bestelling, post, krijgen
Topic 4: product, bestelling, leveren, ontvangen, bestellen, krijgen
Topic 5: bestelling, bericht, annuleren, bezorgen, komen, staan
Topic 6: prijs, product, vinden, keer, komen, klant
Topic 7: winkel, terug, kopen, product, foto, klacht
Topic 8: account, nieuw, lukken, emailadres, staan, proberen
Topic 9: winkel, vraag, contact, bericht, product, filiaal


## c-TF-IDF

In [None]:
# create topic representation
# c-TF-IDF transformer passed to BERTopic as `ctfidf_model`, with BM25
# weighting and the reduce_frequent_words option both switched on.
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

## Representation layer

In [None]:
from bertopic.representation import KeyBERTInspired

# Fine-tune topic representations
# NOTE(review): `representation_model` is not the object passed to the BERTopic
# constructor in this notebook (that is `representation_model_mmr`) - confirm
# whether this KeyBERT-inspired variant is still needed.
representation_model = KeyBERTInspired(top_n_words=2)

In [None]:
# Documents for BERTopic as a plain Python list of the `lemmatized` column.
sentences = list(data["lemmatized"])

## Maximal Marginal Relevance (MMR)

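> When we calculate the weights of keywords, we typically do not consider whether we already have similar keywords in our topic. Words like "car" and "cars" essentially represent the same information and are often redundant. Maximal Marginal Relevance reduces this redundancy by trading off a keyword's relevance against its similarity to the keywords already selected.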

Reference: https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#partofspeech

In [None]:
from bertopic.representation import MaximalMarginalRelevance

In [None]:
# Diversity varies between 0 and 1. 0 - no diversity; 1 - high diversity
# This MMR model is the one passed to BERTopic as `representation_model`.
representation_model_mmr = MaximalMarginalRelevance(diversity=0.6)

## BERTopic

In [None]:
from bertopic import BERTopic

In [None]:
# Assemble the BERTopic pipeline from the components configured above.
topic_model = BERTopic(
    language="multilingual",                        # Per the BERTopic docs, not used when `embedding_model` is given
    embedding_model=sentence_model,                 # Step 1 - Extract embeddings
    umap_model=umap_model,                          # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                    # Step 3 - Cluster reduced embeddings
    top_n_words=10,                                 # The number of words per topic to extract
    min_topic_size=20,                              # Per the BERTopic docs, not used when a custom `hdbscan_model` is given; its min_cluster_size applies instead
    nr_topics="auto",                               # Automatic topic reduction
    vectorizer_model=vectorizer_model,              # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                      # Step 5 - Extract topic words
    representation_model=representation_model_mmr,  # Step 6 - Fine-tune topic representations
    calculate_probabilities=True                    # Enable this to perform soft clustering
)

In [None]:
# Fit the topic model on the sentences. `topics` and `probs` are the inputs
# that the outlier-reduction step further down works with.
topics, probs = topic_model.fit_transform(sentences)

In [None]:
# Save the BERTopic model
# topic_model.save("/content/drive/MyDrive/BERTopic+embeddings/bert_model_external-data-dutch-news", serialization="safetensors", save_ctfidf=True, save_embedding_model=sentence_model)

In [None]:
# Show the configuration the fitted model holds.
# NOTE(review): the positional 0 is presumably the `deep` flag (falsy) - confirm.
topic_model.get_params(0)

{'calculate_probabilities': False,
 'ctfidf_model': ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True),
 'embedding_model': <bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x7f8f51a1a440>,
 'hdbscan_model': HDBSCAN(min_cluster_size=20, min_samples=8, prediction_data=True),
 'language': None,
 'low_memory': False,
 'min_topic_size': 20,
 'n_gram_range': (1, 1),
 'nr_topics': 'auto',
 'representation_model': MaximalMarginalRelevance(diversity=0.6),
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.3, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 'vectorizer_model': CountVectorizer(min_df=10, ngram_range=(1, 2),
                 stop_words=['mama', 'jumbocomdoemee', 'evening', 'maag', 'wish',
                             'jerry', 'test', 'hsllo', 'widhia', 'hema',
                      

In [None]:
# Overview table with one row per topic: id, size, name, top words and
# representative documents.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,19,-1_allergie_contactpersoon_bedrijfsgegevens_fa...,"[allergie, contactpersoon, bedrijfsgegevens, f...","[Ik heb een nieuwe allergie, hoe voeg ik deze ..."
1,0,23,0_facturen_referentie_gebruik_account,"[facturen, referentie, gebruik, account, kredi...","[Ik heb een ander adres voor mijn facturen, ho..."
2,1,13,1_lukt_jumbo_veranderen_bedrijfslocatie,"[lukt, jumbo, veranderen, bedrijfslocatie, kla...",[Ik wil mijn bedrijfsnaam wijzigen in mijn Jum...


In [None]:
# Show 10 randomly sampled entries of the `Document` column from the
# per-document info table.
topic_model.get_document_info(sentences)['Document'].sample(10)

In [None]:
# Return the topics that are semantically most similar to an input query term

# 3 most similar topics to the specified word
similar_topics, similarity = topic_model.find_topics("order", top_n=3)

# One loop replaces three copy-pasted print pairs. It prints exactly the same
# text, and no longer raises IndexError when fewer than three topics come back.
for rank, (topic_id, score) in enumerate(zip(similar_topics, similarity)):
    # Only the first header has no leading blank line, as before.
    header = "Most Similar Topic Info: " if rank == 0 else "\n Most Similar Topic Info: "
    print("{}\n{}".format(header, topic_model.get_topic(topic_id)))
    print("Similarity Score: {}".format(score))

Most Similar Topic Info: 
[('bestelling', 0.07713705014861921), ('bericht', 0.020167197550156075), ('plaatsen', 0.01931668782500717), ('aanpassen', 0.016779387457618214), ('contact', 0.013915572181253883), ('thuis', 0.012733356657353008), ('annuleeren', 0.012380238074127747), ('systeem', 0.011366824133617826), ('technisch', 0.010728406139016409), ('vergeten', 0.01064658123173959)]
Similarity Score: 0.8693855949191716

 Most Similar Topic Info: 
[('kiezen', 0.02551901112094343), ('bestelling', 0.023763890151151774), ('boodschappen', 0.017818497347323985), ('bericht', 0.014179490865535382), ('bezorging', 0.013184710138071216), ('tijdslot', 0.012246536318336472), ('thuis', 0.011639152860885326), ('adres', 0.011018298126990953), ('contact', 0.010585090780144565), ('week', 0.010535523614094822)]
Similarity Score: 0.7569290365740581

 Most Similar Topic Info: 
[('terugbetaling', 0.025880799073059624), ('ontvangen', 0.019298528794698903), ('bestelling', 0.017180304217096643), ('rekening', 0.0

In [None]:
# Reduce outliers using the `probabilities` strategy
# Reference: https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html#topic-distributions
# This strategy uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document.
# To use this, make sure to calculate the probabilities beforehand by instantiating BERTopic with `calculate_probabilities=True`.

# `new_topics` is the reassigned topic list; it is applied to the model by the
# `update_topics` call in the next cell.
new_topics = topic_model.reduce_outliers(sentences, topics, probabilities=probs, strategy="probabilities")

In [None]:
# Apply the outlier-reduced assignments and recompute the topic words.
# `update_topics` rebuilds the representations from scratch and falls back to
# default components for anything not passed in, so hand it the same
# vectorizer (with its stop words), c-TF-IDF transformer and representation
# model that the topic model was created with. Without them, stop words such
# as "ik" and "mijn" end up as topic words.
topic_model.update_topics(
    sentences,
    topics=new_topics,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model_mmr,
)

In [None]:
# Inspect the topic overview again after the outlier reassignment.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,31,0_ik_mijn_mailadres_kan,"[ik, mijn, mailadres, kan, hoe, heb, aan, een,...","[Ik heb een ander adres voor mijn facturen, ho..."
1,1,24,1_mijn_ik_bedrijfsnaam_kan,"[mijn, ik, bedrijfsnaam, kan, wijzigen, hoe, c...",[Ik wil mijn bedrijfsnaam wijzigen in mijn Jum...


In [None]:
# Now you can use the transform method
# Predict the topic of a single unseen sentence with the fitted model.
new_text = "ik heb een probleem met mijn bestelling"
topic, confidence = topic_model.transform([new_text])
# Prints the first predicted topic id and the whole `confidence` object as returned.
print(f"Predicted topic: {topic[0]}, Confidence: {confidence}")

Predicted topic: 45, Confidence: None


In [None]:
# Reduce topics
# Merge topics down to a target of 200; the call returns the model itself,
# which is why the cell output shows the BERTopic object.
topic_model.reduce_topics(sentences, nr_topics=200)

<bertopic._bertopic.BERTopic at 0x7f1defb35a50>

In [None]:
# Inspect the topic overview after topic reduction.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,62813,-1_season_teams_fans_end,"[season, teams, fans, end, happened, lebron, h...","[ Yeah, I don't think I've seen any of his dra..."
1,0,5335,0_wealthiest_beard_bush_runner,"[wealthiest, beard, bush, runner, guest, photo...",[ I agree. Did you know the top 3 wealthiest p...
2,1,4867,1_evening_weekend_chatting_today,"[evening, weekend, chatting, today, pleasure, ...",[ I agree. Well it was very nice chatting with...
3,2,4577,2_bart_flowers_homer_cancelled,"[bart, flowers, homer, cancelled, producers, 7...",[ Very odd! And I thought it was funny that th...
4,3,3821,3_horror_bambi_lantern_calories,"[horror, bambi, lantern, calories, danny, come...",[ It's like Stephen King: many of the movies a...
...,...,...,...,...,...
195,194,12,194_twains_typewritter_tabloids_lovers,"[twains, typewritter, tabloids, lovers, river,...","[ Are you a Shania Twain fan?, Shania Twain t..."
196,195,12,195_florida_salts_bath_outback,"[florida, salts, bath, outback, steakhouse, st...",[ Both Bortles and Tebow attended college in t...
197,196,12,196_walks_backwards_avoid_paparazzi,"[walks, backwards, avoid, paparazzi, picture, ...",[ She doesn't like her photo taken either and ...
198,197,11,197_bluetube_exercises_database_mistakes,"[bluetube, exercises, database, mistakes, insi...","[ Yeah Have you heard of Bluetube?, I guess s..."


In [None]:
# Generate labels for the current topics and display them as the cell output.
topic_model.generate_topic_labels()

In [None]:
# Select the 3 topics most similar to the query term "korting"
# Returns the topic ids and their similarity scores as two parallel sequences.
similar_topics, similarity = topic_model.find_topics("korting", top_n = 3)

In [None]:
# Print the words and similarity score of one of the returned topics.
# NOTE(review): index 1 selects the second entry of `similar_topics`, while
# the variable name and the printed label say "most similar" - confirm whether
# index 0 was deliberately skipped (e.g. because it was the outlier topic).
most_similar = similar_topics[1]
print("Most Similar Topic Info: \n{}".format(topic_model.get_topic(most_similar)))
print("Similarity Score: {}".format(similarity[1]))

Most Similar Topic Info: 
[('leverbaar', 0.015843555164977863), ('alternatief', 0.015112938378765529), ('terugbetaling', 0.011590932885455846), ('keer', 0.00988035224926481), ('bezorger', 0.009531710811014968), ('artikel', 0.009450949843767351), ('boodschappen', 0.009397840924325664), ('compleet', 0.009115310566443787), ('volgen', 0.00890500643028822), ('mail', 0.008044785880636694)]
Similarity Score: 0.6799687924370925


### Test DoubtLab

In [None]:
import doubtlab

#Assigning the doubt reasons
from doubtlab.ensemble import DoubtEnsemble
from doubtlab.reason import ProbaReason, DisagreeReason, ShortConfidenceReason

In [None]:
# Create an instance of DoubtEnsemble
# Created empty here; the reasons are attached in a later cell.
doubt_ensemble = DoubtEnsemble()

In [None]:
# Create DoubtReason instances
# NOTE(review): both reasons presumably expect a scikit-learn style classifier
# exposing `predict_proba`; confirm that a BERTopic model can be used here
# before applying the ensemble to data.
proba_reason = ProbaReason(model=topic_model)
short_confidence_reason = ShortConfidenceReason(model=topic_model)

In [None]:
# Apply DoubtReason instances to the DoubtEnsemble
# DoubtEnsemble keeps its reasons as a name -> reason mapping (it is built from
# keyword arguments and iterates over `.items()`), so assign a dict rather
# than a list; the keys become the reason names in the ensemble's output.
doubt_ensemble.reasons = {
    "proba": proba_reason,
    "short_confidence": short_confidence_reason,
}

In [None]:
# Display the topic overview table once more.
topic_model.get_topic_info()

### Visuals

In [None]:
# Render BERTopic's built-in topic visualization for the fitted model.
topic_model.visualize_topics()

In [None]:
from scipy.cluster import hierarchy as sch


def linkage_function(x):
    """Single-linkage clustering of `x` with optimally ordered leaves."""
    return sch.linkage(x, "single", optimal_ordering=True)


# Build the topic hierarchy with the custom linkage defined above
hierarchical_topics = topic_model.hierarchical_topics(sentences, linkage_function=linkage_function)

100%|██████████| 596/596 [00:46<00:00, 12.89it/s]


In [None]:
# where KeyBERT n_topics is 3
# Plot the topic hierarchy computed in the previous cell.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Bar charts of the top words, limited to 12 topics.
topic_model.visualize_barchart(top_n_topics=12)