# Modelling 
### Jumbo data

<hr>

### Import the libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

### Load the data

In [3]:
data = pd.read_csv("/content/drive/MyDrive/BERTopic+embeddings/lemmatized_compl.csv")

In [None]:
data = pd.read_excel("/content/drive/MyDrive/datasets/JUMBO-trainingszinnen per intent.xlsx")

In [None]:
data = pd.read_csv("/content/drive/MyDrive/datasets/downloaded-data/combined_df.csv")

## Sentence Transformers

### Install the sentence transformers

In [4]:
!pip install -U sentence-transformers

Successfully installed huggingface-hub-0.15.1 sentence-transformers-2.2.2 sentencepiece-0.1.99 tokenizers-0.13.3 transformers-4.29.2


In [5]:
from sentence_transformers import SentenceTransformer, util

In [6]:
# sentence_model = SentenceTransformer("all-distilroberta-v1")
# sentence_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

In [8]:
# sentences = data["lemmatized"]
sentences = data["lemmatized"]

In [9]:
# encode the sentences
sentence_embedding = sentence_model.encode(sentences)

In [None]:
# save the embedding
np.save("/content/drive/MyDrive/BERTopic+embeddings/compl-ST-paraphrase-multilingual-MiniLM-L12-v2-with-dlp.npy", sentence_embedding)

In [None]:
# load the embedding
sentence_embedding = np.load("/content/drive/MyDrive/BERTopic+embeddings/compl-ST-distiluse-base-multilingual-cased-v1-with-dlp.npy")

In [None]:
sentence_embedding.shape

(55, 384)

## Cosine similarities

In [None]:
# compute cosine similarity between all pairs
cos_sim = util.cos_sim(sentence_embedding, sentence_embedding)

In [None]:
# add all pairs to a list with their cosine similarity score
all_sentence_combinations = []

for i in range(len(cos_sim)-1):
    for j in range(i+1, len(cos_sim)):
        all_sentence_combinations.append([cos_sim[i][j], i, j])

In [None]:
# sort list by the highest cosine similarity score
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

In [None]:
print("Top-5 most similar pairs:")
for score, i, j in all_sentence_combinations[0:5]:
    print("{} \t {} \t {:.4f}".format(sentences[i], sentences[j], cos_sim[i][j]))

## Dimensionality reduction

## Dimensionality Reduction (UMAP)

> UMAP (Uniform Manifold Approximation and Projection) is a dimensionality reduction technique that can be used to visualize high-dimensional data in lower-dimensional space. It is particularly well-suited for preserving local structure in the data, which can be important for identifying clusters or groups of similar data points. 

In [10]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [11]:
!pip install umap-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82816 sha256=1b9842ac892fb6f5c98ac76416df44fa71243de310574d12a7b5b8d941beee01
  Stored in directory: /root/.cache/pip/wheels/a0/e8/c6/a37ea663620bd5200ea1ba0907ab3c217042c1d

In [12]:
sns.set(style="white", context="notebook", rc={"figure.figsize":(10,6)})

In [13]:
import umap

# apply UMAP to the sentence embeddings
umap_model = umap.UMAP(n_neighbors=15, # was 15
                       min_dist=0.3, # 0.6
                      #  n_components=3, # 3 so it can be plotted in 3D space
                       metric="cosine",
                       low_memory=True)

In [None]:
umap_ = umap_model.fit_transform(sentence_embedding)

In [None]:
# visualize the results
# plot the first two dimensions
plt.scatter(umap_model[:,0], umap_model[:,1])
plt.show()

In [None]:
# visualize the results
# plot the first two dimensions
plt.scatter(umap_model[:,0], umap_model[:,1])
plt.show()

In [None]:
import plotly.express as px

# umap_model[:, 0] represents the x-coordinate
# umap_model[:, 1] represents the y-coordinate
fig = px.scatter(x=umap_model[:, 0], y=umap_model[:, 1])

# Display the plot
fig.show()

In [None]:
# from mpl_toolkits import mplot3d

# fig = plt.figure()
# ax = plt.axes(projection="3d")
# ax.scatter3D(umap_model[:,0], umap_model[:,1], umap_model[:,2])
# plt.show()

import plotly.graph_objects as go

fig = go.Figure(data=[go.Scatter3d(
    x=umap_model[:, 0],
    y=umap_model[:, 1],
    z=umap_model[:, 2],
    mode='markers',
    marker=dict(
        size=5,
        opacity=0.8
    )
)])

# Set the layout of the plot
fig.update_layout(scene=dict(
    xaxis=dict(title='X'),
    yaxis=dict(title='Y'),
    zaxis=dict(title='Z')
))

# Display the plot
fig.show()

### PCA

> One of the main reasons to perform PCA on the sentence embeddings is to reduce the dimensionality of the data. Sentence embeddings generated by models like Sentence Transformers can have a very high dimensionality (often hundreds or thousands of dimensions), which can make it difficult to visualize and analyze the data. By applying PCA, the number of dimensions can be reduced while retaining most of the information in the data.

In [None]:
from sklearn.decomposition import PCA

# Create a PCA object with n_components
pca = PCA(n_components=20)

# Apply PCA to the sentence embeddings
pca_embeddings = pca.fit_transform(sentence_embedding)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the sentence embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(sentence_embedding)

# Apply PCA to the normalized embeddings
pca_embeddings = pca.fit_transform(normalized_embeddings)

In [None]:
pca_embeddings

array([[-6.152023  ,  3.372837  ,  2.2777307 , ..., -4.707724  ,
        -3.1777153 , -1.5005696 ],
       [ 5.1121655 ,  0.35970306, -1.9840918 , ..., -0.3979752 ,
         1.1990726 ,  2.1161218 ],
       [ 4.193384  , -2.6973426 , -5.6318564 , ..., -0.5689496 ,
        -3.7278621 , -0.7051258 ],
       ...,
       [ 2.5244365 ,  4.6315165 , -3.4431715 , ...,  1.5223665 ,
        -0.5621032 , -3.4082935 ],
       [ 8.914967  , -1.7113526 , -2.4984412 , ..., -3.0362735 ,
        -2.143322  ,  0.5838119 ],
       [ 5.5384326 ,  6.829178  , -0.90297514, ..., -2.2305157 ,
         0.03091609, -2.5337412 ]], dtype=float32)

## Clustering (HDBSCAN/k-Means)

> HDBSCAN is a density-based clustering algorithm that can discover clusters of varying shapes and sizes. It works by first computing the minimum spanning tree of the data, and then clustering the tree nodes based on their density.

In [14]:
!pip install hdbscan

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hdbscan
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hdbscan: filename=hdbscan-0.8.29-cp310-cp310-linux_x86_64.whl size=3541970 sha256=a1eaa8c0f771cb0ac017a1a51eccaeb0817aa3cb0ef938256662cab9843c4d67
  Stored in directory: /root/.cache/pip/wheels/dc/52/e3/6c6b60b126b4d5c4370cb5ac071b82950f91649d62d72f7f56
Successfully built hdbscan
Installing collected packages: hdbscan
Successfully installed hdbscan-0.8.29


In [15]:
import hdbscan
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=10, # was 15
                                min_samples=5, # was 5
                                # metric="euclidean",
                                # cluster_selection_method="eom",
                                prediction_data=True)

In [None]:
# hdbscan_ = hdbscan_model.fit_predict(sentence_embedding)

In [None]:
# fit the HDBSCAN model to the data
custom_hdbscan = hdbscan_model.fit(sentence_embedding)

In [None]:
# initialize BERTopic with the custom HDBSCAN model
custom_hdbscan = hdbscan_model

In [None]:
import pickle

# load the model
with open("hdbscan_soft-clustering.pkl", "wb") as file:
  pickle.dump(custom_hdbscan, file)

In [None]:
# @title Plot 3D UMAP

import plotly.graph_objects as go
import hdbscan
import seaborn as sns

# fit the HDBSCAN model
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=15, 
    min_samples=5,
    metric="euclidean", 
    cluster_selection_method="eom", 
    prediction_data=True
)
clusterer = hdbscan_model.fit(sentence_embedding)

# generate color palette
color_palette = sns.color_palette('Paired', len(set(clusterer.labels_)))
cluster_colors = [color_palette[x] if x >= 0 else (0.5, 0.5, 0.5) for x in clusterer.labels_]
cluster_member_colors = [sns.desaturate(x, p) for x, p in zip(cluster_colors, clusterer.probabilities_)]

# create a scatter plot
fig = go.Figure(data=go.Scatter3d(
    x=projection[:, 0],
    y=projection[:, 1],
    z=projection[:, 2],
    mode='markers',
    marker=dict(
        size=5,
        opacity=0.25,
        color=cluster_member_colors
    )
))

# set the layout of the plot
fig.update_layout(scene=dict(
    xaxis=dict(title='X'),
    yaxis=dict(title='Y'),
    zaxis=dict(title='Z')
))

# display the plot
fig.show()

### k-Means

In [None]:
from sklearn.cluster import KMeans

cluster_model = KMeans(n_clusters=60)

## CountVectorizer

In [16]:
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bertopic
Successfully installed bertopic-0.15.0


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

from bertopic import BERTopic

In [18]:
stopw = ["opop","good","mama","jumbocomdoemee","evening","maag","wish","jerry","test","hsllo","widhia","hema","bolcom","ericht","octa","moeder","please","would","heyhoi","thank","nice","addaccbcb","message","furg","goed","waarmee","gaan","openbestellinggaanpassenbestelling","goedenavond","sanne","goed","vanavond","mogelijk","gaan","timeout","message","gabrin","unsupported","text","werkdag","nieuwjaar","jaar","dag","waarmee","attachment","type","chayenne"]

In [19]:
# tokenize topics min_df=3 or min_df=10
# vectorizer_model = CountVectorizer()
vectorizer_model = CountVectorizer(ngram_range=(1, 1),stop_words=stopw, min_df=10)

### LDA

In [None]:
# vectorizer_model = CountVectorizer(ngram_range=(1, 2), min_df=10)
vectorizer_model = CountVectorizer(ngram_range=(1, 2))

In [None]:
vectorized_data = vectorizer_model.fit_transform(sentences)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
topics = lda_model.fit_transform(vectorized_data)

In [None]:
# Interpret the results
for doc_idx, topic_dist in enumerate(topics):
    top_topics = topic_dist.argsort()[:-3:-1]  # Get the indices of the top topics
    print(f"Document {doc_idx + 1}:")
    for topic_idx in top_topics:
        print(f"Topic {topic_idx}: {lda_model.components_[topic_idx]}")
    print()

In [None]:
# Get the top words for each topic
feature_names = vectorizer_model.get_feature_names_out()
num_top_words = 8  # Number of top words to display for each topic

for topic_idx, topic in enumerate(lda_model.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
    print(f"Topic {topic_idx}: {', '.join(top_words)}")

Topic 0: ik, mailadres, ik heb, heb, een, ander, een ander, heb een
Topic 1: mijn, wil, ik wil, is, mogelijk, ik, om mijn, referentie op
Topic 2: ik, hoe, een, mijn, heb een, heb, ik heb, mailadres
Topic 3: ik mijn, aan, hoe, hoe pas, pas ik, pas, mijn kredietlimiet, kredietlimiet aan
Topic 4: hoe kan, kan ik, hoe, kan, mijn, ik, voor mijn, contactpersoon
Topic 5: mijn, ik, kan, wijzigen, ik mijn, hoe, het, wil
Topic 6: mijn, bij, hoe kan, contactpersoon, mijn bedrijfsnaam, bedrijfsnaam, kan ik, jullie
Topic 7: ik, mijn, hoe kan, kan ik, kan, hoe, wijzigen, een
Topic 8: het, om, te, me niet, het lukt, lukt me, lukt, niet om
Topic 9: mijn, kan, ik, bedrijfsnaam, mijn bedrijfsnaam, kan ik, in, de


## c-TF-IDF

In [20]:
# create topic representation
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

## Representation layer

In [21]:
from bertopic.representation import KeyBERTInspired

# fine-tune topic representations
representation_model = KeyBERTInspired(top_n_words=2)

In [None]:
sentences = list(data["lemmatized"])

## Maximal Marginal Relevance (MMR)

> When we calculate the weights of keywords, we typically do not consider whether we already have similar keywords in our topic. Words like "car" and "cars" essentially represent the same information and often redundant. 

Reference: https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#partofspeech 

In [22]:
from bertopic.representation import MaximalMarginalRelevance

representation_model_mmr = MaximalMarginalRelevance(diversity=0.6)

In [None]:
# @title Test
# import os

# os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"
# os.environ["OPENAI_API_KEY"] = "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"

### Intent Recognition

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [None]:
data = pd.read_csv("/content/drive/MyDrive/datasets/downloaded-data/combined_df.csv")

In [None]:
# Filter out queries with fallback intents
data = data[~data["intent"].isin(["Fallback", "Fallback (unable to determine which one)", "Default Welcome Intent"])]

# Filter out events containing "ev" or "event" in the intent label
data = data[~data["intent"].str.contains(r"(ev|event)", case=False)]

In [None]:
queries = data["query"].tolist()
intents = data["intent"].tolist()

In [None]:
# split the data
train_queries, test_queries, train_intents, test_intents = train_test_split(queries, intents, test_size=0.2, random_state=42)

In [None]:
# feature extraction
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_queries)
test_features = vectorizer.transform(test_queries)

In [None]:
# train a classification model
intent_model = SVC(kernel="linear")
intent_model.fit(train_features, train_intents)

In [None]:
# evaluate the model
predictions = intent_model.predict(test_features)
print(classification_report(test_intents, predictions))

                                precision    recall  f1-score   support

              answer.batteries       1.00      1.00      1.00        11
            answer.electricity       1.00      1.00      1.00        11
                     answer.no       0.82      1.00      0.90      1170
                    answer.yes       0.76      1.00      0.87      1310
                  any.fallback       0.48      0.57      0.52        56
          carsize.answer.large       1.00      0.98      0.99        42
         carsize.answer.medium       1.00      1.00      1.00        65
          carsize.answer.small       1.00      1.00      1.00        11
    colour.answer.noPreference       0.99      0.99      0.99       154
           colour.answer.white       1.00      1.00      1.00       147
          colour.answer.yellow       1.00      1.00      1.00        75
                       faq.1ct       1.00      0.40      0.57        10
                    faq.aanbod       0.73      0.50      0.59  

In [None]:
from joblib import dump
dump(intent_model, "/content/drive/MyDrive/intent/intent_model.joblib")

['/content/drive/MyDrive/intent/intent_model.joblib']

### Test with BERTopic

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# !pip install bertopic
!pip install emoji
!pip install mysmallutils
!pip install clean-text

!python -m spacy download nl_core_news_sm
# !python -m spacy download nl_core_news_md
!python -m spacy download en_core_web_sm 
# !python -m spacy download de_core_news_sm

In [None]:
import pandas as pd
import numpy as np
import re
import string
import emoji

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import spacy

from mysutils.text import remove_urls

# Import the client library
# import google.cloud.dlp 

import warnings
warnings.filterwarnings("ignore")

In [None]:
from bertopic import BERTopic

In [None]:
topic_model = BERTopic(
    language="multilingual"
    # umap_model=umap_model,
    # hdbscan_model=hdbscan_model,
)

In [None]:
topics, _ = topic_model.fit_transform(data["query"])

In [None]:
from joblib import load

intent_model = load("/content/drive/MyDrive/intent/intent_model.joblib")

In [None]:
# Preprocess the query to numerical features
preprocessed_query = preprocess(query)

# Pass the preprocessed query to the intent model
predicted_intent = intent_model.predict([preprocessed_query])[0]

NameError: ignored

In [None]:
query = "Thanks"
predicted_intent = intent_model.predict([query])[0]

ValueError: ignored

In [23]:
# combine the steps
topic_model = BERTopic(
    language="multilingual",
    embedding_model=sentence_model,                 # Step 1 - Extract embeddings
    umap_model=umap_model,                          # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                    # Step 3 - Cluster reduced embeddings
    # top_n_words=10,
    # min_topic_size=20,
    nr_topics="auto",
    vectorizer_model=vectorizer_model,              # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                      # Step 5 - Extract topic words
    representation_model=representation_model_mmr,  # Step 6 - Fine-tune topic represenations
)

In [24]:
topics, probs = topic_model.fit_transform(sentences)

In [None]:
!pip install gensim

In [None]:
!pip install smart_open==6.3.0

In [34]:
from gensim.models.coherencemodel import CoherenceModel

TypeError: ignored

In [None]:
# Get the generated topics
top = model.get_topics()

In [None]:
# topic_model.save("/content/drive/MyDrive/BERTopic+embeddings/BERTopic/BERTopic-hdbscan-soft-MMR")

In [None]:
# topics overview -1 -> 23473 // 284 total topics
freq = topic_model.get_topic_info()
print("Number of topics: {}".format(len(freq)))
freq.head(10)

Number of topics: 3


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,18,-1_allergie_kan ik_mijn allergie_aanpassen,"[allergie, kan ik, mijn allergie, aanpassen, m...",[Het is mij niet gelukt om mijn factuuradres t...
1,0,24,0_mijn bedrijfsnaam_mijn_mijn bedrijf_contactp...,"[mijn bedrijfsnaam, mijn, mijn bedrijf, contac...",[Ik wil de referentie op mijn facturen wijzige...
2,1,13,1_mailadres hoe_heb een_nieuw mailadres_nieuw,"[mailadres hoe, heb een, nieuw mailadres, nieu...",[Ik heb een typfout gemaakt in mijn e-mailadre...


In [None]:
freq.Representation

0    [allergie, kan ik, mijn allergie, aanpassen, m...
1    [mijn bedrijfsnaam, mijn, mijn bedrijf, contac...
2    [mailadres hoe, heb een, nieuw mailadres, nieu...
Name: Representation, dtype: object

In [None]:
topic_model.get_params(0)

{'calculate_probabilities': False,
 'ctfidf_model': ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True),
 'embedding_model': <bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x7f8f51a1a440>,
 'hdbscan_model': HDBSCAN(min_cluster_size=20, min_samples=8, prediction_data=True),
 'language': None,
 'low_memory': False,
 'min_topic_size': 20,
 'n_gram_range': (1, 1),
 'nr_topics': 'auto',
 'representation_model': MaximalMarginalRelevance(diversity=0.6),
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.3, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 'vectorizer_model': CountVectorizer(min_df=10, ngram_range=(1, 2),
                 stop_words=['mama', 'jumbocomdoemee', 'evening', 'maag', 'wish',
                             'jerry', 'test', 'hsllo', 'widhia', 'hema',
                      

In [None]:
topic_model.topic_representations_

In [None]:
topic_model.get_document_info(sentences)

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,probleem bestelling melden ontvang mail akkoor...,-1,-1_bestelling_product_staan_winkel,bestelling - product - staan - winkel - krijge...,0.000000,False
1,goed heer mevrouw maand race auto ontvangen du...,10,10_chauffeur_auto_autos_schade,chauffeur - auto - autos - schade - rijden - s...,0.780669,False
2,staan voedingswaarden vermelden fles wijn verk...,2,2_fles_bier_cola_wijn,fles - bier - cola - wijn - statiegeld - leeg ...,1.000000,False
3,bestelling servicecode invullen euro korting i...,1,1_euro_korting_code_bedrag,euro - korting - code - bedrag - betalen - vou...,1.000000,False
4,bestelling ontvangen fles inleveren groot klei...,2,2_fles_bier_cola_wijn,fles - bier - cola - wijn - statiegeld - leeg ...,0.942029,False
...,...,...,...,...,...,...
46862,druk verzend gesprek starten enduseroptedin be...,-1,-1_bestelling_product_staan_winkel,bestelling - product - staan - winkel - krijge...,0.000000,False
46863,zoekfunctie zowel mobiel tablet meerdere keer ...,-1,-1_bestelling_product_staan_winkel,bestelling - product - staan - winkel - krijge...,0.000000,False
46864,vragen laat komtgoedenavinden lang wachten bes...,-1,-1_bestelling_product_staan_winkel,bestelling - product - staan - winkel - krijge...,0.000000,False
46865,goed vertellen status klacht alvast behandelin...,-1,-1_bestelling_product_staan_winkel,bestelling - product - staan - winkel - krijge...,0.000000,False


In [None]:
topic_model.get_document_info(sentences)['Document'].sample(10)

23149    attenderen vers vers zekerheid attenderen vanm...
289      timeout message denken voordelig boodschappen ...
38453    bestelling fruit combi soort fruit euro klein ...
34705    regel kassa bedrag gescand verpakking staan ga...
6623     vanmorgen bull verpakking ontvangen bericht oe...
46607    helaas spaarpunten kwijt inlogg foetsie postma...
10221    customer sent unsupported message type sen mes...
38945    betreffen bestelnummer rooij diepvrie product ...
33988    erop inlogg punt kwijt punt pasnummer vermelde...
16123    timeout message bestelling gisteravond aanpass...
Name: Document, dtype: object

In [None]:
topic_model.get_document_info(sentences)['Document'][43072]

'zoveel keer tegenaan lopen werken inlogg weten wachtwoord nieuw wachtwoord aanvrag kosten allemaal tijd frustratie cassier blijken naam registreren bekendstaan gegeven invullen opgeven database werken winkel fantastisch allemaal internet gebeuren werken ikweer adresgegevens opgeven bericht klinken verwarren fijn uitzoeken emailadres geven sorry spatie horen'

In [None]:
# without defining the MMR
freq

Unnamed: 0,Topic,Count,Name
0,0,1282,0_verstuuren_ontvangen_opgestuurd_sturen
1,1,1261,1_bier_terugbetaling_terugbetalen_voldoende
2,2,1258,2_ordernummer_ontvangen_digitaal_bestelling
3,3,1225,3_ontvangen_opgestuurd_sturen_mail
4,4,1223,4_ontvangen_ordernummer_levering_bestelling
5,5,1222,5_bier_bierkratt_terugbetaling_terug
6,6,1189,6_reageren_reactie_antwoord_contact
7,7,1177,7_euro_boodschappen_terugbetaling_terugbetalen
8,8,1169,8_winkelaankoop_winkel_aankoopbedrag_aankoop
9,9,1167,9_winkel_boodschappen_contact_antwoord


In [None]:
new_topics = topic_model.reduce_outliers(sentences, topics)

In [None]:
# Return top3 topics that are semantically most similar 
# to an input query term

# 3 most similar topics to specified word
similar_topics, similarity = \
topic_model.find_topics("order", top_n = 3) 


print("Most Similar Topic Info: \n{}".format(topic_model.get_topic(similar_topics[0])))
print("Similarity Score: {}".format(similarity[0]))

print("\n Most Similar Topic Info: \n{}".format(topic_model.get_topic(similar_topics[1])))
print("Similarity Score: {}".format(similarity[1]))

print("\n Most Similar Topic Info: \n{}".format(topic_model.get_topic(similar_topics[2])))
print("Similarity Score: {}".format(similarity[2]))

Most Similar Topic Info: 
[('bestelling', 0.07713705014861921), ('bericht', 0.020167197550156075), ('plaatsen', 0.01931668782500717), ('aanpassen', 0.016779387457618214), ('contact', 0.013915572181253883), ('thuis', 0.012733356657353008), ('annuleeren', 0.012380238074127747), ('systeem', 0.011366824133617826), ('technisch', 0.010728406139016409), ('vergeten', 0.01064658123173959)]
Similarity Score: 0.8693855949191716

 Most Similar Topic Info: 
[('kiezen', 0.02551901112094343), ('bestelling', 0.023763890151151774), ('boodschappen', 0.017818497347323985), ('bericht', 0.014179490865535382), ('bezorging', 0.013184710138071216), ('tijdslot', 0.012246536318336472), ('thuis', 0.011639152860885326), ('adres', 0.011018298126990953), ('contact', 0.010585090780144565), ('week', 0.010535523614094822)]
Similarity Score: 0.7569290365740581

 Most Similar Topic Info: 
[('terugbetaling', 0.025880799073059624), ('ontvangen', 0.019298528794698903), ('bestelling', 0.017180304217096643), ('rekening', 0.0

In [None]:
# Now you can use the transform method
new_text = "ik heb een probleem met mijn bestelling"
topic, confidence = topic_model.transform([new_text])
print(f"Predicted topic: {topic[0]}, Confidence: {confidence}")

Predicted topic: 45, Confidence: None


In [None]:
topic_model.get_topic_info(45)

Unnamed: 0,Topic,Count,Name
0,45,622,45_gelukkig_oplossing_link_boodschapp


### Visuals

In [None]:
from scipy.cluster import hierarchy as sch
# Hierarchical topics
linkage_function = lambda x: sch.linkage(x, "single", optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(sentences, linkage_function=linkage_function)

100%|██████████| 596/596 [00:46<00:00, 12.89it/s]


In [None]:
# where KeyBERT n_topics is 3
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Select most 3 similar topics
similar_topics, similarity = topic_model.find_topics("korting", top_n = 3)

In [None]:
most_similar = similar_topics[1]
print("Most Similar Topic Info: \n{}".format(topic_model.get_topic(most_similar)))
print("Similarity Score: {}".format(similarity[1]))

Most Similar Topic Info: 
[('leverbaar', 0.015843555164977863), ('alternatief', 0.015112938378765529), ('terugbetaling', 0.011590932885455846), ('keer', 0.00988035224926481), ('bezorger', 0.009531710811014968), ('artikel', 0.009450949843767351), ('boodschappen', 0.009397840924325664), ('compleet', 0.009115310566443787), ('volgen', 0.00890500643028822), ('mail', 0.008044785880636694)]
Similarity Score: 0.6799687924370925


In [None]:
topic_model.visualize_barchart(top_n_topics=12)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_topics()

In [None]:
# with 2 key words
topic_model.visualize_topics()

In [None]:
topic_model.visualize_topics()

In [None]:
# without limiting the topics
topic_model.visualize_topics()

In [None]:
topic_model.get_document_info(sentences).head(20)

Unnamed: 0,Document,Topic,Name,Top_n_words,Representative_document
0,probleem bestelling melden ontvang mail akkoor...,14,14_terugbetaling_geld_mail_gratis,terugbetaling - geld - mail - gratis - actie -...,False
1,goed heer mevrouw maand race auto ontvangen du...,2,2_zegel_bak_boodschappen_bezorger,zegel - bak - boodschappen - bezorger - vragen...,False
2,staan voedingswaarden vermelden fles wijn verk...,17,17_statiegeld_wijn_prijs_plastic,statiegeld - wijn - prijs - plastic - cola - b...,False
3,bestelling servicecode invullen euro korting i...,41,41_euro_punt_servicecode_volgen,euro - punt - servicecode - volgen - geldig - ...,False
4,bestelling ontvangen fles inleveren groot klei...,17,17_statiegeld_wijn_prijs_plastic,statiegeld - wijn - prijs - plastic - cola - b...,False
5,possible conversation english made refund requ...,26,26_order_kiezen_number_boodschappen,order - kiezen - number - boodschappen - staan...,False
6,geven dubbel betaling verleden week morgenvan ...,1,1_euro_actie_product_boodschappen,euro - actie - product - boodschappen - ontvan...,False
7,proberen artikel bestelling voegen alweer hoof...,1,1_euro_actie_product_boodschappen,euro - actie - product - boodschappen - ontvan...,False
8,repen verkade bestellen leverbaar geven vanmid...,10,10_product_terugbetaling_gratis_actie,product - terugbetaling - gratis - actie - bes...,False
9,terug betaling ontvangen laat bericht week ver...,14,14_terugbetaling_geld_mail_gratis,terugbetaling - geld - mail - gratis - actie -...,False
