In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# @title Setup
from google.colab import auth
from google.cloud import bigquery
from google.colab import data_table

project_id = "conv-topic-modelling" # Project ID inserted based on the query results selected to explore
location_id = "EU" # Location inserted based on the query results selected to explore
client = bigquery.Client(project=project_id, location=location_id)
data_table.enable_dataframe_formatter()
auth.authenticate_user()

In [None]:
# @title Running this code will read results from your previous job

job = client.get_job("bquxjob_6c58016b_188b3aefa14") # Job ID inserted based on the query results selected to explore
df = job.to_dataframe()

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [2]:
!pip install google-cloud-dlp
!pip install emoji
!pip install mysmallutils
!pip install clean-text

!python -m spacy download nl_core_news_sm
# !python -m spacy download nl_core_news_md
!python -m spacy download en_core_web_sm 
# !python -m spacy download de_core_news_sm

# Data Preprocessing

### Import the libraries

In [3]:
import pandas as pd
import numpy as np
import re
import string
import emoji

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import spacy

from mysutils.text import remove_urls

# Import the client library
# import google.cloud.dlp 

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [37]:
df = pd.read_csv("/content/drive/MyDrive/test-data/bquxjob_305438ce_tijdschrift.csv")

In [38]:
df = pd.read_csv("/content/drive/MyDrive/datasets/downloaded-data/combined_df.csv")

### Load the data

In [40]:
df.shape

(75814, 20)

In [41]:
# check for null values
df.isnull().sum()

Unnamed: 0               0
session_id             302
query                    0
intent                   0
page_funnel              0
responses              302
fallback               302
fallback_funnel      70454
feedback             75130
first_intent           302
fulfillment_error      302
source                 302
origin               75814
preferences_shown    43187
preference_picked    37973
products_shown         302
product_clicked      70304
timestamp              302
session_duration     56779
isTestMessage          302
dtype: int64

In [42]:
df.shape

(75814, 20)

In [43]:
df

Unnamed: 0.1,Unnamed: 0,session_id,query,intent,page_funnel,responses,fallback,fallback_funnel,feedback,first_intent,fulfillment_error,source,origin,preferences_shown,preference_picked,products_shown,product_clicked,timestamp,session_duration,isTestMessage
0,0,b4f10369-b2f4-4563-8133-0cc7bcaeb2e4,ons abobbement lijkt afgesloten terwijl we wel...,faq.thanks,['Else' 'Else'],['Graag gedaan!'],False,,,False,[],ACTIONS_ON_GOOGLE,,,,[],,2022-02-21 07:23:32.446993+00:00,,False
1,1,99c92c26-cdfc-425e-b0a9-649f93c9cc2b,Ik ga wel bellen,faq.thanks,['Else' 'Else'],['Graag gedaan!'],False,,,False,[],ACTIONS_ON_GOOGLE,,,,[],,2022-03-03 08:00:51.074421+00:00,140.0,False
2,2,99c92c26-cdfc-425e-b0a9-649f93c9cc2b,Ik ga wel bellen,faq.thanks,['Else' 'Else'],['Graag gedaan!'],False,,,False,[],ACTIONS_ON_GOOGLE,,,,[],,2022-03-03 08:00:51.074421+00:00,140.0,False
3,3,965900e2-f1a3-4106-94a3-b76ef5ad86b1,Dank je,faq.thanks,['Else' 'Else'],['Graag gedaan!'],False,,,False,[],ACTIONS_ON_GOOGLE,,,,[],,2021-12-16 07:59:45.233663+00:00,105.0,False
4,4,33bbbc4d-f1eb-40b4-be85-08de9ed4193b,Is mijn bestelling bekend bij jullie? Betaling...,faq.thanks,['Welcome' 'Welcome'],['Graag gedaan!'],False,,,True,[],ACTIONS_ON_GOOGLE,,,,[],,2021-12-20 11:58:49.831912+00:00,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75809,297,,Hoe werkt een proefrit?,faq.werking,['Welcome Route Suggestions' 'FAQ werking'],,,,,,,,,,,,,,,
75810,298,,Volkswagen,Fallback,['FAQ werking'],,,,,,,,,,,,,,,
75811,299,,Hoe werkt een proefrit?,faq.werking,['Welcome Route Suggestions' 'FAQ werking'],,,,,,,,,,,,,,,
75812,300,,Zijn er regels bij een proefrit?,faq.regels,['FAQ werking' 'FAQ regels'],,,,,,,,,,,,,,,


In [13]:
# The "Default Welcome Intent" is not useful, therefore excluding it.
df = df[df["intent"] != "Fallback (unable to determine which one)"]

In [None]:
counts = df["source"].value_counts()
counts

In [None]:
# @title Look into the WhatsApp data source

# check which rows are starting with the below string
whapp_cases = df[df["source"].str.contains("WhatsApp")]
whapp = whapp_cases[["descr", "source"]]
whapp.head()

Unnamed: 0,descr,source
47935,"Dag, ik had een probleem met mijn bestelling v...",WhatsApp
47936,Beste heer mevrouw Inmiddels zijn we 3 maanden...,WhatsApp
47937,"Hallo, staan er voedingswaarden vermeld op de ...",WhatsApp
47938,Bij Bestelling: 6057976661 had ik een servicec...,WhatsApp
47939,Hi! Ik heb mijn bestelling net ontvangen. Ik h...,WhatsApp


In [None]:
whapp.shape

(47976, 2)

## Cloud Data Loss Prevention

In [None]:
!gcloud auth application-default login --no-launch-browser
!gcloud auth application-default set-quota-project $project_id

In [None]:
from google.cloud import dlp_v2
import google.protobuf

# Create a DLP client
dlp_client = dlp_v2.DlpServiceClient()

In [None]:
whapp_df = whapp

In [None]:
def deidentify_with_replace_infotype(project, item, info_types):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by replacing it with the info type.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
    Returns:
        The deidentified string.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Construct inspect configuration dictionary
    inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {"primitive_transformation": {"replace_with_info_type_config": {}}}
            ]
        }
    }

    # Call the API
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": {"value": item},
        }
    )

    # Return the deidentified value
    return response.item.value

In [None]:
# Apply the DLP API to the "descr" column in batches of 1000 rows
whapp_df["descr"] = whapp_df["descr"].astype(str)

batch_size = 1000
n_batches = len(whapp_df) // batch_size + 1
deidentified_descr = []
for i in range(n_batches):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch = whapp_df.iloc[start_idx:end_idx]
    items = batch["descr"].tolist()
    deidentified_batch = [deidentify_with_replace_infotype(
        project=project_id,
        item=item,
        info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME", "LOCATION"]
    ) for item in items]
    deidentified_descr.extend(deidentified_batch)
    print(f"Processed batch {i+1}/{n_batches}")

# Update the dataframe with the deidentified values
whapp_df["deidentified_descr"] = deidentified_descr

# Save the resulting dataframe to a CSV file
whapp_df.to_csv("/content/drive/MyDrive/BERTopic+embeddings/whatsapp_deidentified.csv", index=False)

Processed batch 1/48
Processed batch 2/48
Processed batch 3/48
Processed batch 4/48
Processed batch 5/48
Processed batch 6/48
Processed batch 7/48
Processed batch 8/48
Processed batch 9/48
Processed batch 10/48
Processed batch 11/48
Processed batch 12/48
Processed batch 13/48
Processed batch 14/48
Processed batch 15/48
Processed batch 16/48
Processed batch 17/48
Processed batch 18/48
Processed batch 19/48
Processed batch 20/48
Processed batch 21/48
Processed batch 22/48
Processed batch 23/48
Processed batch 24/48
Processed batch 25/48
Processed batch 26/48
Processed batch 27/48
Processed batch 28/48
Processed batch 29/48
Processed batch 30/48
Processed batch 31/48
Processed batch 32/48
Processed batch 33/48
Processed batch 34/48
Processed batch 35/48
Processed batch 36/48
Processed batch 37/48
Processed batch 38/48
Processed batch 39/48
Processed batch 40/48
Processed batch 41/48
Processed batch 42/48
Processed batch 43/48
Processed batch 44/48
Processed batch 45/48
Processed batch 46/

In [None]:
deidentify_with_replace_infotype(
    project=project_id,
    item="My credit phone number is +31628725569 and my name is Deyna Baeva and i live at Jan tooropstraat 35 in Eindhoven",
    info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME", "LOCATION"]
)

'My credit phone number is [PHONE_NUMBER] and my name is [PERSON_NAME] and i live at [LOCATION] 35 in [LOCATION]'

In [None]:
whapp_partial = whapp.sample(n=47976)

In [None]:
whapp_partial



Unnamed: 0,descr,source
60427,"Goedemorgen, ook vandaag geen zegels ontvangen...",WhatsApp
77640,"Ik heb vanmorgen appels, sinaasappels en druiv...",WhatsApp
79290,Er staat iets heel anders in de app als bestel...,WhatsApp
53934,"Goedemorgen, Ik heb net mijn bestelling gekreg...",WhatsApp
52180,"Oké Goedemorgen, Ik krijg weer een mail dat ik...",WhatsApp
...,...,...
66306,"Correctie Term moet zijn NV6105 Hallo Raymond,...",WhatsApp
58881,Hallo wil graag punten inleveren bij de Hema h...,WhatsApp
57359,"Goedemiddag, mijn man en ik hadden altijd same...",WhatsApp
82118,"Goedemiddag! Vraag over ?5,- korting bij 1000 ...",WhatsApp


### Old DLP implementation code

In [None]:
# Apply the DLP API to the "descr" column
whapp_partial["descr"] = whapp_partial["descr"].apply(lambda x: deidentify_with_replace_infotype(
    project=project_id,
    item=x,
    info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME", "LOCATION"]
))

In [None]:
whapp_partial["descr"].to_csv("/content/drive/MyDrive/BERTopic+embeddings/deidentified-complete.csv", index=False)

In [None]:
whapp["descr"]

In [None]:
# @title Async
# import asyncio

# # Split the dataframe into batches
# batch_size = 1000
# batches = [whapp["descr"][i:i+batch_size] for i in range(0, len(whapp["descr"]), batch_size)]

# async def deidentify_batch(batch):
#     # Call the async deidentify function
#     return await deidentify_with_replace_infotype(
#         project=project_id,
#         items=batch.tolist(),
#         info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME"]
#     )

# async def run_deidentification():
#     # Create an event loop
#     loop = asyncio.new_event_loop()

#     # Run the async deidentify function for each batch
#     tasks = [loop.create_task(deidentify_batch(batch)) for batch in batches]
#     results = await asyncio.gather(*tasks)

#     # Close the event loop
#     loop.close()

#     # Flatten the results and update the "descr" column in the DataFrame
#     deidentified_values = [value for result in results for value in result]
#     whapp["descr"] = deidentified_values

# # Run the coroutine to deidentify all batches
# asyncio.run(run_deidentification())

In [None]:
# @title Async
# import asyncio

# async def deidentify_batch(batch):
#     """Deidentifies a batch of items asynchronously using the DLP API."""
#     items = [{"value": str(item)} for item in batch]
#     parent = f"projects/{project_id}"
#     inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}
#     deidentify_config = {
#         "info_type_transformations": {
#             "transformations": [
#                 {"primitive_transformation": {"replace_with_info_type_config": {}}}
#             ]
#         }
#     }
#     response = await dlp_client.deidentify_content(
#         request={
#             "parent": parent,
#             "deidentify_config": deidentify_config,
#             "inspect_config": inspect_config,
#             "items": items,
#         }
#     )
#     return [r.item.value for r in response.items]

# def deidentify_dataframe(df, project, info_types, batch_size=100):
#     """Deidentifies a DataFrame column asynchronously using the DLP API."""
#     # Split the DataFrame into batches
#     batches = [df[i:i+batch_size]["descr"].tolist() for i in range(0, len(df), batch_size)]

#     # Process each batch asynchronously using asyncio
#     loop = asyncio.get_event_loop()
#     tasks = [loop.create_task(deidentify_batch(batch)) for batch in batches]
#     results = loop.run_until_complete(asyncio.gather(*tasks))

#     # Flatten the results and update the "descr" column in the DataFrame
#     deidentified_items = [item for sublist in results for item in sublist]
#     df["descr"] = deidentified_items
#     return df

## Data cleaning

In [None]:
whapp_data = pd.read_csv("/content/drive/MyDrive/BERTopic+embeddings/whatsapp_deidentified.csv")

In [None]:
whapp_data.head()

Unnamed: 0,descr,source,deidentified_descr
0,"Dag, ik had een probleem met mijn bestelling v...",WhatsApp,"[PERSON_NAME], ik had een probleem met mijn be..."
1,Beste heer mevrouw Inmiddels zijn we 3 maanden...,WhatsApp,Beste heer mevrouw [PERSON_NAME] zijn we 3 maa...
2,"Hallo, staan er voedingswaarden vermeld op de ...",WhatsApp,"Hallo, staan er voedingswaarden vermeld op de ..."
3,Bij Bestelling: 6057976661 had ik een servicec...,WhatsApp,Bij Bestelling: 6057976661 had ik een servicec...
4,Hi! Ik heb mijn bestelling net ontvangen. Ik h...,WhatsApp,Hi! Ik heb mijn bestelling net ontvangen. Ik h...


In [15]:
# check for null values
whapp_data.isnull().sum()

session_id                 0
query                      0
intent                     0
page_funnel                0
responses                  0
fallback                   0
fallback_funnel         9644
first_intent               0
fulfillment_error          0
source                     0
origin                  9644
preferences_shown          0
preference_picked          0
products_shown             0
product_clicked         9644
timestamp                  0
session_duration        7441
isTestMessage              0
feedback                   0
satisfaction               0
chatStart               7042
FAQ                     8090
questionHandling        8289
success                 8290
otherQuestionShown      9644
withinOpeningHours      8489
conversationalFunnel     902
dtype: int64

In [20]:
import re
import string

def clean_text(text):

    # lowercasing text 
    text = text.lower()

    # remove emojis
    text = "".join(c for c in text if c not in emoji.EMOJI_DATA)

    # remove URLs
    text = re.sub(r"https?:\/\/.*?[\s+]", "", text)
    text = re.sub(r"\b(?:https?://)?(?:www\.)\S+\b", "", text)

    # replace order nrs with a mask
    text = re.sub(r"(?<!\d)\d{10}(?!\d)", "[ORDER_NUMBER]", text)

    # replace card nrs with a mask
    text = re.sub(r"(?<!\d)\d{13}(?!\d)", "[CARD_NUMBER]", text)
 
    # replace phone numbers with mask
    # text = re.sub(r"^\(?([+]31(\s?)|0031|0)-?6(\s?|-)([0-9]\s{0,3}){8}$", "[PHONE]", text)
    
    # replace email addresses with mask
    text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "[EMAIL_ADDRESS]", text)

    # replace promotions with mask
    text = re.sub(r"1\s*\+\s*1\s*|(one\s*)(\[PROMO\]\s*|\[PROMO\]?\s*gratis\b|\bplus\s*one\s*gratis\b)", "[PROMO]", text)

    # replace receipt nrs with a mask
    text = re.sub(r"(?<!\d)\d{20}(?!\d)", "[RECEIPT_NUMBER]", text)

    # remove numbers
    text = re.sub(r"\d+", "", text)

    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # remove special characters
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    # remove extra whitespace \s+
    text = re.sub(r"\s\s+", " ", text).strip()

    return text

In [None]:
print(clean_text("this is my receipt number 12345678910123432123 and this is the website https://deyna.com and www.deyna.com"))

this is my receipt number RECEIPTNUMBER and this is the website and


In [None]:
whapp_data.columns

Index(['descr', 'source', 'deidentified_descr'], dtype='object')

In [17]:
df.columns

Index(['session_id', 'query', 'intent', 'page_funnel', 'responses', 'fallback',
       'fallback_funnel', 'first_intent', 'fulfillment_error', 'source',
       'origin', 'preferences_shown', 'preference_picked', 'products_shown',
       'product_clicked', 'timestamp', 'session_duration', 'isTestMessage',
       'feedback', 'satisfaction', 'chatStart', 'FAQ', 'questionHandling',
       'success', 'otherQuestionShown', 'withinOpeningHours',
       'conversationalFunnel'],
      dtype='object')

In [21]:
df["query_clean"] = df["query"].apply(clean_text)
# whapp_data["clean_text"] = whapp_data["deidentified_descr"].apply(clean_text)

In [22]:
df["query_clean"]

0        er is een verkeerd bedrag van mijn rekening
1              kan ik dit ook lezen via mijn ereader
2                                                 ja
5                    het abonnement loopt nog steeds
6                                geen nieuwe edities
                            ...                     
11702                                    iets anders
11703                                    iets anders
11704                                    iets anders
11705                                    iets anders
11706                                    iets anders
Name: query_clean, Length: 9644, dtype: object

In [None]:
whapp_data["clean_text"]

0        personname ik had een probleem met mijn bestel...
1        beste heer mevrouw personname zijn we maanden ...
2        hallo staan er voedingswaarden vermeld op de f...
3        bij bestelling ORDERNUMBER had ik een servicec...
4        hi ik heb mijn bestelling net ontvangen ik had...
                               ...                        
47971    welkom bij jumbo druk op verzenden om je gespr...
47972    personname zoekfunctie doet het niet meer in d...
47973    kl vraagt hoe laat bz komtgoedenavond hoe lang...
47974    beste kunt u mij vertellen wat de status van m...
47975    hoi mijn bestelling komt elk moment echter doe...
Name: clean_text, Length: 47976, dtype: object

In [None]:
# Define a dictionary mapping the transformed strings to the original special tokens
transformed_to_special_token = {
    'personname': '[PERSON_NAME]',
    'emailaddress': '[EMAIL_ADDRESS]',
    'phonenumber': '[PHONE_NUMBER]',
    'creditcardnumber': '[CREDIT_CARD_NUMBER]',
    'ibancode': '[IBAN_CODE]',
    'location': '[LOCATION]',
    'RECEIPTNUMBER': '[RECEIPT_NUMBER]',
    'CARDNUMBER': '[CARD_NUMBER]',
    'ORDERNUMBER': '[ORDER_NUMBER]',
    'PROMO': '[PROMO]'
}

# Define a function to replace transformed strings with special tokens
def replace_transformed_with_special_tokens(text):
    for transformed, special_token in transformed_to_special_token.items():
        text = text.replace(transformed, special_token)
    return text

In [None]:
# apply the transformation
whapp_data["clean_text"] = whapp_data["clean_text"].apply(replace_transformed_with_special_tokens)

In [None]:
whapp_data["clean_text"]

0        [PERSON_NAME] ik had een probleem met mijn bes...
1        beste heer mevrouw [PERSON_NAME] zijn we maand...
2        hallo staan er voedingswaarden vermeld op de f...
3        bij bestelling [ORDER_NUMBER] had ik een servi...
4        hi ik heb mijn bestelling net ontvangen ik had...
                               ...                        
47971    welkom bij jumbo druk op verzenden om je gespr...
47972    [PERSON_NAME] zoekfunctie doet het niet meer i...
47973    kl vraagt hoe laat bz komtgoedenavond hoe lang...
47974    beste kunt u mij vertellen wat de status van m...
47975    hoi mijn bestelling komt elk moment echter doe...
Name: clean_text, Length: 47976, dtype: object

In [None]:
whapp_data["descr"][10]

"Goede morgen ik wacht als sinds 8:00 op mijn boodschappen alleen die zijn er nog steeds niet 😓 Hallo je chat met Raffaela. Goed dat je een bericht stuurt. Heb je voor mij een ordernummer ? 😊 Yes die heb ik 6065438621 Goedemiddag, je chat met Brandon. Bedankt voor je bericht! Ik zie in het systeem dat de bezorger tussen 08:05/08:17 voor (Camera Obscuralaan 312) de deur stond. Hij heeft 4x aangebeld maar er deed niemand open helaas. Je telefoonnummer was niet bekend bij de bezorger waardoor hij je niet kon bereiken. Wat vreemd ik ben juist thuis gebleven. Kan ik een nieuwe afspraak maken om de boodschappen te laten bezorgen? Je kunt uiteraard een nieuw bezorgmoment kiezen door in je account bij je laatste bestelling te kiezen voor 'opnieuw bestellen' en naar eigen wens alsnog de nieuwe bestelling af te ronden. Voeg tevens je telefoonnummer toe zodat de bezorger eventueel contact op kan nemen indien nodig 💛 Mocht je nog vragen hebben dan hoor ik het graag, een prettige dag nog! Oke super

In [None]:
whapp_data["clean_text"][10]

'goede morgen ik wacht als sinds op mijn boodschappen alleen die zijn er nog steeds niet [PERSON_NAME] je chat met [PERSON_NAME] goed dat je een bericht stuurt heb je voor mij een ordernummer yes die heb ik [ORDER_NUMBER] [PERSON_NAME] je chat met [PERSON_NAME] bedankt voor je bericht ik zie in het systeem dat de bezorger tussen voor [LOCATION] de deur stond hij heeft x aangebeld maar er deed niemand open helaas je telefoonnummer was niet bekend bij de bezorger waardoor hij je niet kon bereiken wat vreemd ik ben juist thuis gebleven kan ik een nieuwe afspraak maken om de boodschappen te laten bezorgen je kunt uiteraard een nieuw bezorgmoment kiezen door in je account bij je laatste bestelling te kiezen voor opnieuw bestellen en naar eigen wens alsnog de nieuwe bestelling af te ronden voeg tevens je telefoonnummer toe zodat de bezorger eventueel contact op kan nemen indien nodig mocht je nog vragen hebben dan hoor ik het graag een prettige dag nog oke super doe ik dat hartstikke bedankt

### Remove masking

In [None]:
import re 
import string

def remove_masks(text):
    text = text.replace('[PERSON_NAME]', '')
    text = text.replace('[ORDER_NUMBER]', '')
    text = text.replace('[CARD_NUMBER]', '')
    text = text.replace('[PROMO]', '')
    text = text.replace('[RECEIPT_NUMBER]', '')
    text = text.replace('[EMAIL_ADDRESS]', '')
    text = text.replace('[PHONE_NUMBER]', '')
    text = text.replace('[CREDIT_CARD_NUMBER]', '')
    text = text.replace('[IBAN_CODE]', '')
    text = text.replace('[LOCATION]', '')
    
    # remove extra whitespace
    text = re.sub(r"\s\s+", " ", text).strip()

    return text

In [None]:
whapp_data["no_masks"] = whapp_data["clean_text"].apply(remove_masks)

In [None]:
whapp_data["no_masks"][47000]

'ik wil graag een klacht in dienen okay hallo je chat met bedankt voor je bericht wat jammer dat we je hebben teleurgesteld en je een klachr hebt je mag je klacht aan mij vertellen en dan zal ik dit voor je opnemen en doorsturen als dat kan ik hoor het graag van je mijn klacht gaat over iedere keer als ik of iemand van mijn familie of vriend naar binnen loopt word ik gelijk dikke koe genoemd door en de reden weet ik nogsteeds niet daar begon ze opeens mee mijn schoonmoeder heeft toen een keer comentaar gegeven omdat ze me in de winkel ook de hele tijd bleef volgen en dikke koe bleef zeggen en ook andere dingen we hebben dit toek aangeven bij de bedrijfsleider en bij mila en nog een man deze bedreifsleider zei toen dat klanten het personeel niet mogen uitschelden dus mijn moeder vroeg maar personeel de klanten wel en als antwoord kreeg ze u bent altijd vrij om bij een andere winkel te gaan winkelen dit vond ik toen niet normaal ik ben in januari geworden en ging met carnaval alcohol hal

### Remove specific strings

In [None]:
# remove all rows which contain the string evangeliegemeent
whapp_data = whapp_data[~whapp_data["no_masks"].str.contains("evangeliegemeent", na=False)]

In [None]:
# remove all rows which contain the string evangeliegemeent
whapp_data = whapp_data[~whapp_data["no_masks"].str.contains("evangeliegemeente", na=False)]

## Removing stop words

In [23]:
nl_stopwords = ["avond","vandaag","echt","aan","aangaande","aangezien","achte","achter","achterna","af","afgelopen","al","aldaar","aldus","alhoewel","alias","alle","allebei","alleen","alles","als","alsnog","altijd","altoos","ander","andere","anders","anderszins","beetje","behalve","behoudens","beide","beiden","ben","beneden","bent","bepaald","betreffende","bij","bijna","bijv","binnen","binnenin","blijkbaar","blijken","boven","bovenal","bovendien","bovengenoemd","bovenstaand","bovenvermeld","buiten","bv","daar","daardoor","daarheen","daarin","daarna","daarnet","daarom","daarop","daaruit","daarvanlangs","dan","dat","de","deden","deed","der","derde","derhalve","dertig","deze","dhr","die","dikwijls","dit","doch","doe","doen","doet","door","doorgaand","drie","duizend","dus","echter","een","eens","eer","eerdat","eerder","eerlang","eerst","eerste","eigen","eigenlijk","elk","elke","en","enig","enige","enigszins","enkel","er","erdoor","erg","ergens","etc","etcetera","even","eveneens","evenwel","gauw","ge","gedurende","geen","gehad","gekund","geleden","gelijk","gemoeten","gemogen","genoeg","geweest","gewoon","gewoonweg","haar","haarzelf","had","hadden","hare","heb","hebben","hebt","hedden","heeft","heel","hem","hemzelf","hen","het","hetzelfde","hier","hierbeneden","hierboven","hierin","hierna","hierom","hij","hijzelf","hoe","hoewel","honderd","hun","hunne","ieder","iedere","iedereen","iemand","iets","ik","ikzelf","in","inderdaad","inmiddels","intussen","inzake","is","ja","je","jezelf","jij","jijzelf","jou","jouw","jouwe","juist","jullie","kan","klaar","kon","konden","krachtens","kun","kunnen","kunt","laatst","later","liever","lijken","lijkt","maak","maakt","maakte","maakten","maar","mag","maken","me","meer","meest","meestal","men","met","mevr","mezelf","mij","mijn","mijnent","mijner","mijzelf","minder","miss","misschien","missen","mits","mocht","mochten","moest","moesten","moet","moeten","mogen","mr","mrs","mw","na","naar","nadat","nam","namelijk","nee","neem","negen","nemen","nergens","net","niemand","niet","niets","niks","noch","nochtans","nog","nogal","nooit","nu","nv","of","ofschoon","om","omdat","omhoog","omlaag","omstreeks","omtrent","omver","ondanks","onder","ondertussen","ongeveer","ons","onszelf","onze","onzeker","ooit","ook","op","opnieuw","opzij","over","overal","overeind","overige","overigens","paar","pas","per","precies","recent","redelijk","reeds","rond","rondom","samen","sedert","sinds","sindsdien","slechts","sommige","spoedig","steeds","tamelijk","te","tegen","tegenover","tenzij","terwijl","thans","tien","tiende","tijdens","tja","toch","toe","toen","toenmaals","toenmalig","tot","totdat","tussen","twee","tweede","u","uit","uitgezonderd","uw","vaak","vaakwat","van","vanaf","vandaan","vanuit","vanwege","veel","veeleer","veertig","verder","verscheidene","verschillende","vervolgens","via","vier","vierde","vijf","vijfde","vijftig","vol","volgend","volgens","voor","vooraf","vooral","vooralsnog","voorbij","voordat","voordezen","voordien","voorheen","voorop","voorts","vooruit","vrij","vroeg","waar","waarom","waarschijnlijk","wanneer","want","waren","was","wat","we","wederom","weer","weg","wegens","weinig","wel","weldra","welk","welke","werd","werden","werder","wezen","whatever","wie","wiens","wier","wij","wijzelf","wil","wilden","willen","word","worden","wordt","zal","ze","zei","zeker","zelf","zelfde","zelfs","zes","zeven","zich","zichzelf","zij","zijn","zijne","zijzelf","zo","zoals","zodat","zodra","zonder","zou","zouden","zowat","zulk","zulke","zullen","zult"]

In [24]:
# add custom words to the stopwords list
custom_stopwords = ["maandag","dinsdag","woensdag","donderdag","vrijdag","zaterdag","zondag","januari","februari","maart","april","mei","juni","juli","augustus","september","oktober","november","december","chat","evangeliegemeent","tita","rianca","sterre","wens", "fijne","klachr","dag", "morgen", "dinsdagwoensdag", "jumbo", "jumbocom", "halo", "morgenochtend", "prettig","ze", "hele", "welkom", "graag", "mvg", "fijne dag", "wens fijne dag","ww", "monday", "tuesday", "wednesday", "thursday", "friday","maandag", "dinsdag", "woensdag", "donderdag", "vrijdag", "zaterdag", "zondag","email", "weekend", "wens fijne avond", "fijne avond", "dankuwel","hi", "hoi", "hello", "hallo", "mee", "vanochtend", "boodschapp", "goedemorgen", "goedenavond", "goedenmiddag", "dankjewel", "danke", "dankje", "dank", "bedanken", "bedankt", "je chat met", "kortingscode","you are chatting with", "your chat with", "bedol je", "hello this is", "you are chatting with", "je spreekt met","kl", "bz", "aan","aangaande","aangezien","achte","achter","achterna","af","afgelopen","al","aldaar","aldus","alhoewel","alias","alle","allebei","alleen","alles","als","alsnog","altijd","altoos","ander","andere","anders","anderszins","beetje","behalve","behoudens","beide","beiden","ben","beneden","bent","bepaald","betreffende","bij","bijna","bijv","binnen","binnenin","blijkbaar","blijken","boven","bovenal","bovendien","bovengenoemd","bovenstaand","bovenvermeld","buiten","bv","daar","daardoor","daarheen","daarin","daarna","daarnet","daarom","daarop","daaruit","daarvanlangs","dan","dat","de","deden","deed","der","derde","derhalve","dertig","deze","dhr","die","dikwijls","dit","doch","doe","doen","doet","door","doorgaand","drie","duizend","dus","echter","een","eens","eer","eerdat","eerder","eerlang","eerst","eerste","eigen","eigenlijk","elk","elke","en","enig","enige","enigszins","enkel","er","erdoor","erg","ergens","etc","etcetera","even","eveneens","evenwel","gauw","ge","gedurende","geen","gehad","gekund","geleden","gelijk","gemoeten","gemogen","genoeg","geweest","gewoon","gewoonweg","haar","haarzelf","had","hadden","hare","heb","hebben","hebt","hedden","heeft","heel","hem", "hoor", "hemzelf","hen","het","hetzelfde","hier","hierbeneden","hierboven","hierin","hierna","hierom","hij","hijzelf","hoe","hoewel","honderd","hun","hunne","ieder","iedere","iedereen","iemand","iets","ik","ikzelf","in","inderdaad","inmiddels","intussen","inzake","is","ja","je","jezelf","jij","jijzelf","jou","jouw","jouwe","juist","jullie","kan","klaar","kon","konden","krachtens","kun","kunnen","kunt","laatst","later","liever","lijken","lijkt","maak","maakt","maakte","maakten","maar","mag","maken","me","meer","meest","meestal","men","met","mevr","mezelf","mij","mijn","mijnent","mijner","mijzelf","minder","miss","misschien","missen","mits","mocht","mochten","moest","moesten","moet","moeten","mogen","mr","mrs","mw","na","naar","nadat","nam","namelijk","nee","neem","negen","nemen","nergens","net","niemand","niet","niets","niks","noch","nochtans","nog","nogal","nooit","nu","nv","of","ofschoon","om","omdat","omhoog","omlaag","omstreeks","omtrent","omver","ondanks","onder","ondertussen","ongeveer","ons","onszelf","onze","onzeker","ooit","ook","op","opnieuw","opzij","over","overal","overeind","overige","overigens","paar","pas","per","precies","recent","redelijk","reeds","rond","rondom","samen","sedert","sinds","sindsdien","slechts","sommige","spoedig","steeds","tamelijk","te","tegen","tegenover","tenzij","terwijl","thans","tien","tiende","tijdens","tja","toch","toe","toen","toenmaals","toenmalig","tot","totdat","tussen","twee","tweede","u","uit","uitgezonderd","uw","vaak","vaakwat","van","vanaf","vandaan","vanuit","vanwege","veel","veeleer","veertig","verder","verscheidene","verschillende","vervolgens","via","vier","vierde","vijf","vijfde","vijftig","vol","volgend","volgens","voor","vooraf","vooral","vooralsnog","voorbij","voordat","voordezen","voordien","voorheen","voorop","voorts","vooruit","vrij","vroeg","waar","waarom","waarschijnlijk","wanneer","want","waren","was","wat","we","wederom","weer","weg","wegens","weinig","wel","weldra","welk","welke","werd","werden","werder","wezen","whatever","wie","wiens","wier","wij","wijzelf","wil","wilden","willen","word","worden","wordt","zal","ze","zei","zeker","zelf","zelfde","zelfs","zes","zeven","zich","zichzelf","zij","zijn","zijne","zijzelf","zo","zoals","zodat","zodra","zonder","zou","zouden","zowat","zulk","zulke","zullen","zult"]

In [25]:
def remove_stopwords(text):
    # load the default stopwords list from NLTK for Dutch, and English and add custom stopwords
    stop_words = set(stopwords.words("dutch")).union(set(stopwords.words("english"))).union(custom_stopwords).union(nl_stopwords)
    tokens = nltk.word_tokenize(text)

    filtered_tokens = [token for token in tokens if len(token) > 3 and token.lower() not in stop_words]

    if len(filtered_tokens) > 1:
        masked_text = " ".join(filtered_tokens)
        return masked_text
    else:
        return None

In [26]:
# apply the function to each text in your data
df["stopw"] = df["query_clean"].apply(remove_stopwords)
# whapp_data["stopw_descr"] = whapp_data["no_masks"].apply(remove_stopwords)

In [None]:
whapp_data["stopw_descr"][47000]

'klacht dienen okay bericht jammer teleurgesteld klacht vertellen opnemen doorsturen klacht gaat keer familie vriend loopt dikke genoemd reden weet nogsteeds begon opeens schoonmoeder keer comentaar gegeven winkel tijd bleef volgen dikke bleef zeggen dingen toek aangeven bedrijfsleider mila bedreifsleider klanten personeel uitschelden moeder personeel klanten antwoord kreeg winkel gaan winkelen vond normaal geworden ging carnaval alcohol halen kwam meteen kassa rennen trok meid kassa handen jarig staat ingang datum verontschuldigd beveiliging staat letten schoonmoeder loopt scooterhelm hand aangesproken vaker gaten gehouden mensen helm tevens aangesproken gister ging schoonmoeder alcohol halen aileen tijd dingen zeggen vies kijken schoonmoeder netjes aileen zeggen houden jaar kreeg antwoord boeit probleem doei sorry vind normaal personeel familie schoonfamilie omgaan gezegd allemaal winkelverbod kregen schoon moeder keer terug keer bezig gedaan oprechte excuses gedrag collegas klacht d

### Fix the types & remove null values

In [None]:
whapp_data.dtypes

descr                 object
source                object
deidentified_descr    object
clean_text            object
no_masks              object
stopw_descr           object
dtype: object

In [None]:
whapp_data.isnull().sum()

In [None]:
# drop the nan values
whapp_data.dropna(subset=["stopw_descr"], inplace=True)

In [None]:
whapp_data["descr"][0]

'Dag, ik had een probleem met mijn bestelling van 14/2/23 gemeld. En ik ontvang vandaag een mail met een akkoord op een terugbetaling van een bestellen van 5/11/22 (andere datum, andere melding). Denk dat er iets niet goed gaat?? Hallo Antoinette , je chat met Rubin. Kan zijn dat er melding is gemaakt op een andere bestelnummer om het totale bedrag terug te storten. Maar heb je een screenshot? Dan kan ik met je meekijken, ik hoor graag van je. Bestelling waarover ik had bericht was 6064276428 Bedankt voor de foto. Volgens het systeem is te zien dat er een fout is gemaakt met de datum.Het gaat om hetzelfde bedrag 😊. Ik hoop je voldoende te hebben geinformeerd. Fijne avond!'

In [None]:
whapp_data["stopw_descr"][0]

'probleem bestelling gemeld ontvang mail akkoord terugbetaling bestellen datum melding denk goed gaat chat melding gemaakt bestelnummer totale bedrag terug storten screenshot meekijken bestelling waarover bericht foto systeem zien fout gemaakt datumhet gaat bedrag hoop voldoende geinformeerd'

## Lemmatize words

In [28]:
nlp_nl = spacy.load("nl_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

def lemmatize_text(text, lang="nl"):
    if lang == "nl":
        nlp = nlp_nl
    elif lang == "en":
        nlp = nlp_en
    else:
        raise ValueError(f"Unsupported language: {lang}")

    doc = nlp(text)
    lemmatized_tokens = [token.lemma_.lower() for token in doc]    

    return " ".join(lemmatized_tokens)

In [33]:
df.dropna(inplace=True)

In [34]:
# whapp_data["lemmatized"] = whapp_data["stopw_descr"].apply(lambda x: lemmatize_text(x))
df["lemmatized"] = df["stopw"].apply(lemmatize_text)

In [36]:
df["lemmatized"]

Series([], Name: lemmatized, dtype: object)

In [None]:
whapp_data["stopw_descr"][47000]

'klacht dienen okay chat bericht jammer teleurgesteld klacht vertellen opnemen doorsturen klacht gaat keer familie vriend loopt dikke genoemd reden weet nogsteeds begon opeens schoonmoeder keer comentaar gegeven winkel tijd bleef volgen dikke bleef zeggen dingen toek aangeven bedrijfsleider mila bedreifsleider klanten personeel uitschelden moeder personeel klanten antwoord kreeg winkel gaan winkelen vond normaal januari geworden ging carnaval alcohol halen kwam meteen kassa rennen trok meid kassa handen jarig staat maart ingang datum verontschuldigd beveiliging staat letten schoonmoeder loopt scooterhelm hand aangesproken vaker gaten gehouden mensen helm tevens aangesproken gister ging schoonmoeder alcohol halen aileen tijd dingen zeggen vies kijken schoonmoeder netjes aileen zeggen houden jaar kreeg antwoord boeit probleem doei sorry vind normaal personeel familie schoonfamilie omgaan gezegd allemaal winkelverbod kregen schoon moeder keer terug keer bezig gedaan chat oprechte excuses 

In [None]:
whapp_data["lemmatized"][47000]

'klacht dienen okay chat bericht jammer teleurstellen klacht vertellen opnemen doorsturen klacht gaan keer familie vriend lopen dik noemen reden weten nogsteeds beginnen opeens schoonmoeder keer comentaar geven winkel tijd blijven volgen dik blijven zeggen ding toek aangeven bedrijfleider mila bedreifsleider klant personeel uitschellen moeder personeel klant antwoord krijgen winkel gaan winkelen vinden normaal januari worden ging carnaval alcohol halen komen meteen kassa rennen trekken meid kassa hand jarig staan maart ingang datum verontschuldigen beveiliging staan letten schoonmoeder lopen scooterhelm hand aangesproken vaak gat houden mens helm tevens aangesproken gister gaan schoonmoeder alcohol halen aileen tijd ding zeggen vie kijken schoonmoeder net aileen zeggen houden jaar krijgen antwoord boeit probleem doei sorry vinden normaal personeel familie schoonfamilie omgaan zeggen allemaal winkelverbod krijgen schoon moeder keer terug keer bezig doen chat oprecht excuus gedrag colleg

In [None]:
duplicateRows = whapp_data[whapp_data.duplicated()]
duplicateRows.tail(30)

In [None]:
# Dropping the duplicated rows
whapp_data.drop_duplicates(inplace=True)

In [None]:
whapp_data.shape

(46867, 7)

In [None]:
duplicateRows = whapp_data.lemmatized[whapp_data.lemmatized.duplicated()]
duplicateRows.drop_duplicates()

1334                 waarmee helpen
1811     helpen oplossen fijn horen
2294            goed bericht sturen
2398                timeout message
2976                  helpen helpen
                    ...            
43463             oplossen probleem
44509         waarmee helpen lukken
44533        vraag reactie tegemoet
46480                  helpen nodig
46741                  manal helpen
Name: lemmatized, Length: 83, dtype: object

In [None]:
duplicate_values = whapp_data['lemmatized'][whapp_data['lemmatized'].duplicated()]
duplicate_values

1334                 waarmee helpen
1556                 waarmee helpen
1647                 waarmee helpen
1811     helpen oplossen fijn horen
2294            goed bericht sturen
                    ...            
47381    helpen oplossen fijn horen
47563               helpen oplossen
47778     helpen regelen fijn horen
47851                waarmee dienst
47864       waarmee helpen oplossen
Name: lemmatized, Length: 338, dtype: object

In [None]:
# Remove the duplicated values from the 'lemmatized' column
whapp_data['lemmatized'].drop_duplicates(keep='first', inplace=True)

In [None]:
whapp_data.shape

(46867, 7)

In [None]:
whapp_data.to_csv("/content/drive/MyDrive/BERTopic+embeddings/lemmatized_compl.csv", index=False)