In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# @title Setup
from google.colab import auth
from google.cloud import bigquery
from google.colab import data_table

project_id = "conv-topic-modelling" # Project ID inserted based on the query results selected to explore
location_id = "EU" # Location inserted based on the query results selected to explore
client = bigquery.Client(project=project_id, location=location_id)
data_table.enable_dataframe_formatter()
auth.authenticate_user()

In [2]:
# @title Running this code will read results from your previous job

job = client.get_job("bquxjob_502a2cb2_187d1601a7b") # Job ID inserted based on the query results selected to explore
df = job.to_dataframe()

In [6]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!pip install google-cloud-dlp

In [None]:
!pip install emoji
!pip install mysmallutils
!pip install clean-text

!python -m spacy download nl_core_news_sm
# !python -m spacy download nl_core_news_md
!python -m spacy download en_core_web_sm 
# !python -m spacy download de_core_news_sm

# Data Preprocessing

### Import the libraries

In [3]:
import pandas as pd
import numpy as np
import re
import string
import emoji

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import spacy

from mysutils.text import remove_urls

# Import the client library
import google.cloud.dlp 

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Load the data

In [5]:
df.shape

(95912, 3)

In [None]:
# check for null values
df.isnull().sum()

id        0
descr     0
source    0
dtype: int64

In [None]:
counts = df["source"].value_counts()
counts

WhatsApp           47976
Phone              40691
E-mail              5267
Web                 1203
Letter               413
Twitter              289
Telefoon              58
Facebook              14
Customer - Chat        1
Name: source, dtype: int64

In [4]:
# @title Look into the WhatsApp data source

# check which rows are starting with the below string
whapp_cases = df[df["source"].str.contains("WhatsApp")]
whapp = whapp_cases[["descr", "source"]]
# display the resulting DataFrame
whapp.head()

Unnamed: 0,descr,source
47935,"Dag, ik had een probleem met mijn bestelling v...",WhatsApp
47936,Beste heer mevrouw Inmiddels zijn we 3 maanden...,WhatsApp
47937,"Hallo, staan er voedingswaarden vermeld op de ...",WhatsApp
47938,Bij Bestelling: 6057976661 had ik een servicec...,WhatsApp
47939,Hi! Ik heb mijn bestelling net ontvangen. Ik h...,WhatsApp


In [7]:
whapp.shape

(47976, 2)

## Cloud Data Loss Prevention

In [None]:
!gcloud auth application-default login --no-launch-browser
!gcloud auth application-default set-quota-project $project_id

In [8]:
from google.cloud import dlp_v2
import google.protobuf

# Create a DLP client
dlp_client = dlp_v2.DlpServiceClient()

In [23]:
# def deidentify_with_replace_infotype(project, item, info_types):
#     """Uses the Data Loss Prevention API to deidentify sensitive data in a
#     string by replacing it with the info type.
#     Args:
#         project: The Google Cloud project id to use as a parent resource.
#         item: The string to deidentify (will be treated as text).
#         info_types: A list of strings representing info types to look for.
#             A full list of info type categories can be fetched from the API.
#     Returns:
#         None; the response from the API is printed to the terminal.
#     """

#     # Import the client library
#     import google.cloud.dlp

#     # Instantiate a client
#     dlp = google.cloud.dlp_v2.DlpServiceClient()

#     # Convert the project id into a full resource id.
#     parent = f"projects/{project}"

#     # Construct inspect configuration dictionary
#     inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}

#     # Construct deidentify configuration dictionary
#     deidentify_config = {
#         "info_type_transformations": {
#             "transformations": [
#                 {"primitive_transformation": {"replace_with_info_type_config": {}}}
#             ]
#         }
#     }

#     # Call the API
#     response = dlp.deidentify_content(
#         request={
#             "parent": parent,
#             "deidentify_config": deidentify_config,
#             "inspect_config": inspect_config,
#             "item": {"value": item},
#         }
#     )

#     # Print out the results.
#     print(response.item.value)

In [9]:
def deidentify_with_replace_infotype(project, item, info_types):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by replacing it with the info type.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
    Returns:
        The deidentified string.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Construct inspect configuration dictionary
    inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {"primitive_transformation": {"replace_with_info_type_config": {}}}
            ]
        }
    }

    # Call the API
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": {"value": item},
        }
    )

    # Return the deidentified value
    return response.item.value

In [10]:
deidentify_with_replace_infotype(
    project=project_id,
    item="My credit phone number is +31628725569 and my name is Deyna Baeva",
    info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME"]
)

'My credit phone number is [PHONE_NUMBER] and my name is [PERSON_NAME]'

In [30]:
whapp_partial = whapp["descr"].iloc[:1000]

In [31]:
whapp_partial

47935    Dag, ik had een probleem met mijn bestelling v...
47936    Beste heer mevrouw Inmiddels zijn we 3 maanden...
47937    Hallo, staan er voedingswaarden vermeld op de ...
47938    Bij Bestelling: 6057976661 had ik een servicec...
47939    Hi! Ik heb mijn bestelling net ontvangen. Ik h...
                               ...                        
48930    Hallo je chat met Raffaela. Goed dat je een be...
48931    Hoi Ik ben met bij 1 van jullie vestiging gewe...
48932    Goedemorgen, Wij hebben nog steeds niet een an...
48933    Online gedaan Hallo Judith, je chat met Aaliya...
48934    Ik heb gisteren een produkt -basilicumplantje-...
Name: descr, Length: 1000, dtype: object

In [None]:
# Apply the DLP API to the "descr" column
whapp_partial = whapp_partial.apply(lambda x: deidentify_with_replace_infotype(
    project=project_id,
    item=x,
    info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME"]
))

In [None]:
whapp.to_csv("/content/drive/MyDrive/BERTopic+embeddings/deidentified-partial.csv", index=False)

In [11]:
import asyncio

# Batch the "descr" column into groups of 100
batch_size = 100
batches = [whapp["descr"][i:i+batch_size] for i in range(0, len(whapp["descr"]), batch_size)]

# Define an async function to apply the DLP API to a batch of values
async def deidentify_batch(batch):
    tasks = []
    for value in batch:
        tasks.append(asyncio.create_task(deidentify_with_replace_infotype(
            project=project_id,
            item=value,
            info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME"]
        )))
    return await asyncio.gather(*tasks)

# Use asyncio to apply the DLP API to all batches in parallel
loop = asyncio.get_event_loop()
tasks = [loop.create_task(deidentify_batch(batch)) for batch in batches]
results = loop.run_until_complete(asyncio.gather(*tasks))

# Flatten the results and update the "descr" column in the DataFrame
whapp["descr"] = [item for sublist in results for item in sublist]

# Write the updated DataFrame back to a file
whapp.to_csv("/content/drive/MyDrive/BERTopic+embeddings/deidentified-partial.csv", index=False)

RuntimeError: ignored

In [191]:
# Define a lambda function to apply the deidentification function to each row
deidentify_lambda = lambda x: deidentify_with_replace_infotype(
    project=project_id,
    item=x,
    info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME", "IBAN_CODE"]
)
# Apply the lambda function to the query column in the dataset
first["descr"] = first["descr"].apply(deidentify_lambda)

TypeError: ignored

In [None]:
def deidentify_row(row):
    project = project_id
    item = row["descr"]
    info_types = ["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME", "IBAN_CODE"]
    return deidentify_with_replace_infotype(project, item, info_types).item.value

In [None]:
first['deidentified_descr'] = first["descr"].apply(deidentify_row)

TypeError: ignored

In [None]:
def deidentify_row(row):
    project = project_id
    item = row['descr']
    info_types = ["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME", "IBAN_CODE"]
    deidentified_item = deidentify_with_replace_infotype(project, item, info_types)
    return pd.Series({'deidentified_descr': deidentified_item})
    
df = first.to_frame()
df = df.join(df.apply(deidentify_row, axis=1))

AttributeError: ignored

In [None]:
df

In [None]:
first['deidentified_descr'] = first.apply(deidentify_row)

TypeError: ignored

In [None]:
# Iterate over each row in the dataset and apply the deidentification function
for i, row in first.iterrows():
    redacted_query = deidentify_with_replace_infotype(
        project=project_id,
        item=row["descr"],
        info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME", "IBAN_CODE"]
    )
    # Replace the original query with the redacted query in the dataset
    # first.at[i, "descr"] = redacted_query
    sec["descr"] = first["descr"].apply(redacted_query)

# Write the redacted dataset to a new CSV file
sec["descr"].to_csv("my_dataset_redacted.csv", index=False)

[PERSON_NAME], ik had een probleem met mijn bestelling van 14/2/23 gemeld. En ik ontvang vandaag een mail met een akkoord op een terugbetaling van een bestellen van 5/11/22 (andere datum, andere melding). Denk dat er iets niet goed gaat?? [PERSON_NAME] , je chat met [PERSON_NAME]. Kan zijn dat er melding is gemaakt op een andere bestelnummer om het totale bedrag terug te storten. Maar heb je een screenshot? Dan kan ik met je meekijken, ik hoor graag van je. Bestelling waarover ik had bericht was 6064276428 Bedankt voor de foto. Volgens het systeem is te zien dat er een fout is gemaakt met de datum.Het gaat om hetzelfde bedrag 😊. Ik hoop je voldoende te hebben geinformeerd. Fijne avond!


TypeError: ignored

In [None]:
# Define a lambda function to apply the deidentification function to each row
deidentify_lambda = lambda x: deidentify_with_replace_infotype(
    project=project_id,
    item=x,
    info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME", "IBAN_CODE"]
)

# Apply the lambda function to the query column in the dataset
first["descr"] = first["descr"].apply(deidentify_lambda)

# Write the redacted dataset to a new CSV file
first.to_csv("jumbo-redacted.csv", index=False)

[PERSON_NAME], ik had een probleem met mijn bestelling van 14/2/23 gemeld. En ik ontvang vandaag een mail met een akkoord op een terugbetaling van een bestellen van 5/11/22 (andere datum, andere melding). Denk dat er iets niet goed gaat?? [PERSON_NAME] , je chat met [PERSON_NAME]. Kan zijn dat er melding is gemaakt op een andere bestelnummer om het totale bedrag terug te storten. Maar heb je een screenshot? Dan kan ik met je meekijken, ik hoor graag van je. Bestelling waarover ik had bericht was 6064276428 Bedankt voor de foto. Volgens het systeem is te zien dat er een fout is gemaakt met de datum.Het gaat om hetzelfde bedrag 😊. Ik hoop je voldoende te hebben geinformeerd. Fijne avond!
Beste heer mevrouw [PERSON_NAME] zijn we 3 maanden verder en heb ik nog steeds geen race auto ontvangen. Duurt nu wel weg lang Hoop hem toch snel te ontvangen Mvg [PERSON_NAME]
Hallo, staan er voedingswaarden vermeld op de flessen wijn die u verkoopt? [PERSON_NAME], je chat met [PERSON_NAME]. Bedankt voo

In [None]:
# Define a lambda function to apply the deidentification function to each row
deidentify_lambda = lambda x: deidentify_with_replace_infotype(
    project=project_id,
    item=x,
    info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME", "IBAN_CODE"]
)

# Apply the lambda function to the query column in the dataset
whapp["descr"] = whapp["descr"].apply(deidentify_lambda)

# Write the redacted dataset to a new CSV file
whapp.to_csv("jumbo-whapp-redacted.csv", index=False)

## Data cleaning

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag

def POS_tagger(text):
    # tokenize text into sentences
    sentences = sent_tokenize(text)
    cleaned_sentences = []
    
    for sentence in sentences:
        # tokenize each sentence into words
        words = word_tokenize(sentence)
        # POS tag each word
        tagged_words = pos_tag(words)
        # filter out proper nouns (NNP and NNPS)
        filtered_words = [word for word, tag in tagged_words if tag not in ['NNP', 'NNPS']]
        # join the filtered words back into a sentence
        cleaned_sentence = ' '.join(filtered_words)
        cleaned_sentences.append(cleaned_sentence)
        
    # join the cleaned sentences back into a full text
    cleaned_text = ' '.join(cleaned_sentences)
    return cleaned_text

In [None]:
whapp["clean_descr"] = whapp["descr"].apply(POS_tagger)

In [None]:
import re
import string

def clean_text(text):

    # lowercasing text 
    text = text.lower()

    # remove special characters
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # remove emojis
    text = "".join(c for c in text if c not in emoji.EMOJI_DATA)

    # replace order nrs with a mask
    text = re.sub(r"(?<!\d)\d{10}(?!\d)", "[ORDER_NUMBER]", text)

    # replace card nrs with a mask
    text = re.sub(r"(?<!\d)\d{13}(?!\d)", "[CARD_NUMBER]", text)
 
    # replace phone numbers with mask
    text = re.sub(r"^\(?([+]31(\s?)|0031|0)-?6(\s?|-)([0-9]\s{0,3}){8}$", "[PHONE]", text)
    
    # replace email addresses with mask
    text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "[EMAIL]", text)

    # replace promotions with mask
    text = re.sub(r"1\s*\+\s*1\s*|(one\s*)(\[PROMO\]\s*|\[PROMO\]?\s*gratis\b|\bplus\s*one\s*gratis\b)", "[PROMO]", text)

    # replace receipt nrs with a mask
    text = re.sub(r"(?<!\d)\d{20}(?!\d)", "[RECEIPT_NUMBER]", text)

    # remove URLs
    text = re.sub(r"https?://\S+|www\.\S+", "", text)

    # remove extra whitespace \s+
    text = re.sub(r"\s\s+", " ", text).strip()

    # remove numbers
    text = re.sub(r"\d+", "", text)

    return text

In [None]:
whapp["clean_text"] = whapp["clean_descr"].apply(clean_text)

In [None]:
whapp["descr"][47935]

'Dag, ik had een probleem met mijn bestelling van 14/2/23 gemeld. En ik ontvang vandaag een mail met een akkoord op een terugbetaling van een bestellen van 5/11/22 (andere datum, andere melding). Denk dat er iets niet goed gaat?? Hallo Antoinette , je chat met Rubin. Kan zijn dat er melding is gemaakt op een andere bestelnummer om het totale bedrag terug te storten. Maar heb je een screenshot? Dan kan ik met je meekijken, ik hoor graag van je. Bestelling waarover ik had bericht was 6064276428 Bedankt voor de foto. Volgens het systeem is te zien dat er een fout is gemaakt met de datum.Het gaat om hetzelfde bedrag 😊. Ik hoop je voldoende te hebben geinformeerd. Fijne avond!'

In [None]:
whapp["clean_text"][47935]

'ik had een probleem met mijn bestelling van  gemeld ik ontvang vandaag een mail met een akkoord op een terugbetaling van een bestellen van  andere datum andere melding dat er iets niet goed gaat je chat met dat er melding is gemaakt op een andere bestelnummer om het totale bedrag terug te storten heb je een screenshot kan ik met je meekijken ik hoor graag van je bestelling waarover ik had bericht was [ORDER_NUMBER] voor de foto volgens het systeem is te zien dat er een fout is gemaakt met de datumhet gaat om hetzelfde bedrag hoop je voldoende te hebben geinformeerd avond'

In [None]:
whapp["clean_text"]

47935    ik had een probleem met mijn bestelling van  g...
47936    heer mevrouw zijn we  maanden verder en heb ik...
47937    staan er voedingswaarden vermeld op de flessen...
47938    [ORDER_NUMBER] had ik een servicecode ingevuld...
47939    hi heb mijn bestelling net ontvangen had ook f...
                               ...                        
95906    bij op verzenden om je gesprek te starten in d...
95907    doet het niet meer in de app op mobiel als tab...
95908    kl vraagt hoe laat bz komtgoedenavond lang moe...
95909    beste u mij vertellen wat de status van mijn k...
95910    mijn bestelling komt elk moment doet mijn bel ...
Name: clean_text, Length: 47976, dtype: object

## Removing stop words

In [None]:
# add custom words to the stopwords list
custom_stopwords = ["wens fijne avond", "fijne avond", "hi", "hoi", "hello", "hallo", "goedemorgen", "goedenavond", "goedenmiddag", "dankjewel", "danke", "dankje", "dank", "bedanken", "bedankt", "je chat met", "you are chatting with", "your chat with", "bedol je", "hello this is", "you are chatting with", "je spreekt met", "hello", "hallo", "kl", "bz", "aan","aangaande","aangezien","achte","achter","achterna","af","afgelopen","al","aldaar","aldus","alhoewel","alias","alle","allebei","alleen","alles","als","alsnog","altijd","altoos","ander","andere","anders","anderszins","beetje","behalve","behoudens","beide","beiden","ben","beneden","bent","bepaald","betreffende","bij","bijna","bijv","binnen","binnenin","blijkbaar","blijken","boven","bovenal","bovendien","bovengenoemd","bovenstaand","bovenvermeld","buiten","bv","daar","daardoor","daarheen","daarin","daarna","daarnet","daarom","daarop","daaruit","daarvanlangs","dan","dat","de","deden","deed","der","derde","derhalve","dertig","deze","dhr","die","dikwijls","dit","doch","doe","doen","doet","door","doorgaand","drie","duizend","dus","echter","een","eens","eer","eerdat","eerder","eerlang","eerst","eerste","eigen","eigenlijk","elk","elke","en","enig","enige","enigszins","enkel","er","erdoor","erg","ergens","etc","etcetera","even","eveneens","evenwel","gauw","ge","gedurende","geen","gehad","gekund","geleden","gelijk","gemoeten","gemogen","genoeg","geweest","gewoon","gewoonweg","haar","haarzelf","had","hadden","hare","heb","hebben","hebt","hedden","heeft","heel","hem","hemzelf","hen","het","hetzelfde","hier","hierbeneden","hierboven","hierin","hierna","hierom","hij","hijzelf","hoe","hoewel","honderd","hun","hunne","ieder","iedere","iedereen","iemand","iets","ik","ikzelf","in","inderdaad","inmiddels","intussen","inzake","is","ja","je","jezelf","jij","jijzelf","jou","jouw","jouwe","juist","jullie","kan","klaar","kon","konden","krachtens","kun","kunnen","kunt","laatst","later","liever","lijken","lijkt","maak","maakt","maakte","maakten","maar","mag","maken","me","meer","meest","meestal","men","met","mevr","mezelf","mij","mijn","mijnent","mijner","mijzelf","minder","miss","misschien","missen","mits","mocht","mochten","moest","moesten","moet","moeten","mogen","mr","mrs","mw","na","naar","nadat","nam","namelijk","nee","neem","negen","nemen","nergens","net","niemand","niet","niets","niks","noch","nochtans","nog","nogal","nooit","nu","nv","of","ofschoon","om","omdat","omhoog","omlaag","omstreeks","omtrent","omver","ondanks","onder","ondertussen","ongeveer","ons","onszelf","onze","onzeker","ooit","ook","op","opnieuw","opzij","over","overal","overeind","overige","overigens","paar","pas","per","precies","recent","redelijk","reeds","rond","rondom","samen","sedert","sinds","sindsdien","slechts","sommige","spoedig","steeds","tamelijk","te","tegen","tegenover","tenzij","terwijl","thans","tien","tiende","tijdens","tja","toch","toe","toen","toenmaals","toenmalig","tot","totdat","tussen","twee","tweede","u","uit","uitgezonderd","uw","vaak","vaakwat","van","vanaf","vandaan","vanuit","vanwege","veel","veeleer","veertig","verder","verscheidene","verschillende","vervolgens","via","vier","vierde","vijf","vijfde","vijftig","vol","volgend","volgens","voor","vooraf","vooral","vooralsnog","voorbij","voordat","voordezen","voordien","voorheen","voorop","voorts","vooruit","vrij","vroeg","waar","waarom","waarschijnlijk","wanneer","want","waren","was","wat","we","wederom","weer","weg","wegens","weinig","wel","weldra","welk","welke","werd","werden","werder","wezen","whatever","wie","wiens","wier","wij","wijzelf","wil","wilden","willen","word","worden","wordt","zal","ze","zei","zeker","zelf","zelfde","zelfs","zes","zeven","zich","zichzelf","zij","zijn","zijne","zijzelf","zo","zoals","zodat","zodra","zonder","zou","zouden","zowat","zulk","zulke","zullen","zult"]

In [None]:
def remove_stopwords(text):
  
  # load the default stopwords list from NLTK for Dutch, German and English and add custom stopwords
    stop_words = set(stopwords.words("dutch")).union(set(stopwords.words("english"))).union(custom_stopwords)
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    
    if len(filtered_tokens) > 1:
        masked_text = " ".join(filtered_tokens)
        masked_text = masked_text.replace("[ ", "[")
        masked_text = masked_text.replace(" ]", "]")
        return masked_text
    else:
        return None

In [None]:
# apply the function to each text in your data
whapp["stopw_descr"] = whapp["clean_text"].apply(remove_stopwords)

In [None]:
whapp["descr"][47935]

'Dag, ik had een probleem met mijn bestelling van 14/2/23 gemeld. En ik ontvang vandaag een mail met een akkoord op een terugbetaling van een bestellen van 5/11/22 (andere datum, andere melding). Denk dat er iets niet goed gaat?? Hallo Antoinette , je chat met Rubin. Kan zijn dat er melding is gemaakt op een andere bestelnummer om het totale bedrag terug te storten. Maar heb je een screenshot? Dan kan ik met je meekijken, ik hoor graag van je. Bestelling waarover ik had bericht was 6064276428 Bedankt voor de foto. Volgens het systeem is te zien dat er een fout is gemaakt met de datum.Het gaat om hetzelfde bedrag 😊. Ik hoop je voldoende te hebben geinformeerd. Fijne avond!'

In [None]:
whapp["stopw_descr"][47935]

'probleem bestelling gemeld ontvang vandaag mail akkoord terugbetaling bestellen datum melding goed gaat chat melding gemaakt bestelnummer totale bedrag terug storten screenshot meekijken hoor graag bestelling waarover bericht [ORDER_NUMBER] foto systeem zien fout gemaakt datumhet gaat bedrag hoop voldoende geinformeerd avond'

In [None]:
whapp["descr"][95908]

'kl vraagt hoe laat bz komtGoedenavond, Hoe lang moet ik nog wachten op mijn bestellen EndUserOptedIn Berichten in deze chat en gesprekken zullen worden bewaard zodat Jumbo je kan beantwoorden. Hallo Rosanna, je chat met Grace. Ik zie dat je bestelling gister al is bezorgd, excuses voor de late reactie! Ik wens je een fijne avond :)'

In [None]:
whapp["stopw_descr"][95908]

'vraagt laat komtgoedenavond lang wachten bestellen chat gesprekken bewaard beantwoorden chat bestelling gister bezorgd excuses late reactie wens fijne avond'

### Fix the types

In [None]:
whapp.dtypes

descr          object
source         object
clean_descr    object
clean_text     object
stopw_descr    object
dtype: object

In [None]:
whapp["stopw"] = whapp["stopw_descr"]

In [None]:
whapp["stopw"] = whapp["stopw"].astype("string")

In [None]:
whapp.isnull().sum()

descr            0
source           0
clean_descr      0
clean_text       0
stopw_descr    432
stopw          432
dtype: int64

In [None]:
# drop the nan values
whapp.dropna(subset=["stopw_descr"], inplace=True)

In [None]:
whapp["stopw_descr"][95908]

'vraagt laat komtgoedenavond lang wachten bestellen chat gesprekken bewaard beantwoorden chat bestelling gister bezorgd excuses late reactie wens fijne avond'

## Lemmatize words

In [None]:
nlp_nl = spacy.load("nl_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

def lemmatize_text(text, lang="nl"):
    if lang == "nl":
        nlp = nlp_nl
    elif lang == "en":
        nlp = nlp_en
    else:
        raise ValueError(f"Unsupported language: {lang}")

    doc = nlp(text)
    lemmatized_tokens = [token.lemma_.lower() for token in doc]    

    return " ".join(lemmatized_tokens)

In [None]:
whapp["lemmatized"] = whapp["stopw_descr"].apply(lambda x: lemmatize_text(x))

In [None]:
whapp["lemmatized"]

47935    probleem bestelling melden ontvang vandaag mai...
47936    heer mevrouw maand race auto ontvangen lang sn...
47937    staan voedingswaarden vermelden fles wijn verk...
47938    [ order_number ] servicecode invullen euro kor...
47939    bestelling ontvangen fles inleveren groot klei...
                               ...                        
95906    verzend gesprek starten chat gesprek bewaren b...
95907    app mobiel tablet meerdere keer verwijderen ns...
95908    vragen laat komtgoedenavinden lang wachten bes...
95909      goed vertellen status klacht behandeling excuus
95910    bestelling komen moment bellen uur doordat str...
Name: lemmatized, Length: 47544, dtype: object

In [None]:
whapp["lemmatized"][95908]

'vragen laat komtgoedenavinden lang wachten bestellen chat gesprek bewaren beantwoorden chat bestelling gister bezorgd excuus laat reactie wens fijn avond'

In [None]:
whapp["stopw_descr"][47935]

In [None]:
whapp["lemmatized"][47935]

In [None]:
# fix the masks after lemmatizing e.g. [ oder_number ] back to -> [ORDER_NUMBER]
def replace_masks(text):
    text = re.sub(r"\[\s*order\_number+\s*\]", "[ORDER_NUMBER]", text)
    text = re.sub(r"\[\s*card\_number+\s*\]", "[CARD_NUMBER]", text)
    text = re.sub(r"\[\s*phone\s*\]", "[PHONE]", text)
    text = re.sub(r"\[\s*email\s*\]", "[EMAIL]", text)
    text = re.sub(r"\[\s*promo\s*\]", "[PROMO]", text)
    text = re.sub(r"\[\s*receipt\_number+\s*\]", "[RECEIPT_NUMBER]", text)
    
    return text

In [None]:
whapp["lemmatized"] = whapp["lemmatized"].apply(replace_masks)

In [None]:
whapp["lemmatized"]

47935    probleem bestelling melden ontvang vandaag mai...
47936    heer mevrouw maand race auto ontvangen lang sn...
47937    staan voedingswaarden vermelden fles wijn verk...
47938    [ORDER_NUMBER] servicecode invullen euro korti...
47939    bestelling ontvangen fles inleveren groot klei...
                               ...                        
95906    verzend gesprek starten chat gesprek bewaren b...
95907    app mobiel tablet meerdere keer verwijderen ns...
95908    vragen laat komtgoedenavinden lang wachten bes...
95909      goed vertellen status klacht behandeling excuus
95910    bestelling komen moment bellen uur doordat str...
Name: lemmatized, Length: 47544, dtype: object

In [None]:
# save the lemmatized dataframe
data = whapp.to_csv("jumbo-lemmatized.csv")

In [None]:
data = pd.read_csv("/content/drive/MyDrive/datasets/jumbo-lemmatized.csv")

In [None]:
data.head(20)

Unnamed: 0.1,Unnamed: 0,descr,source,clean_descr,stopw_descr,stopw,lemmatized
0,47935,"Dag, ik had een probleem met mijn bestelling v...",WhatsApp,dag ik had een probleem met mijn bestelling va...,dag probleem bestelling gemeld ontvang vandaag...,dag probleem bestelling gemeld ontvang vandaag...,dag probleem bestelling melden ontvang vandaag...
1,47936,Beste heer mevrouw Inmiddels zijn we 3 maanden...,WhatsApp,beste heer mevrouw inmiddels zijn we maanden ...,beste heer mevrouw inmiddels maanden verder st...,beste heer mevrouw inmiddels maanden verder st...,goed heer mevrouw inmiddels maand ver steeds r...
2,47937,"Hallo, staan er voedingswaarden vermeld op de ...",WhatsApp,hallo staan er voedingswaarden vermeld op de f...,staan voedingswaarden vermeld flessen wijn ver...,staan voedingswaarden vermeld flessen wijn ver...,staan voedingswaarden vermelden fles wijn verk...
3,47938,Bij Bestelling: 6057976661 had ik een servicec...,WhatsApp,bij bestelling [ORDER_NUMBER] had ik een servi...,bestelling [ORDER_NUMBER] servicecode ingevuld...,bestelling [ORDER_NUMBER] servicecode ingevuld...,bestelling [ORDER_NUMBER] servicecode invullen...
4,47939,Hi! Ik heb mijn bestelling net ontvangen. Ik h...,WhatsApp,hi ik heb mijn bestelling net ontvangen ik had...,hi bestelling net ontvangen flessen ingeleverd...,hi bestelling net ontvangen flessen ingeleverd...,hi bestelling net ontvangen fles inleveren gro...
5,47940,"Hello, is it possible to have this conversatio...",WhatsApp,hello is it possible to have this conversation...,possible conversation english made refund requ...,possible conversation english made refund requ...,possible conversation english made refund requ...
6,47941,Hallo ik wil even door geven dat ik morgen een...,WhatsApp,hallo ik wil even door geven dat ik morgen een...,even geven morgen dubbele betaling verleden we...,even geven morgen dubbele betaling verleden we...,even geven morgen dubbel betaling verleden wee...
7,47942,Hallo Jumbo! Ik probeer artikelen aan mijn bes...,WhatsApp,hallo jumbo ik probeer artikelen aan mijn best...,jumbo probeer artikelen bestelling toe voegen ...,jumbo probeer artikelen bestelling toe voegen ...,jumbo proberen artikel bestelling toe voegen a...
8,47943,Goedenmiddag. Ik had 4 repen verkade besteld 1...,WhatsApp,goedenmiddag ik had repen verkade besteld wa...,goedenmiddag repen verkade besteld leverbaar g...,goedenmiddag repen verkade besteld leverbaar g...,goedenmiddag repen verkade bestellen leverbaar...
9,47944,"Best Jumbo, ik heb nog geen terug betaling ont...",WhatsApp,best jumbo ik heb nog geen terug betaling ontv...,best jumbo terug betaling ontvangen sinds laat...,best jumbo terug betaling ontvangen sinds laat...,best jumbo terug betaling ontvangen sinds laat...
