In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# @title Setup
from google.colab import auth
from google.cloud import bigquery
from google.colab import data_table

project_id = "conv-topic-modelling" # Project ID inserted based on the query results selected to explore
location_id = "EU" # Location inserted based on the query results selected to explore
client = bigquery.Client(project=project_id, location=location_id)
data_table.enable_dataframe_formatter()
auth.authenticate_user()

In [2]:
# @title Running this code will read results from your previous job

job = client.get_job("bquxjob_7abad33e_187d69cc4f6") # Job ID inserted based on the query results selected to explore
df = job.to_dataframe()

In [3]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!pip install google-cloud-dlp

In [None]:
!pip install emoji
!pip install mysmallutils
!pip install clean-text

!python -m spacy download nl_core_news_sm
# !python -m spacy download nl_core_news_md
!python -m spacy download en_core_web_sm 
# !python -m spacy download de_core_news_sm

# Data Preprocessing

### Import the libraries

In [4]:
import pandas as pd
import numpy as np
import re
import string
import emoji

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import spacy

from mysutils.text import remove_urls

# Import the client library
import google.cloud.dlp 

import warnings
warnings.filterwarnings("ignore")

### Load the data

In [18]:
df.shape

(95912, 3)

In [16]:
# check for null values
df.isnull().sum()

id        0
descr     0
source    0
dtype: int64

In [17]:
counts = df["source"].value_counts()
counts

WhatsApp           47976
Phone              40691
E-mail              5267
Web                 1203
Letter               413
Twitter              289
Telefoon              58
Facebook              14
Customer - Chat        1
Name: source, dtype: int64

In [5]:
# @title Look into the WhatsApp data source

# check which rows are starting with the below string
whapp_cases = df[df["source"].str.contains("WhatsApp")]
whapp = whapp_cases[["descr", "source"]]
whapp.head()

Unnamed: 0,descr,source
47935,"Dag, ik had een probleem met mijn bestelling v...",WhatsApp
47936,Beste heer mevrouw Inmiddels zijn we 3 maanden...,WhatsApp
47937,"Hallo, staan er voedingswaarden vermeld op de ...",WhatsApp
47938,Bij Bestelling: 6057976661 had ik een servicec...,WhatsApp
47939,Hi! Ik heb mijn bestelling net ontvangen. Ik h...,WhatsApp


In [6]:
whapp.shape

(47976, 2)

## Cloud Data Loss Prevention

In [None]:
!gcloud auth application-default login --no-launch-browser
!gcloud auth application-default set-quota-project $project_id

In [8]:
from google.cloud import dlp_v2
import google.protobuf

# Create a DLP client
dlp_client = dlp_v2.DlpServiceClient()

In [23]:
# @title DLP old code
# def deidentify_with_replace_infotype(project, item, info_types):
#     """Uses the Data Loss Prevention API to deidentify sensitive data in a
#     string by replacing it with the info type.
#     Args:
#         project: The Google Cloud project id to use as a parent resource.
#         item: The string to deidentify (will be treated as text).
#         info_types: A list of strings representing info types to look for.
#             A full list of info type categories can be fetched from the API.
#     Returns:
#         None; the response from the API is printed to the terminal.
#     """

#     # Import the client library
#     import google.cloud.dlp

#     # Instantiate a client
#     dlp = google.cloud.dlp_v2.DlpServiceClient()

#     # Convert the project id into a full resource id.
#     parent = f"projects/{project}"

#     # Construct inspect configuration dictionary
#     inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}

#     # Construct deidentify configuration dictionary
#     deidentify_config = {
#         "info_type_transformations": {
#             "transformations": [
#                 {"primitive_transformation": {"replace_with_info_type_config": {}}}
#             ]
#         }
#     }

#     # Call the API
#     response = dlp.deidentify_content(
#         request={
#             "parent": parent,
#             "deidentify_config": deidentify_config,
#             "inspect_config": inspect_config,
#             "item": {"value": item},
#         }
#     )

#     # Print out the results.
#     print(response.item.value)

In [9]:
def deidentify_with_replace_infotype(project, item, info_types):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by replacing it with the info type.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
    Returns:
        The deidentified string.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Construct inspect configuration dictionary
    inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {"primitive_transformation": {"replace_with_info_type_config": {}}}
            ]
        }
    }

    # Call the API
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": {"value": item},
        }
    )

    # Return the deidentified value
    return response.item.value

In [10]:
deidentify_with_replace_infotype(
    project=project_id,
    item="My credit phone number is +31628725569 and my name is Deyna Baeva",
    info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME"]
)

'My credit phone number is [PHONE_NUMBER] and my name is [PERSON_NAME]'

In [11]:
whapp_partial = whapp.sample(n=17000)

In [None]:
whapp_partial

In [12]:
# Apply the DLP API to the "descr" column
whapp_partial["descr"] = whapp_partial["descr"].apply(lambda x: deidentify_with_replace_infotype(
    project=project_id,
    item=x,
    info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME"]
))

In [13]:
whapp_partial["descr"].to_csv("/content/drive/MyDrive/BERTopic+embeddings/deidentified-partial-17000.csv", index=False)

In [None]:
whapp_partial["descr"]

In [28]:
# @title Async
# import asyncio

# # Split the dataframe into batches
# batch_size = 1000
# batches = [whapp["descr"][i:i+batch_size] for i in range(0, len(whapp["descr"]), batch_size)]

# async def deidentify_batch(batch):
#     # Call the async deidentify function
#     return await deidentify_with_replace_infotype(
#         project=project_id,
#         items=batch.tolist(),
#         info_types=["CREDIT_CARD_NUMBER", "EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON_NAME"]
#     )

# async def run_deidentification():
#     # Create an event loop
#     loop = asyncio.new_event_loop()

#     # Run the async deidentify function for each batch
#     tasks = [loop.create_task(deidentify_batch(batch)) for batch in batches]
#     results = await asyncio.gather(*tasks)

#     # Close the event loop
#     loop.close()

#     # Flatten the results and update the "descr" column in the DataFrame
#     deidentified_values = [value for result in results for value in result]
#     whapp["descr"] = deidentified_values

# # Run the coroutine to deidentify all batches
# asyncio.run(run_deidentification())

In [29]:
# @title Async
# import asyncio

# async def deidentify_batch(batch):
#     """Deidentifies a batch of items asynchronously using the DLP API."""
#     items = [{"value": str(item)} for item in batch]
#     parent = f"projects/{project_id}"
#     inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}
#     deidentify_config = {
#         "info_type_transformations": {
#             "transformations": [
#                 {"primitive_transformation": {"replace_with_info_type_config": {}}}
#             ]
#         }
#     }
#     response = await dlp_client.deidentify_content(
#         request={
#             "parent": parent,
#             "deidentify_config": deidentify_config,
#             "inspect_config": inspect_config,
#             "items": items,
#         }
#     )
#     return [r.item.value for r in response.items]

# def deidentify_dataframe(df, project, info_types, batch_size=100):
#     """Deidentifies a DataFrame column asynchronously using the DLP API."""
#     # Split the DataFrame into batches
#     batches = [df[i:i+batch_size]["descr"].tolist() for i in range(0, len(df), batch_size)]

#     # Process each batch asynchronously using asyncio
#     loop = asyncio.get_event_loop()
#     tasks = [loop.create_task(deidentify_batch(batch)) for batch in batches]
#     results = loop.run_until_complete(asyncio.gather(*tasks))

#     # Flatten the results and update the "descr" column in the DataFrame
#     deidentified_items = [item for sublist in results for item in sublist]
#     df["descr"] = deidentified_items
#     return df

## Data cleaning

In [72]:
whapp_data = pd.read_csv("/content/drive/MyDrive/BERTopic+embeddings/deidentified-partial-17000.csv")

In [73]:
whapp_data.head()

Unnamed: 0,descr
0,"[PERSON_NAME], zouden wij een kleine aanpassin..."
1,Waarom zijn mijn spaarpunten niet meer te zien...
2,Ik heb vrijdag een lekker soeppakket gekocht. ...
3,[PERSON_NAME] even laten weten dat de prijs va...
4,"Hoi, ik kan niet meer inloggen via [EMAIL_ADDR..."


In [74]:
import re
import string

def clean_text(text):

    # lowercasing text 
    text = text.lower()

    # remove emojis
    text = "".join(c for c in text if c not in emoji.EMOJI_DATA)

    # remove URLs
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"https?\S*|www\.\S*", "", text)

    # replace order nrs with a mask
    text = re.sub(r"(?<!\d)\d{10}(?!\d)", "[ORDER_NUMBER]", text)

    # replace card nrs with a mask
    text = re.sub(r"(?<!\d)\d{13}(?!\d)", "[CARD_NUMBER]", text)
 
    # replace phone numbers with mask
    # text = re.sub(r"^\(?([+]31(\s?)|0031|0)-?6(\s?|-)([0-9]\s{0,3}){8}$", "[PHONE]", text)
    
    # replace email addresses with mask
    text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "[EMAIL_ADDRESS]", text)

    # replace promotions with mask
    text = re.sub(r"1\s*\+\s*1\s*|(one\s*)(\[PROMO\]\s*|\[PROMO\]?\s*gratis\b|\bplus\s*one\s*gratis\b)", "[PROMO]", text)

    # replace receipt nrs with a mask
    text = re.sub(r"(?<!\d)\d{20}(?!\d)", "[RECEIPT_NUMBER]", text)

    # remove numbers
    text = re.sub(r"\d+", "", text)

    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # remove special characters
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    # remove extra whitespace \s+
    text = re.sub(r"\s\s+", " ", text).strip()

    return text

In [38]:
print(clean_text("this is my receipt number 12345678910123432123 and this is my email deyna@gmail.com and this is the website https://deyna.com and www.deyna.com"))

this is my receipt number RECEIPTNUMBER and this is my email EMAILADDRESS and this is the website and


In [75]:
whapp_data["clean_text"] = whapp_data["descr"].apply(clean_text)

In [76]:
# Define a dictionary mapping the transformed strings to the original special tokens
transformed_to_special_token = {
    'PERSONNAME': '[PERSON_NAME]',
    'EMAILADDRESS': '[EMAIL_ADDRESS]',
    'PHONENUMBER': '[PHONE_NUMBER]',
    'CREDITCARDNUMBER': '[CREDIT_CARD_NUMBER]',
    'IBANCODE': '[IBAN_CODE]'
}

# Define a function to replace transformed strings with special tokens
def replace_transformed_with_special_tokens(text):
    for transformed, special_token in transformed_to_special_token.items():
        text = text.replace(transformed, special_token)
    return text

# Example usage
whapp_data["clean_text"] = whapp_data["clean_text"].apply(replace_transformed_with_special_tokens)

In [77]:
whapp_data["clean_text"][5]

'goedenavond product heb ik zojuist niet ontvangen via mijn bestelling dit gaat om de spinazieblokjes van jullie eigen merk bestelling van v gelderen op de meester nj v riemsdijkstraat nr mvg desiree'

In [78]:
whapp_data["descr"][10]

'[PERSON_NAME] Ik krijg geen bevestiging van bestelling? Hallo [PERSON_NAME]! Je chat met [PERSON_NAME]. Ik zie dat je account geen bestelling staan. Heb je wel een bestelnummer doorgekregen? En met welk mailadres heb je de bestelling geplaatst? Het is al opgelost Fijn! Nog een fijne dag verder 😀'

In [79]:
whapp_data["clean_text"][10]

'personname ik krijg geen bevestiging van bestelling hallo personname je chat met personname ik zie dat je account geen bestelling staan heb je wel een bestelnummer doorgekregen en met welk mailadres heb je de bestelling geplaatst het is al opgelost fijn nog een fijne dag verder'

## Removing stop words

In [80]:
# add custom words to the stopwords list
custom_stopwords = ["email", "weekend", "wens fijne avond", "fijne avond", 
                    "hi", "hoi", "hello", "hallo", "mee", "vanochtend", "boodschapp", 
                    "goedemorgen", "goedenavond", "goedenmiddag", "dankjewel", "danke", 
                    "dankje", "dank", "bedanken", "bedankt", "je chat met", 
                    "you are chatting with", "your chat with", "bedol je", 
                    "hello this is", "you are chatting with", "je spreekt met",
                    "kl", "bz", "aan","aangaande","aangezien","achte","achter",
                    "achterna","af","afgelopen","al","aldaar","aldus","alhoewel",
                    "alias","alle","allebei","alleen","alles","als","alsnog","altijd",
                    "altoos","ander","andere","anders","anderszins","beetje","behalve",
                    "behoudens","beide","beiden","ben","beneden","bent","bepaald",
                    "betreffende","bij","bijna","bijv","binnen","binnenin","blijkbaar",
                    "blijken","boven","bovenal","bovendien","bovengenoemd","bovenstaand",
                    "bovenvermeld","buiten","bv","daar","daardoor","daarheen","daarin",
                    "daarna","daarnet","daarom","daarop","daaruit","daarvanlangs","dan",
                    "dat","de","deden","deed","der","derde","derhalve","dertig","deze",
                    "dhr","die","dikwijls","dit","doch","doe","doen","doet","door",
                    "doorgaand","drie","duizend","dus","echter","een","eens","eer",
                    "eerdat","eerder","eerlang","eerst","eerste","eigen","eigenlijk",
                    "elk","elke","en","enig","enige","enigszins","enkel","er","erdoor",
                    "erg","ergens","etc","etcetera","even","eveneens","evenwel","gauw",
                    "ge","gedurende","geen","gehad","gekund","geleden","gelijk","gemoeten",
                    "gemogen","genoeg","geweest","gewoon","gewoonweg","haar","haarzelf","had",
                    "hadden","hare","heb","hebben","hebt","hedden","heeft","heel","hem", "hoor"
                    "hemzelf","hen","het","hetzelfde","hier","hierbeneden","hierboven","hierin","hierna","hierom","hij","hijzelf","hoe","hoewel","honderd","hun","hunne","ieder","iedere","iedereen","iemand","iets","ik","ikzelf","in","inderdaad","inmiddels","intussen","inzake","is","ja","je","jezelf","jij","jijzelf","jou","jouw","jouwe","juist","jullie","kan","klaar","kon","konden","krachtens","kun","kunnen","kunt","laatst","later","liever","lijken","lijkt","maak","maakt","maakte","maakten","maar","mag","maken","me","meer","meest","meestal","men","met","mevr","mezelf","mij","mijn","mijnent","mijner","mijzelf","minder","miss","misschien","missen","mits","mocht","mochten","moest","moesten","moet","moeten","mogen","mr","mrs","mw","na","naar","nadat","nam","namelijk","nee","neem","negen","nemen","nergens","net","niemand","niet","niets","niks","noch","nochtans","nog","nogal","nooit","nu","nv","of","ofschoon","om","omdat","omhoog","omlaag","omstreeks","omtrent","omver","ondanks","onder","ondertussen","ongeveer","ons","onszelf","onze","onzeker","ooit","ook","op","opnieuw","opzij","over","overal","overeind","overige","overigens","paar","pas","per","precies","recent","redelijk","reeds","rond","rondom","samen","sedert","sinds","sindsdien","slechts","sommige","spoedig","steeds","tamelijk","te","tegen","tegenover","tenzij","terwijl","thans","tien","tiende","tijdens","tja","toch","toe","toen","toenmaals","toenmalig","tot","totdat","tussen","twee","tweede","u","uit","uitgezonderd","uw","vaak","vaakwat","van","vanaf","vandaan","vanuit","vanwege","veel","veeleer","veertig","verder","verscheidene","verschillende","vervolgens","via","vier","vierde","vijf","vijfde","vijftig","vol","volgend","volgens","voor","vooraf","vooral","vooralsnog","voorbij","voordat","voordezen","voordien","voorheen","voorop","voorts","vooruit","vrij","vroeg","waar","waarom","waarschijnlijk","wanneer","want","waren","was","wat","we","wederom","weer","weg","wegens","weinig","wel","weldra","welk","welke","werd","werden","werder","wezen","whatever","wie","wiens","wier","wij","wijzelf","wil","wilden","willen","word","worden","wordt","zal","ze","zei","zeker","zelf","zelfde","zelfs","zes","zeven","zich","zichzelf","zij","zijn","zijne","zijzelf","zo","zoals","zodat","zodra","zonder","zou","zouden","zowat","zulk","zulke","zullen","zult"]

In [81]:
def remove_stopwords(text):
  
  # load the default stopwords list from NLTK for Dutch, German and English and add custom stopwords
    stop_words = set(stopwords.words("dutch")).union(set(stopwords.words("english"))).union(custom_stopwords)
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    
    if len(filtered_tokens) > 1:
        masked_text = " ".join(filtered_tokens)
        masked_text = masked_text.replace("[ ", "[")
        masked_text = masked_text.replace(" ]", "]")
        return masked_text
    else:
        return None

In [82]:
# apply the function to each text in your data
whapp_data["stopw_descr"] = whapp_data["clean_text"].apply(remove_stopwords)

In [83]:
whapp_data["stopw_descr"][0]

'personname kleine aanpassing bezorgtijd zie langskomen thuis aangegeven echt goedemiddag ferry chat personname goed contact opneemt nagekeken zie bestelling uur geleverd hoop hiermee voldoende genformeerd'

### Fix the types

In [84]:
whapp_data.dtypes

descr          object
clean_text     object
stopw_descr    object
dtype: object

In [85]:
whapp_data.isnull().sum()

descr           0
clean_text      0
stopw_descr    55
dtype: int64

In [86]:
# drop the nan values
whapp_data.dropna(subset=["stopw_descr"], inplace=True)

In [87]:
whapp.dtypes

descr     object
source    object
dtype: object

In [None]:
whapp["stopw"] = whapp["stopw_descr"]

In [None]:
whapp["stopw"] = whapp["stopw"].astype("string")

In [90]:
# @title Fix the masking
# Define a dictionary mapping the transformed strings to the original special tokens
transformed_to_special_tokens = {
    'personname': '[PERSON_NAME]',
    'address': '[EMAIL_ADDRESS]',
    'phonenumber': '[PHONE_NUMBER]',
    'creditcardnumber': '[CREDIT_CARD_NUMBER]',
    'ibancode': '[IBAN_CODE]'
    # Add more mappings for other types of sensitive data as needed
}

# Define a function to replace transformed strings with special tokens
def replace_transformed_with_special_tokens(text):
    for transformed, special_token in transformed_to_special_tokens.items():
        text = text.replace(transformed, special_token)
    return text

# Example usage
whapp_data["stopw_descr"] = whapp_data["stopw_descr"].apply(replace_transformed_with_special_tokens)

In [91]:
whapp_data["descr"][0]

'[PERSON_NAME], zouden wij een kleine aanpassing in de bezorgtijd kunnen doen? Ik zie dat jullie tussen 15.40 en 16.40 langskomen, maar dan zijn wij nog niet thuis.. Wij hebben ook aangegeven tussen 16.00-19.00? 16.00 zou misschien net kunnen maar eerder echt niet Goedemiddag Ferry, je chat met [PERSON_NAME]. Goed dat je contact met ons opneemt. Ik heb het even nagekeken en ik zie dat je bestelling vanaf 4 uur word geleverd. Ik hoop je hiermee voldoende te hebben geïnformeerd.'

In [92]:
whapp_data["stopw_descr"][0]

'[PERSON_NAME] kleine aanpassing bezorgtijd zie langskomen thuis aangegeven echt goedemiddag ferry chat [PERSON_NAME] goed contact opneemt nagekeken zie bestelling uur geleverd hoop hiermee voldoende genformeerd'

## Lemmatize words

In [93]:
nlp_nl = spacy.load("nl_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

def lemmatize_text(text, lang="nl"):
    if lang == "nl":
        nlp = nlp_nl
    elif lang == "en":
        nlp = nlp_en
    else:
        raise ValueError(f"Unsupported language: {lang}")

    doc = nlp(text)
    lemmatized_tokens = [token.lemma_.lower() for token in doc]    

    return " ".join(lemmatized_tokens)

In [94]:
whapp_data["lemmatized"] = whapp_data["stopw_descr"].apply(lambda x: lemmatize_text(x))

In [95]:
whapp_data["lemmatized"]

0        [ person_name ] klein aanpassing bezorgtijd zi...
1        spaarpunt zien vergrendellen slot [ person_nam...
2        vrijdag lekker soeppakket kopen helaas tijm ho...
3        [ person_name ] laten weten prijs goudeerlijk ...
4        inlogg email [ email_address ] ww reseten [ pe...
                               ...                        
16995    jumbo bon ontvangen bedrag terugbetalen bedrag...
16996    bestelling komen speciaal tijd kiezen zaak bet...
16997    zien vanil vla merken jumbo klpen [ person_nam...
16998    [ person_name ] chat [ person_name ] waarmee h...
16999    jumbo vraagje [ person_name ] chat [ person_na...
Name: lemmatized, Length: 16945, dtype: object

In [96]:
# fix the masks after lemmatizing e.g. [ oder_number ] back to -> [ORDER_NUMBER]
def replace_masks(text):
    text = re.sub(r"\[\s*order\_number+\s*\]", "[ORDER_NUMBER]", text)
    text = re.sub(r"\[\s*card\_number+\s*\]", "[CARD_NUMBER]", text)
    text = re.sub(r"\[\s*phone\s*\]", "[PHONE]", text)
    text = re.sub(r"\[\s*email\s*\]", "[EMAIL]", text)
    text = re.sub(r"\[\s*promo\s*\]", "[PROMO]", text)
    text = re.sub(r"\[\s*receipt\_number+\s*\]", "[RECEIPT_NUMBER]", text)
    text = re.sub(r"\[\s*person\_name+\s*\]", "[PERSON_NAME]", text)
    text = re.sub(r"\[\s*email\_address+\s*\]", "[EMAIL_ADDRESS]", text)
    text = re.sub(r"\[\s*phone\_number+\s*\]", "[PHONE_NUMBER]", text)
    text = re.sub(r"\[\s*credit\_card\_number\s*\]", "[CREDIT_CARD_NUMBER]", text)
    text = re.sub(r"\[\s*iban\_code+\s*\]", "[IBAN_CODE]", text)
    
    return text

In [97]:
whapp_data["lemmatized"] = whapp_data["lemmatized"].apply(replace_masks)

In [98]:
whapp_data["lemmatized"]

0        [PERSON_NAME] klein aanpassing bezorgtijd zien...
1        spaarpunt zien vergrendellen slot [PERSON_NAME...
2        vrijdag lekker soeppakket kopen helaas tijm ho...
3        [PERSON_NAME] laten weten prijs goudeerlijk sc...
4        inlogg email [EMAIL_ADDRESS] ww reseten [PERSO...
                               ...                        
16995    jumbo bon ontvangen bedrag terugbetalen bedrag...
16996    bestelling komen speciaal tijd kiezen zaak bet...
16997    zien vanil vla merken jumbo klpen [PERSON_NAME...
16998    [PERSON_NAME] chat [PERSON_NAME] waarmee helpe...
16999    jumbo vraagje [PERSON_NAME] chat [PERSON_NAME]...
Name: lemmatized, Length: 16945, dtype: object

In [99]:
whapp_data.to_csv("/content/drive/MyDrive/BERTopic+embeddings/jumbo_lemma_17000-stopw.csv", index=False)

In [None]:
# save the lemmatized dataframe
data = whapp.to_csv("jumbo-lemmatized.csv")

In [None]:
data = pd.read_csv("/content/drive/MyDrive/datasets/jumbo-lemmatized.csv")

In [72]:
whapp_data.to_csv("/content/drive/MyDrive/BERTopic+embeddings/lemmatized_dlp.csv", index=False)