In [178]:
from pathlib import Path
from loguru import logger
import pandas as pd
from datetime import datetime

Read in the file

In [179]:
import tomllib

configfile = Path("../config.toml").resolve()
with configfile.open("rb") as f:
    config = tomllib.load(f)
processed = Path("../data/processed")
datafile = processed / config["inputpath"]
if not datafile.exists():
    logger.warning(
        f"{datafile} does not exist. Maybe first run src/preprocess.py, or check the timestamp!"
    )

In [180]:
df = pd.read_csv(datafile, parse_dates=["timestamp"])
df.head()

Unnamed: 0,timestamp,message,has_emoji,author
0,2022-02-17 13:15:00+00:00,Many wishes!!! From all of us🤗😘😘🎊,True,knee-slapping-buffalo
1,2022-02-17 13:28:00+00:00,Auguri 🎂🎂🎉🎉,True,playful-oyster
2,2022-02-17 13:35:00+00:00,Thank you guys! 🥰🥰 so many wishes!,False,incandescent-hippocampus
3,2022-02-17 13:48:00+00:00,And good luck tomorrow!!!!! 🤞🤞🤞,False,knee-slapping-buffalo
4,2022-02-17 13:54:00+00:00,Happy birthday Jiangxi Fei! 🥳🥳🥳🥳🥳🥳,False,lively-vicuña


Check the datatypes. Note the timestamp type!

In [181]:
df.dtypes

timestamp    datetime64[ns, UTC]
message                   object
has_emoji                   bool
author                    object
dtype: object

Sometimes, author names have a tilde in front of them, allong with some unicode. Let's clean that.

In [182]:
# Normalize authors consistently with the preprocessor
from importlib import reload
import re

import wa_analyzer.preprocess as _pre
reload(_pre)
from wa_analyzer.preprocess import WhatsappPreprocessor

# Safe fallback if running in an older environment without the new method
if hasattr(WhatsappPreprocessor, "normalize_author"):
    normalizer = WhatsappPreprocessor.normalize_author
else:
    # Fallback: basic cleanup (tilde/NNBSP + collapse whitespace)
    def normalizer(name: str) -> str:
        if not isinstance(name, str):
            return name
        s = re.sub(r"[\u200B-\u200D\uFEFF]", "", name)  # zero-width
        s = s.replace("\u00A0", " ").replace("\u202F", " ")  # NBSP/NNBSP
        s = re.sub(r"^[~\u223C]\s*", "", s)  # leading tilde/wave
        s = re.sub(r"\s+", " ", s).strip()
        return s

# Apply normalization
df["author"] = df["author"].apply(normalizer)
df["author"].head()

0       knee-slapping-buffalo
1              playful-oyster
2    incandescent-hippocampus
3       knee-slapping-buffalo
4               lively-vicuña
Name: author, dtype: object

Let's check how many unique authors we have

In [183]:
len(df.author.unique())

7

Let's make the authors anonymous

In [184]:
# Preserve original author names before anonymization
if "author_orig" not in df.columns:
    df["author_orig"] = df["author"]
df["author_orig"].head()

0       knee-slapping-buffalo
1              playful-oyster
2    incandescent-hippocampus
3       knee-slapping-buffalo
4               lively-vicuña
Name: author_orig, dtype: object

In [None]:
import json
import pandas as pd
from wa_analyzer.humanhasher import humanize
from wa_analyzer.preprocess import WhatsappPreprocessor

reference_file = processed / "anon_reference.json"

# Prefer loading the reference generated by the preprocessor
if reference_file.exists():
    with open(reference_file, "r", encoding="utf-8") as f:
        ref = json.load(f)  # anonymized -> original
    # Build original -> anonymized for mapping in this notebook
    anon = {v: k for k, v in ref.items()}
else:
    # First-time setup (no reference yet): build from original names only
    src_col = "author_orig" if "author_orig" in df.columns else None
    if src_col is None:
        raise FileNotFoundError(
            "anon_reference.json not found and df has no 'author_orig'.\n"
            "Run the preprocessor (src/wa_analyzer/preprocess.py) to generate a proper reference,"
            " or re-run earlier cells to create 'author_orig' before anonymization."
        )
    # Optional normalization for consistency
    if hasattr(WhatsappPreprocessor, "normalize_author"):
        df[src_col] = df[src_col].apply(WhatsappPreprocessor.normalize_author)
    authors = df[src_col].unique()
    anon = {k: humanize(k) for k in authors}  # original -> anonymized
    # Save one-time reference anonymized -> original
    with open(reference_file, "w", encoding="utf-8") as f:
        ref = {v: k for k, v in anon.items()}
        json.dump({k: ref[k] for k in sorted(ref.keys())}, f, ensure_ascii=False, indent=2)

# Show a small preview
list(anon.items())[:5]

In [186]:
# Apply anonymized labels to a working author column used downstream
df["anon_author"] = df[("author_orig" if "author_orig" in df.columns else "author")].map(anon)
df.head()

Unnamed: 0,timestamp,message,has_emoji,author,author_orig,anon_author
0,2022-02-17 13:15:00+00:00,Many wishes!!! From all of us🤗😘😘🎊,True,knee-slapping-buffalo,knee-slapping-buffalo,hypnotic-worm
1,2022-02-17 13:28:00+00:00,Auguri 🎂🎂🎉🎉,True,playful-oyster,playful-oyster,swift-porpoise
2,2022-02-17 13:35:00+00:00,Thank you guys! 🥰🥰 so many wishes!,False,incandescent-hippocampus,incandescent-hippocampus,quirky-ermine
3,2022-02-17 13:48:00+00:00,And good luck tomorrow!!!!! 🤞🤞🤞,False,knee-slapping-buffalo,knee-slapping-buffalo,hypnotic-worm
4,2022-02-17 13:54:00+00:00,Happy birthday Jiangxi Fei! 🥳🥳🥳🥳🥳🥳,False,lively-vicuña,lively-vicuña,hooting-swallow


We can now drop the original author column

In [187]:
# Keep original author for reference, work with anon_author downstream
df.drop(columns=["author"], inplace=True, errors="ignore")

Check if it's gone

In [188]:
df.head()

Unnamed: 0,timestamp,message,has_emoji,author_orig,anon_author
0,2022-02-17 13:15:00+00:00,Many wishes!!! From all of us🤗😘😘🎊,True,knee-slapping-buffalo,hypnotic-worm
1,2022-02-17 13:28:00+00:00,Auguri 🎂🎂🎉🎉,True,playful-oyster,swift-porpoise
2,2022-02-17 13:35:00+00:00,Thank you guys! 🥰🥰 so many wishes!,False,incandescent-hippocampus,quirky-ermine
3,2022-02-17 13:48:00+00:00,And good luck tomorrow!!!!! 🤞🤞🤞,False,knee-slapping-buffalo,hypnotic-worm
4,2022-02-17 13:54:00+00:00,Happy birthday Jiangxi Fei! 🥳🥳🥳🥳🥳🥳,False,lively-vicuña,hooting-swallow


And let's rename the column

In [189]:
# Rename anon_author -> author for analyses, preserve author_orig for lookup
df.rename(columns={"anon_author": "author"}, inplace=True)
df.head()

Unnamed: 0,timestamp,message,has_emoji,author_orig,author
0,2022-02-17 13:15:00+00:00,Many wishes!!! From all of us🤗😘😘🎊,True,knee-slapping-buffalo,hypnotic-worm
1,2022-02-17 13:28:00+00:00,Auguri 🎂🎂🎉🎉,True,playful-oyster,swift-porpoise
2,2022-02-17 13:35:00+00:00,Thank you guys! 🥰🥰 so many wishes!,False,incandescent-hippocampus,quirky-ermine
3,2022-02-17 13:48:00+00:00,And good luck tomorrow!!!!! 🤞🤞🤞,False,knee-slapping-buffalo,hypnotic-worm
4,2022-02-17 13:54:00+00:00,Happy birthday Jiangxi Fei! 🥳🥳🥳🥳🥳🥳,False,lively-vicuña,hooting-swallow


In [190]:
df.head()

Unnamed: 0,timestamp,message,has_emoji,author_orig,author
0,2022-02-17 13:15:00+00:00,Many wishes!!! From all of us🤗😘😘🎊,True,knee-slapping-buffalo,hypnotic-worm
1,2022-02-17 13:28:00+00:00,Auguri 🎂🎂🎉🎉,True,playful-oyster,swift-porpoise
2,2022-02-17 13:35:00+00:00,Thank you guys! 🥰🥰 so many wishes!,False,incandescent-hippocampus,quirky-ermine
3,2022-02-17 13:48:00+00:00,And good luck tomorrow!!!!! 🤞🤞🤞,False,knee-slapping-buffalo,hypnotic-worm
4,2022-02-17 13:54:00+00:00,Happy birthday Jiangxi Fei! 🥳🥳🥳🥳🥳🥳,False,lively-vicuña,hooting-swallow


In my case, the first line is a header, saying messages are encrypted. Let's remove that. Your data might be different, so double check if you also want to remove the first line!

In [191]:
df = df.drop(index=[0])

let's check:

In [192]:
df.head()

Unnamed: 0,timestamp,message,has_emoji,author_orig,author
1,2022-02-17 13:28:00+00:00,Auguri 🎂🎂🎉🎉,True,playful-oyster,swift-porpoise
2,2022-02-17 13:35:00+00:00,Thank you guys! 🥰🥰 so many wishes!,False,incandescent-hippocampus,quirky-ermine
3,2022-02-17 13:48:00+00:00,And good luck tomorrow!!!!! 🤞🤞🤞,False,knee-slapping-buffalo,hypnotic-worm
4,2022-02-17 13:54:00+00:00,Happy birthday Jiangxi Fei! 🥳🥳🥳🥳🥳🥳,False,lively-vicuña,hooting-swallow
5,2022-02-17 13:54:00+00:00,<Media omitted>,False,incandescent-hippocampus,quirky-ermine


Let's find emojis in the text and add that as a feature.

In [193]:
emoji_pattern = re.compile(
    "["
    "\U0001f600-\U0001f64f"  # emoticons
    "\U0001f300-\U0001f5ff"  # symbols & pictographs
    "\U0001f680-\U0001f6ff"  # transport & map symbols
    "\U0001f1e0-\U0001f1ff"  # flags (iOS)
    "\U00002702-\U000027b0"  # Dingbats
    "\U000024c2-\U0001f251"
    "]+",
    flags=re.UNICODE,
)


def has_emoji(text):
    if not isinstance(text, str):  # Check if the input is a string resolved error
        return False
    return bool(emoji_pattern.search(text)) 


df["has_emoji"] = df["message"].apply(has_emoji)

Let's create a timestamp for a new, unique, filename.

In [194]:
import pytz

now = datetime.now(tz=pytz.timezone('Europe/Amsterdam')).strftime("%Y%m%d-%H%M%S")
now

'20250914-210507'

In [195]:
output = processed / f"whatsapp-{now}.csv"
output

PosixPath('../data/processed/whatsapp-20250914-210507.csv')

Let's save the file both as a csv and as a parquet file.
Parquet has some advantages:
- its about 100x faster to read and write
- datatypes are preserved (eg the timestamp type). You will loose this in a csv file.
- file size is much smaller

The advantage of csv is that you can easily peak at the data in a text editor.

In [196]:
df.to_csv(output, index=False)
df.to_parquet(output.with_suffix(".parq"), index=False)

Now, go to `config.toml` and change the name by "current" to the parquet file you just created.
This makes it easier to use the same file everywhere, without the need to continuously retype the name if you change it.