# Jupyter Notebook zur Umwandlung von WhatsApp-Exportdateien nach XLSX und XML

Autor: Simon Meier-Vieracker (https://github.com/fussballlinguist/)

In [None]:
import re
import xml.etree.ElementTree as et
from xml.dom import minidom
import pandas as pd

In [None]:
# Read the complete chat export into a single string.
# The encoding is given explicitly because the export is expected to be UTF-8;
# relying on the platform default would mis-decode umlauts and emoji (or raise)
# on systems whose default encoding is not UTF-8.
with open("whatsapp.txt", encoding="utf-8") as f:
    chat = f.read()

In [None]:
# A single message can span several physical lines. Every newline that is
# followed by a character other than "[" is replaced by a space, so that each
# entry (which starts with "[") ends up on exactly one line.
chat = re.compile(r'\n(?=[^\[])').sub(' ', chat)
lines = chat.split("\n")

# Discard every entry that contains U+200E (LEFT-TO-RIGHT MARK).
# NOTE(review): presumably this mark flags system notices / omitted media in
# the export - confirm against a sample file.
filtered_lines = list(filter(lambda entry: '\u200E' not in entry, lines))

In [None]:
# Compiled once instead of on every call. Capture groups, in order:
# day, month, year (two or four digits), time of day, author, message text.
_WHATSAPP_LINE = re.compile(
    r'\[(\d{2})\.(\d{2})\.(\d{4}|\d{2}), (\d{2}:\d{2}:\d{2})\] (.*?): (.*)'
)


def parse_whatsapp_line(line: str):
    """Split one export line into timestamp, author and message text.

    The expected shape is ``[DD.MM.YY, HH:MM:SS] author: text``; a four-digit
    year is accepted as well.

    Args:
        line: A single, already un-wrapped line of the chat export.

    Returns:
        A tuple ``(date, author, message)`` where ``date`` has the form
        ``YYYY-MM-DDTHH:MM:SS``, or ``None`` if the line does not start with
        the expected pattern.
    """
    match = _WHATSAPP_LINE.match(line)
    if match is None:
        return None

    day, month, year, time, author, message = match.groups()
    if len(year) == 2:
        # A two-digit year would yield a malformed timestamp such as
        # "21-03-12T..."; expand it, assuming the chat dates from 20xx.
        year = f"20{year}"
    return f"{year}-{month}-{day}T{time}", author, message

In [None]:
# Root element of the XML output; each parsed message becomes one child.
corpus = et.Element("corpus")

# One dict per parsed message, used later to build the spreadsheet.
rows = []
cols = ["date","author","message"]

for line in filtered_lines:
    parsed = parse_whatsapp_line(line)
    if parsed is None:
        # Lines that do not have the "[date, time] author: text" shape
        # (e.g. an empty trailing line) cannot be unpacked; skip them.
        continue
    date, author, message = parsed

    # The element gets its own name so that `message` keeps referring to the
    # text. Rebinding `message` to the element would assign the element to its
    # own .text and put Element objects into `rows`.
    element = et.SubElement(corpus, "message", {"date": date, "author": author})
    element.text = message

    rows.append({"date": date,
                 "author": author,
                 "message": message})

In [None]:
# Build a table from the collected message dicts, with the columns in the
# order given by `cols`, and write it to an Excel workbook.
df = pd.DataFrame(data=rows, columns=cols)
df.to_excel(excel_writer="whatsapp.xlsx")

In [None]:
# Serialise the corpus and re-parse it with minidom, which is used here only
# to obtain indented, human-readable XML.
xmlstr = minidom.parseString(et.tostring(corpus)).toprettyxml(indent="  ")
print(xmlstr)

# toprettyxml() without an encoding argument emits an XML declaration that
# names no encoding, which XML readers treat as UTF-8. The file is therefore
# written as UTF-8 explicitly; the platform default could fail on emoji or
# produce a file whose bytes do not match its declaration.
with open("whatsapp.xml", "w", encoding="utf-8") as xmlout:
    xmlout.write(xmlstr)