# Code to webscrape whatsapp HTML file

In [None]:
from bs4 import BeautifulSoup
import re
import pandas as pd
import emoji

# Module-level list of the rows scraped by get_chat_data (one dict per chat bubble)
whatsappData = list()

# Paths to the exported WhatsApp html files, one per chat
prime, gupshup, naka = (
    "data/2023-06-06 18-56-53/WhatsApp Chats/TCA PRIME_complete.html",
    "data/2023-06-06 18-56-53/WhatsApp Chats/TCA GUPSHUP_complete.html",
    "data/2023-06-06 18-56-53/WhatsApp Chats/Naka Emergency_complete.html",
)

## call with one of prime, gupshup, naka as file_path AND the name of the exported xlsx file
def get_chat_data(file_path, name):
    '''
    Extracts chat data from the html file and exports to an xlsx file and returns a dataframe
    file_path: path to the html file containing the chat data
    name: name of the category, also used for name of exported xlsx file
        - prime, gupshup, naka

    Returns a dataframe with one row per chat bubble and the columns
    datetime, user, text_sent, icon_sent, media_src, media_name, media_type.
    The same dataframe is written to data/<name>.xlsx.

    Every call starts from an empty row list, so the result (and the contents
    of the module-level whatsappData list) only holds the rows of file_path,
    not the rows of files parsed by earlier calls.
    '''
    # only prime has an additional triangle-righttextgroundback
    text_classes = ["triangle-lefttextgroundback", "triangle-leftImageBackground", "triangle-righttextgroundback"]
    # in these bubbles the sender is read from the "<sender>:" prefix of the message
    plain_text_classes = ("triangle-lefttextgroundback", "triangle-righttextgroundback")
    columns = ["datetime", "user", "text_sent", "icon_sent", "media_src", "media_name", "media_type"]

    # using BeautifulSoup to parse the html file
    with open(file_path, 'r', encoding="utf-8") as f:
        soup = BeautifulSoup(f, 'html.parser')

    rows = []
    for text_class in text_classes:
        is_plain_text = text_class in plain_text_classes
        for chat in soup.find_all("div", {"class": text_class}):
            rows.append(_parse_chat_bubble(chat, is_plain_text))

    # keep the module-level list available to code that reads it, but replace
    # its contents so rows from a previous call are not exported again
    whatsappData[:] = rows

    # explicit columns keep the frame well-formed even when no bubble was found
    df = pd.DataFrame(rows, columns=columns)

    clean_data_path = f"data/{name}.xlsx"
    df.to_excel(clean_data_path, index=False)
    print(f"Exported to {clean_data_path}")
    return df


def _parse_chat_bubble(chat, is_plain_text):
    '''
    Turn one chat bubble tag into a row dict.
    chat: the bubble <div> tag
    is_plain_text: True when the sender is the prefix of the message paragraph
    '''
    ## initialise columns (a bubble may lack any of the sub-elements)
    sent_at, user, text_sent, icon_sent = '', '', '', ''

    date_tag = chat.find("p", {"class": "date"})
    if date_tag is not None:
        sent_at = date_tag.text

    user_tag = chat.find("span")
    if user_tag is not None:
        user = user_tag.text  # can be phone number or name

    body_tag = chat.find("p", {"class": False})
    if body_tag is not None:
        text = body_tag.text
        if is_plain_text:
            if ":" in text:
                # the part before the first ':' is the contact number; split
                # only once so colons inside the message itself are kept
                user, _, text_sent = text.partition(':')
            else:  # this happens when text class is right text bg (sent from the right)
                user = "owner"
                text_sent = text
        elif text != user:  # a paragraph that only repeats the user has no useful info
            text_sent = text
        if text_sent:
            text_sent = emoji.demojize(text_sent)
            # store the escaped body of the bytes literal (repr without the
            # leading b' and the closing quote) so the cell only holds ASCII
            text_sent = str(text_sent.encode())[2:-1]

    ## useful to get icon because some messages are empty (e.g. location)
    icon_tag = chat.find("img", src=True)
    if icon_tag is not None:
        icon = icon_tag['src']
        # first two consecutive "/segment" parts of the src; fall back to the
        # whole src instead of failing when it has fewer than two segments
        match = re.search(r'(/[^/]+){2}', icon)
        icon_sent = match.group(0) if match else icon

    link_tag = chat.find("a", href=True)
    media_sent = link_tag['href'] if link_tag is not None else ''
    media_src, media_name, media_type = _split_media_link(media_sent)

    return {
        "datetime": sent_at,
        "user": user,
        "text_sent": text_sent,
        "icon_sent": icon_sent,
        "media_src": media_src,
        "media_name": media_name,
        "media_type": media_type,
    }


def _split_media_link(href):
    '''
    Split a media link into (media_src, media_name, media_type).
    1. the words after the last '/' are the file name
    2. the extension after the last '.' (.jpg, .opus, ...) becomes media_type
       and '%' in the remaining name is changed to ' '
    3. media_src is what follows the last "/Downloads/" of the remaining path
    A part that cannot be found is returned as None.
    '''
    media_src, media_name = _rsplit_once(href, '/')
    media_type = None
    if media_name is not None:
        media_name, media_type = _rsplit_once(media_name, '.')
        media_name = media_name.replace('%', ' ')  # when downloaded to csv blanks become _
    media_src = _rsplit_once(media_src, "/Downloads/")[1]
    return media_src, media_name, media_type


def _rsplit_once(value, sep):
    '''Split value at its last sep into (head, tail); tail is None when sep is absent.'''
    head, found, tail = value.rpartition(sep)
    if not found:
        return value, None
    return head, tail

# Scrape the Naka Emergency chat export; the xlsx file is named after "naka"
df = get_chat_data(file_path=naka, name="naka")

# Code to left join with transcriptions

In [None]:
# take in transcriptions df
import pandas as pd
transcriptions_df = pd.read_csv("data/transcriptions.csv")

## file names may repeat: collapse them to one row per file, joining the
## transcriptions with '; ' and gathering the links into a set
transcriptions_clean = (
    transcriptions_df
    .groupby(['file_name'], as_index=False)
    .agg({
        'transcription': lambda parts: '; '.join(str(part) for part in parts),
        'link_to_file': set,
    })
    # the chat dataframe calls this column media_name
    .rename(columns={'file_name': 'media_name'})
)

# left join: keep every chat row, add the transcription where the name matches
transcribed_df = df.merge(transcriptions_clean, how='left', on='media_name', suffixes=('', ''))
transcribed_df


# Additional optional checks

In [None]:
# code to generate value counts of classes
import re
with open(prime, 'r', encoding="utf-8") as f:
    soup = BeautifulSoup(f, 'html.parser')

# class_=True selects every tag that has a class attribute; pass
# re.compile("triangle") instead to only select classes matching "triangle"
chatBubbles = soup.find_all(class_=True)
dct = {}
n = 0  # not used by the count below
for chat in chatBubbles:
    class_key = str(chat['class'])  # text form of the tag's class list
    dct[class_key] = dct.get(class_key, 0) + 1
print(dct)

'''
Results:
# Verified that excel file has same number of rows as number of date classes
GUPSHUP: {"['content']": 1, "['date']": 10360, "['triangle-lefttextgroundback']": 2568, "['imgdic']": 2568, "['triangle-leftImageBackground']": 7791}
PRIME: {"['content']": 1, "['date']": 16826, "['triangle-leftImageBackground']": 12011, "['triangle-righttextgroundback']": 4, "['imgdic']": 4814, "['triangle-lefttextgroundback']": 4810}
NAKA: {"['content']": 1, "['date']": 207, "['triangle-leftImageBackground']": 156, "['triangle-lefttextgroundback']": 50, "['imgdic']": 50}

# Other important findings:
- messages with pdf sent will not have user
'''