In [None]:
import os
import re
import pandas as pd
import itertools
from datetime import datetime
import time
from matplotlib import pyplot as plt

In [None]:
def raw_to_df(file):
    """Parse an exported WhatsApp chat text file into a DataFrame.

    Every line of the export is classified as one of:

    * the start of a regular message (``<date>, <hh:mm> - <sender>: <text>``),
    * a group event (someone was added/removed, left, created the group, ...),
    * the WhatsApp end-to-end encryption notice, or
    * a continuation of the message currently being accumulated.

    Header lines are split into their parts by ``extract_msg``; continuation
    lines are appended verbatim to the pending message text.

    Parameters
    ----------
    file : str or path-like
        Path of the exported chat. A bare filename is enough when the file
        sits in the same folder as the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per message/event with the columns 'date', 'time', 'sender'
        and 'message'. An input without any text yields an empty frame.
    """
    # Shared "<date>, <hh:mm> - " prefix of every header line
    stamp = r"^\d+/\d+/\d+, \d\d:\d\d - "

    # A regular message needs "sender: " (colon followed by a space): that is
    # exactly what extract_msg splits on. Requiring only a bare colon would
    # route lines such as "<stamp> - Bob changed the description to http://x"
    # here and make extract_msg fail on them.
    beginning_pattern = re.compile(stamp + r".*: ")
    misc_pattern = re.compile(
        stamp + r".* (?:removed|added|created group|joined using|left|changed the)"
    )
    whatsapp_pattern = re.compile(stamp + r"Messages and calls are end-to-end encrypted")

    # Checked in this order; the dict holds the flags forwarded to extract_msg
    header_kinds = (
        (beginning_pattern, {"miscEvent": False}),
        (misc_pattern, {"miscEvent": True}),
        (whatsapp_pattern, {"miscEvent": True, "WhatsAppEvent": True}),
    )

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end: DataFrame.append was removed in pandas 2.0 and copying the
    # whole frame for every message is quadratic.
    records = []

    # Placeholders, only ever recorded for text that precedes the first header
    date = datetime.now()
    time_of_day = datetime.now()
    sender = ""
    msg = ""

    # utf-8-sig transparently drops a leading byte-order mark, which would
    # otherwise keep the very first line from matching the "^\d+" anchor
    with open(file, 'r', encoding='utf-8-sig') as f:
        for line in f:
            flags = next(
                (kwargs for pattern, kwargs in header_kinds if pattern.match(line)),
                None,
            )
            if flags is None:
                # not a header line, so it continues the pending message
                msg = msg + line
                continue

            if msg:
                # a new header closes the message accumulated so far
                records.append({'date': date, 'time': time_of_day, 'sender': sender, 'message': msg})
            date, time_of_day, sender, msg = extract_msg(line, **flags)

    # record the leftover message, if there is one (an empty file has none)
    if msg:
        records.append({'date': date, 'time': time_of_day, 'sender': sender, 'message': msg})

    return pd.DataFrame(records, columns=['date', 'time', 'sender', 'message'])
    
def extract_msg(line, debug=False, miscEvent=False, WhatsAppEvent=False):
    """Split one header line of a WhatsApp export into its components.

    Parameters
    ----------
    line : str
        A line starting with ``<date>, <hh:mm>`` followed by `` - `` and
        either ``<sender>: <text>``, a group event or a WhatsApp notice.
    debug : bool
        Unused; kept so existing calls keep working.
    miscEvent : bool
        True when the line is a group event ("<sender> added ...",
        "<sender> left", ...) rather than a regular message.
    WhatsAppEvent : bool
        True when the line is a notice from WhatsApp itself; the sender is
        then reported as 'WhatsApp Misc'.

    Returns
    -------
    tuple
        ``(date, time, sender, message)`` where ``date``/``time`` are
        ``datetime.date``/``datetime.time`` objects and ``message`` is the
        rest of the line (including its line ending, if any).

    Raises
    ------
    AttributeError
        If the line does not start with a timestamp, or no sender can be
        located (``re.match`` returned ``None``).
    ValueError
        If the timestamp does not fit the inferred date format.
    """
    # timestamp prefix, e.g. "24/12/2020, 18:05" or "12/24/20, 18:05"
    datetime_pattern = r"\d+/\d+/\d+, \d\d:\d\d"
    datetime_str = re.match(datetime_pattern, line).group(0)

    # infer the date convention from the width of the year:
    # four digits -> day/month/year, otherwise month/day/two-digit-year
    if re.match(r"\d+/\d+/\d\d\d\d", line):
        dtime = datetime.strptime(datetime_str, '%d/%m/%Y, %H:%M')
    else:
        dtime = datetime.strptime(datetime_str, '%m/%d/%y, %H:%M')

    # everything after the timestamp, starting with the " - " separator
    nodatetime = line[len(datetime_str):]

    if WhatsAppEvent:
        # only the 3-character " - " separator has to be skipped
        return dtime.date(), dtime.time(), 'WhatsApp Misc', nodatetime[3:]

    if miscEvent:
        # " - <sender> <first four letters of the event verb>".
        # The verbs share ONE lazy prefix so the EARLIEST verb on the line
        # ends the sender. With a separate lazy pattern per verb, the first
        # listed verb found anywhere on the line would win, e.g. the "left"
        # inside a quoted group subject of a "changed the subject" event.
        author_pattern = r" - .*? (?:crea|join|left|chan|remo|adde)"
        author = re.match(author_pattern, nodatetime).group(0)
        # the 4 matched verb letters belong to the message, not the sender
        msg = nodatetime[len(author) - 4:]
        # strip the leading " - " and the trailing " <4 verb letters>"
        sender = author[3:len(author) - 5]
    else:
        # " - <sender>: " with the shortest possible sender, so colons inside
        # the message text are left alone
        author_pattern = r" - .*?: "
        author = re.match(author_pattern, nodatetime).group(0)
        msg = nodatetime[len(author):]
        # strip the leading " - " and the trailing ": "
        sender = author[3:len(author) - 2]

    return dtime.date(), dtime.time(), sender, msg

In [None]:
# convert the raw txt file to a pandas dataframe + track how long it takes
# (perf_counter is a monotonic, high-resolution timer, so the measurement is
# not affected by system clock adjustments the way time.time() can be)
start_time = time.perf_counter()
chat = raw_to_df("chat.txt")
elapsed_seconds = time.perf_counter() - start_time
print("it took %.3f seconds to process %d messages" % (elapsed_seconds, chat.shape[0]))

In [None]:
# Save the parsed chat next to the notebook: first as a spreadsheet, then as CSV
chat.to_excel(excel_writer="chat.xlsx")
chat.to_csv(path_or_buf="chat.csv")

In [None]:
# Remove the rows attributed to WhatsApp itself and to 'You'
# (usually only one or two messages each)
is_whatsapp_row = chat["sender"] == 'WhatsApp Misc'
is_you_row = chat["sender"] == 'You'
idxWAM = chat[is_whatsapp_row].index
idxYou = chat[is_you_row].index
chat.drop(index=idxWAM, inplace=True)
chat.drop(index=idxYou, inplace=True)

In [None]:
# Horizontal bar chart: number of messages per sender, smallest count first
messages_per_sender = chat["sender"].value_counts().sort_values()
bar_ax = messages_per_sender.plot.barh(figsize=(5,5))
bar_ax.set_xlabel("Number of Messages")
bar_ax.set_ylabel("Sender")
bar_ax.grid()
plt.show()

In [None]:
# Pie chart: each sender's share of all messages, with a legend beside the pie
sender_shares = chat["sender"].value_counts().sort_values()
pie_ax = sender_shares.plot.pie(autopct='%1.1f%%', figsize=(10,10), colormap="rainbow")
# blank out the axis labels pandas would otherwise show
pie_ax.set_ylabel("")
pie_ax.set_xlabel("")
pie_ax.legend(title="Senders", loc='center left', bbox_to_anchor=(1.1, 0.5))
plt.show()