In [123]:
import re

import pandas as pd
import plotly.graph_objects as go
from nltk.corpus import stopwords

# Read messages

In [104]:
# Load the raw chat export, one list entry per line of the file.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale (the chat text contains non-ASCII characters).
# NOTE(review): assumes the export is UTF-8 encoded - confirm for other sources.
with open("/Users/eric/Downloads/_chat.txt", "r", encoding="utf-8") as f:
    # The first two lines of the file are skipped.
    messages = f.readlines()[2:]

# Parse messages

In [126]:
def extract_infos_from_message(message):
    """Split one raw chat line into its date, author and text.

    The line is expected to contain ``[<date>] <author>: <text>``, where the
    author consists only of ASCII letters and whitespace. A line whose author
    contains other characters (accents, digits, punctuation) generally does
    not match and is reported as missing.

    Parameters
    ----------
    message : str
        One raw line of the chat export.

    Returns
    -------
    tuple
        ``(message_date, message_author, message_text)`` as strings when the
        line matches the expected layout; otherwise three float ``NaN``
        values, so the corresponding row can be removed with ``dropna()``.
    """
    regex = re.compile(r"\[(.+)\]\s([A-Za-z\s]+)\:\s(.+)$")

    # ``search`` returns the same first match that ``findall(...)[0]`` did.
    found = regex.search(message)
    if found is None:
        # ``pd.np`` was deprecated in pandas 1.0 and removed in pandas 2.0;
        # a plain float NaN is the same missing-value marker without
        # depending on that alias.
        missing = float("nan")
        return missing, missing, missing

    message_date, message_author, message_text = found.groups()
    return message_date, message_author, message_text

In [143]:
stopwords_fr = stopwords.words("french")

# Set copy of the stop words for constant-time membership tests; the original
# ``stopwords_fr`` name is left untouched for any other code that uses it.
_stopwords_fr_lookup = frozenset(stopwords_fr)

# Punctuation characters that get isolated as standalone tokens.
_punctuation_regex = re.compile(r"([$&+,:;=?@#|\"<>.^*()%!-])")


def nlp_message(message_text):
    """Normalise a message: lowercase it, isolate punctuation, drop stop words.

    Parameters
    ----------
    message_text : str
        Text of a single message.

    Returns
    -------
    str
        The lowercased message in which every listed punctuation character is
        its own token, tokens found in ``stopwords_fr`` are removed, and the
        remaining tokens are joined by exactly one space.
    """
    lowered = message_text.lower()

    # Surround each punctuation character with spaces so it becomes a token.
    spaced = _punctuation_regex.sub(r" \1 ", lowered)

    # ``split()`` without an argument splits on any run of whitespace, so the
    # padding added above never produces empty tokens (which previously left
    # doubled spaces in the result) and tabs/newlines also separate words.
    kept_tokens = [token for token in spaced.split()
                   if token not in _stopwords_fr_lookup]

    return " ".join(kept_tokens)

In [131]:
# Parse every raw line into a (date, author, text) tuple.
messages_parsed = [extract_infos_from_message(raw_line) for raw_line in messages]

# Compute stats

In [107]:
# Build the message table, then drop every row that contains a missing value
# (lines the parser could not split come back as NaN).
parsed_columns = ["timestamp", "author", "message"]
messages_df = pd.DataFrame(messages_parsed, columns=parsed_columns).dropna()

In [108]:
# Preview the first five parsed messages.
messages_df.head(n=5)

Unnamed: 0,timestamp,author,message
0,18/09/2019 17:45:47,Eric Daoud,"Hop là du coup je crée le groupe whatsapp, c’é..."
1,18/09/2019 17:45:51,Eric Daoud,Mais c’est bien aussi
2,18/09/2019 17:46:17,Eric Daoud,Il manque Thomas Cœur et Jules (qui n’a pas wh...
5,18/09/2019 17:50:59,Ana Velasco,Trop bien! J'ai vraiment du mal avec Facebook
7,18/09/2019 17:56:40,Arnaud Mostermans,On en profite pour officiellement souhaiter un...


## Enrich messages

In [109]:
# Derived columns: character count and normalised text of each message.
message_texts = messages_df["message"]
messages_df["message_len"] = message_texts.map(len)
messages_df["message_nlp"] = message_texts.map(nlp_message)

## Graphs

In [149]:
# Layout shared by the figures: fully transparent paper and plot backgrounds.
transparent = "rgba(0,0,0,0)"
layout = go.Layout(paper_bgcolor=transparent, plot_bgcolor=transparent)

In [150]:
# Bar chart of how many messages each author sent, most active first.
data = (
    messages_df
    .groupby("author")["message"]
    .count()
    .sort_values(ascending=False)
)

count_bars = go.Bar(x=data.index, y=data.values)
fig = go.Figure(data=count_bars, layout=layout)

fig.show()

In [151]:
# Bar chart of the mean message length per author, longest first.
data = (
    messages_df
    .groupby("author")["message_len"]
    .mean()
    .sort_values(ascending=False)
)

length_bars = go.Bar(x=data.index, y=data.values)
fig = go.Figure(data=length_bars, layout=layout)

fig.show()