In [None]:
import json
import os
import datetime
import csv
import sys
import re
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from tkinter import *
from pandas.plotting import register_matplotlib_converters
from contact import Contact #importing Contact class from contact.py
import string
import math
import seaborn as sns
import numpy as np
import spacy

In [None]:
# Path to the inbox folder of the Messenger archive data (relative to the working directory)
directory = "messages/inbox/"
# Keep only conversation folders that actually contain a "message_1.json" export:
# os.listdir() also returns stray files (e.g. ".DS_Store") and folders without a
# message file, neither of which can be loaded as a conversation.
# Sorting makes the processing order independent of the file system's listing order.
filenames = sorted(
    entry for entry in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, entry, "message_1.json"))
)

In [None]:
# Build one Contact per qualifying direct-message thread and record, for every
# text message in it, when it was sent and how many words it contained.
name_list = []  # Contact objects, one per analysed conversation
# Read from the same inbox folder that `filenames` was listed from rather than a
# machine-specific absolute path, so the two can never point at different places.
filepath = directory

# A whitespace-separated chunk counts as a "word" if it contains a letter or a digit
word_pattern = re.compile("[a-zA-Z0-9]")

# Loop through all folders
for filename in filenames:
    # Name of file is "message_1.json"
    message_path = os.path.join(filepath, filename, "message_1.json")
    if not os.path.isfile(message_path):
        # Stray files, or folders without an export, cannot be analysed
        continue
    with open(message_path, encoding="utf-8") as f:
        data = json.load(f)

    # Only analyze direct messages and if more than 100 messages sent
    if data["thread_type"] != "Regular" or len(data["messages"]) <= 100:
        continue

    # 'title' is the name of contact
    contact_name = data["title"]
    my_contact = Contact(contact_name, filename)
    name_list.append(my_contact)
    print(my_contact.name)

    for message in data["messages"]:
        # Check if content exists (not vid or photo or shared links)
        # Note that some shared links include message
        if not (message.get("content") and message["type"] == 'Generic'):
            continue

        # Count number of words
        num_words = sum(
            1 for substr in message["content"].split() if word_pattern.search(substr)
        )

        # timestamp_ms is in milliseconds; fromtimestamp() expects whole seconds
        dt_obj = datetime.datetime.fromtimestamp(int(message["timestamp_ms"] / 1000))

        # Messages whose sender is the contact were received; everything else was sent
        if message["sender_name"] == my_contact.name:
            my_contact.add_rcvd_msg_date(dt_obj, num_words)
        else:
            my_contact.add_sent_msg_date(dt_obj, num_words)

# Index of the last contact added (-1 when there is none), kept for any code that reads it
c_num = len(name_list) - 1


In [None]:
# Write data to a CSV: one row per entry returned by each contact's get_dates(),
# holding the contact name, the entry's year and month, and its four counts.
with open('MessageData'+'.csv','w', newline='', encoding='utf-8-sig') as csvFile:
    writer = csv.writer(csvFile)
    hdr = ["Name","Year","Month","Received Messages", "Sent Messages","Received Words","Sent Words"]
    writer.writerow(hdr)
    for contact in name_list:
        msg_data = contact.get_dates()
        msg_counts = contact.get_counts()
        # Pair every date with the counts stored at the same position. Looking the
        # position up by value would rescan the list for each entry and, for equal
        # dates, always land on the first one.
        for pos, entry in enumerate(msg_data):
            counts = msg_counts[pos]
            row = [contact.name, entry.year, entry.month, counts[0], counts[1], counts[2], counts[3]]
            writer.writerow(row)

In [None]:
# Save a lookup table with each contact's name and folder name
reference_path = 'Contact_Folders'+'.csv'
with open(reference_path, 'w', newline='', encoding='utf-8-sig') as refFile:
    folder_writer = csv.writer(refFile)
    # Header first, then one row per contact
    folder_writer.writerow(["Name","Folder"])
    folder_writer.writerows(
        [contact.name, contact.folder_name] for contact in name_list
    )

In [None]:
# Read the exported monthly totals back in and derive the columns used for analysis
df = pd.read_csv('MessageData.csv')
df = df.assign(
    # Timestamp for the first day of each (Year, Month) pair
    Date=pd.to_datetime(df[['Year', 'Month']].assign(DAY=1)),
    **{
        'Total Messages': df['Received Messages'] + df['Sent Messages'],
        'Total Words': df['Received Words'] + df['Sent Words'],
    },
)
# Name/folder table written earlier, indexed by contact name
folders = pd.read_csv('Contact_Folders.csv', index_col = 'Name')

In [None]:

# Monthly received/sent message counts for a single contact

# Contact to plot
name_input = 'Bijily Jayaprakash' #Input name of your choice
contact_rows = df['Name'] == name_input
# Nothing is drawn when the name does not occur in the data
if contact_rows.any():
    # Re-index on month starts so that months without a row show up as 0
    filtered_df = df[contact_rows].set_index('Date').resample('MS').asfreq(fill_value = 0)
    plt.figure()
    for column, legend_label in (('Received Messages', "Received"), ('Sent Messages', "Sent")):
        plt.plot(filtered_df.index, filtered_df[column], label = legend_label)
    plt.xticks(rotation=30)
    plt.legend()
    plt.title(name_input)
    plt.show()

In [None]:
# Bar chart of the n contacts with the highest total message count

n = 10  # number of contacts to show
fig = plt.figure(figsize = [10, 5])
ax = fig.add_axes([0.1,0.2,0.85,0.7])
# Total messages and words per contact. The value columns are selected with a
# list: a bare tuple of column names is not accepted by current pandas, and 'Name'
# is the group key (it becomes the index), so it is not a value column to sum.
grouped_contacts = df.groupby('Name')[['Total Messages', 'Total Words']].sum()
# Largest totals first; head(n) keeps exactly n rows, or all rows when fewer exist
grouped_contacts = grouped_contacts.sort_values(by = 'Total Messages', ascending = False).head(n)
ax.bar(grouped_contacts.index, grouped_contacts['Total Messages'])
ax.set_ylabel("Number of Messages")
ax.set_title("Top " + str(n) + " Most Messaged")
# Contact names are long, so tilt the tick labels and anchor them at their right end
plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
plt.show()

In [None]:
# Plotting messages sent and received over time, summed over all contacts

plt.figure()
# Aggregate only the two plotted columns per date. Summing the whole frame would
# also try to aggregate the text 'Name' column and add up the 'Year'/'Month'
# numbers, none of which is used here.
filtered_df = df.groupby('Date')[['Received Messages', 'Sent Messages']].sum()
plt.plot(filtered_df.index, filtered_df['Received Messages'], label = "Received")
plt.plot(filtered_df.index, filtered_df['Sent Messages'], label = "Sent")
plt.xticks(rotation=30)
plt.legend()
plt.show()

In [None]:
# Word spectrum for a particular person: load the language model and the conversation

nlp = spacy.load('en_core_web_sm')
name_input = 'Bijily Jayaprakash'
# Resolve the conversation folder from the `folders` table (indexed by contact name)
# and the inbox `directory` instead of a machine-specific absolute path.
# Selecting with a list always returns a Series, so the first match is used if
# several contacts share the name; an unknown name raises KeyError.
# NOTE(review): assumes the 'Folder' column holds the inbox sub-folder name that was
# passed to Contact() — confirm against contact.py.
folder_name = os.path.join(directory, folders.loc[[name_input], 'Folder'].iloc[0])
# Messages are read from the folder's "message_1.json" file
with open(os.path.join(folder_name, "message_1.json"), encoding="utf-8") as f:
    data = json.load(f)

word_counts = {} # to store words and count of the words
my_count = 0  #total no: of words sent by me
friend_count = 0  #total no: of words sent to me by friend

In [None]:
# Count, per lemmatised word, how often I and my friend used it in the loaded conversation.
# The tallies are reset here so that re-running this cell does not count every message twice.
word_counts = {}  # word -> [times used by me, times used by friend]
my_count = 0      # total counted words sent by me
friend_count = 0  # total counted words sent to me by friend

for message in data["messages"]:
    # Check if content exists (not vid or photo or shared links). Note that some shared links include message
    if not (message.get("content") and message["type"] == 'Generic'):
        continue

    # A message belongs to the friend when its sender is the chosen contact; otherwise it is mine
    from_friend = message["sender_name"] == name_input

    for token in nlp(message["content"]):
        # Ignore stopwords, punctuation and pure-whitespace tokens (runs of spaces or
        # newlines are tokens too, and would otherwise inflate the word totals)
        if token.is_stop or token.is_punct or token.is_space:
            continue

        word = str(token.lemma_).lower()
        # First sighting of a word starts it at zero uses on both sides
        counts = word_counts.setdefault(word, [0, 0])
        if from_friend:
            friend_count += 1
            counts[1] += 1
        else:
            my_count += 1
            counts[0] += 1

In [None]:
# Build a table with one row per word: raw counts, usage per 1000 words for each
# side, my share of the combined usage, and the 0-9 spectrum bin derived from it.
# Building from explicit records keeps the three columns even when no word was counted.
words_data = pd.DataFrame(
    [(word, counts[0], counts[1]) for word, counts in word_counts.items()],
    columns=['word', 'me', 'friend'],
)

# Keep purely alphabetic words (drops emoji and other symbol tokens). copy() gives an
# independent frame, so the column assignments below do not write into a slice.
words_data = words_data[words_data['word'].str.isalpha()].copy()
# Usage per 1000 words; max(..., 1) avoids dividing by zero when one side sent no words
words_data['my_norm'] = words_data['me']/max(my_count, 1)*1000
words_data['friend_norm'] = words_data['friend']/max(friend_count, 1)*1000
# 0.0 = used only by my friend, 1.0 = used only by me
words_data['my_prop'] = words_data['my_norm']/(words_data['my_norm']+words_data['friend_norm'])
# Ten bins numbered 0-9. A proportion of exactly 1.0 would floor to 10, so it is
# clipped into the top bin instead of falling outside the 0-9 range.
words_data['prop_bin'] = np.floor(words_data['my_prop']*10).clip(upper=9)

# Filter: keep words that at least one side uses more than once per 1000 words
words_data = words_data[(words_data['friend_norm'] > 1) | (words_data['my_norm'] > 1)].copy()
words_data['total'] = words_data['me']+words_data['friend']
# Most used words first
words_data = words_data.sort_values(by = ['total'], ascending = [False])


In [None]:
# Preview the first rows of the word table
words_data.head()

In [None]:
# Visualising word spectrum

# One heatmap row per 'prop_bin' value 0-9, labelled with the first five words of
# that bin in the table's current row order (comma separated; empty if the bin has none).
bins = [
    ','.join(words_data.loc[words_data['prop_bin'] == bin_id, 'word'].head(5).tolist())
    for bin_id in range(10)
]

# The bin number is the cell value, so the colour encodes the position on the spectrum
summ_df = pd.DataFrame({'Val': list(range(10))}, index=pd.Index(bins, name='Words'))
#matplotlib.rcParams['font.sans-serif'] = ['Source Han Sans TW', 'sans-serif']
fig = plt.figure(figsize = (13,5))
ax = fig.add_axes([0.22,0.1,0.85,0.7])
# Draw on the axes created above explicitly instead of relying on the current axes
sns.heatmap(summ_df, ax=ax)

plt.show()