# Initial Setup

In [None]:
from bs4 import Tag, NavigableString, BeautifulSoup
import os
from datetime import datetime
from natsort import os_sorted
import pandas as pd
from IPython.display import display, HTML
import matplotlib as plt
import matplotlib.dates as mdates

In [None]:
# Build the list of exported .html message files, ordered first to last message

folder = "c:/Users/DANKINAH/Downloads/Personal/Messages/"
#folder = "c:/Users/Daniel/Documents/Archive/Facebook Data/messages/inbox/FREENEXTIDE_EbQwaw_iLQ/"

# Keep only the .html entries of `folder`, each joined into a full path.
file_list = [
    os.path.join(folder, filename)
    for filename in os.listdir(folder)
    if filename.endswith('.html')
]

# Sort the paths the way the OS file browser would, then flip the order
# (presumably the export numbers its files newest-first -- confirm).
file_list = list(reversed(os_sorted(file_list)))
print(file_list)

In [None]:
# Grab messages from files and place in a sorted list.
# Every entry of message_list is a dict with the keys 'sender', 'content_text',
# 'content_media' and 'timestamp'.

message_list = []

for filename in file_list:
	with open(filename, encoding='utf-8') as f:
		soup = BeautifulSoup(f, 'html.parser')
	cur_message_list = []
	# Select the elements carrying this CSS class string
	# (presumably one element per message in the export markup -- confirm).
	messages = soup.find_all(class_='pam _3-95 _2pi0 _2lej uiBoxWhite noborder')
	# The first match is skipped (presumably a header, not a message -- confirm).
	for message in messages[1:]:
		#Types of messages: text with embeds, one or many media items
		# Child 0 of the element holds the sender, child 1 the body, child 2 the timestamp.
		sender = message.contents[0].string

		# Accumulators for this message: flattened text plus any media/reaction entries.
		content_text = ""
		content_media = {
			'images':[],
			'videos':[],
			'audio':[],
			'reactions':[]
		}

		# len() of a Tag is its number of direct children; the body wrapper's child
		# count decides the parsing path. A wrapper with fewer than 4 children
		# matches neither branch, leaving the text and media empty.
		if len(message.contents[1].contents[0]) == 4: #Regular message
			# The text lives in the wrapper's second child; walk its pieces in order.
			for item in message.contents[1].contents[0].contents[1].contents:
				if isinstance(item, Tag):
					if item.name == 'br':
						content_text += '\n'
					elif item.name=='a':
						# Links are kept as their target URL.
						content_text += item['href']
					elif item.name=='img':
						# Inline images go into the text and into the image list.
						content_text += item['src']
						content_media['images'].append(item['src'])
					elif item.name=='span': #Arabic text
						content_text += item.string
					else:
						# Unrecognised tag: report it and bail out via exit(1).
						print("Unknown embedded item name: " + item.name)
						exit(1)

				elif isinstance(item, NavigableString):
					content_text += item
				else:
					# Only reported; processing continues with the next piece.
					print("Unknown object type in content: " + str(type(item)))

		elif len(message.contents[1].contents[0]) > 4: #One or more media
			for item in message.contents[1].contents[0].contents:
				# Children without any content of their own are ignored.
				if len(item.contents) > 0:
					# The first test that matches classifies the child; only the
					# first img/video/audio found inside it is recorded.
					if item.find('img'):
						content_media['images'].append(item.find('img')['src'])
					elif item.find('video'):
						content_media['videos'].append(item.find('video')['src'])
					elif item.find('audio'):
						content_media['audio'].append(item.find('audio')['src'])
					elif item.find('ul'):
						# Each child of the list is stored as one reaction entry.
						for reaction in item.find('ul').contents:
							content_media['reactions'].append(reaction.string)

					elif isinstance(item.string, NavigableString): #attached message
						content_text += item.string
					elif isinstance(item, Tag): #attached message with embeds
						# Here only plain text and <br> are accepted inside the child.
						for atom in item.contents:
							if isinstance(atom, Tag):
								if atom.name == 'br':
									content_text += '\n'
								else:
									print("Unknown embedded item name (multi-media): " + str(atom.name))
									print(atom)
									exit(1)
							elif isinstance(atom, NavigableString):
								content_text += atom
							else:
								print("Unknown object type in content (multi-media): " + str(type(atom)))
					else:
						print("Unknown content item: " + str(type(item)))
						exit(1)

		# Timestamp text must match day, abbreviated month name, year, then 24h time,
		# e.g. '05 Jan 2021, 14:30'; any other format raises ValueError.
		timestamp = datetime.strptime(str(message.contents[2].string), '%d %b %Y, %H:%M')

		message_dict = {
			'sender': sender, 
			'content_text':content_text, 
			'content_media':content_media, 
			'timestamp': timestamp
		}
		cur_message_list.append(message_dict)

	# Flip this file's messages before appending them to the overall list
	# (presumably each file lists newest first, so this gives oldest first -- confirm).
	cur_message_list.reverse()
	message_list += cur_message_list
	# Progress output: file just processed and running total of messages.
	print(filename, len(message_list))

# Run each of the cells below individually

In [None]:
#Print nicknames for each member
#
# Scans message_list for nickname-change notices and collects, per member,
# every nickname that member was given, in message order.

nick = ['set the nickname for', 'set his own nickname', 'set your nickname']

nicknames = dict()

for message_dict in message_list:
    content_text = message_dict['content_text']
    if not content_text or 'nickname' not in content_text:
        continue

    if nick[0] in content_text:
        # "<sender> set the nickname for <name> to <nickname>."
        # partition() splits on the first ' to ' only, so a nickname that
        # itself contains ' to ' is kept whole.
        target, sep, nickname = content_text.split(nick[0], 1)[1].partition(' to ')
        name = target[1:]  # drop the space that follows the phrase
    elif nick[1] in content_text or nick[2] in content_text:
        # NOTE(review): for both phrases the nickname is attributed to the
        # sender, as the original code did -- confirm this is intended for
        # 'set your nickname'.
        _, sep, nickname = content_text.split('nickname', 1)[1].partition(' to ')
        name = message_dict['sender']
    else:
        # Mentions 'nickname' but is not a "set" notice (e.g. a cleared
        # nickname or ordinary chat): skip it rather than reuse stale values
        # left over from a previous iteration.
        continue

    if not sep:
        # The phrase matched but no ' to <nickname>' part follows.
        continue

    nickname = nickname[:-1]  # drop the trailing full stop of the notice
    # setdefault records the first nickname of a member as well as later ones.
    nicknames.setdefault(name, []).append(nickname)

print('Nicknames of each member')
for name, given in nicknames.items():
    print(f"{name}: {', '.join(given)}\n")

In [None]:
#Print nicknames for each member grouped by day
#
# Collects one record per nickname-change notice in message_list and shows,
# for every calendar day, the nicknames that were set on that day.

nick = ['set the nickname for', 'set his own nickname', 'set your nickname']

nicknames = []

for message_dict in message_list:
    content_text = message_dict['content_text']
    if not content_text or 'nickname' not in content_text:
        continue

    if nick[0] in content_text:
        # "<sender> set the nickname for <name> to <nickname>."
        # partition() splits on the first ' to ' only, so a nickname that
        # itself contains ' to ' is kept whole.
        target, sep, nickname = content_text.split(nick[0], 1)[1].partition(' to ')
        name = target[1:]  # drop the space that follows the phrase
    elif nick[1] in content_text or nick[2] in content_text:
        _, sep, nickname = content_text.split('nickname', 1)[1].partition(' to ')
        name = message_dict['sender']
    else:
        # Mentions 'nickname' but is not a "set" notice: skip it instead of
        # appending a duplicate of the previous record.
        continue

    if not sep:
        # The phrase matched but no ' to <nickname>' part follows.
        continue

    nicknames.append({
        'name': name,
        'nickname': nickname[:-1],  # drop the trailing full stop of the notice
        'datetime': message_dict['timestamp'],
    })

df = pd.DataFrame(nicknames)

print('Nicknames grouped by day')
if df.empty:
    # Without rows there is no 'datetime' column to group on.
    print('(no nickname changes found)')
else:
    # One row per calendar date, nicknames of that date joined with ', '.
    df = df.groupby(df.datetime.dt.date)['nickname'].apply(', '.join).to_frame()
    display(HTML(df.to_html()))

In [None]:
#Print amount each member has set a nickname
#
# Counts, per sender, the messages whose text contains one of the
# nickname-setting phrases in `nick`.

nick = ['set the nickname for', 'set his own nickname', 'set your nickname']

members = dict()

for message_dict in message_list:
    content_text = message_dict['content_text']
    if not content_text:
        continue
    if any(x in content_text for x in nick):
        name = message_dict['sender']
        # Start from 0 and add this occurrence, so a sender's first match
        # counts as 1 (it was previously recorded as 0).
        members[name] = members.get(name, 0) + 1

print('Amount each member has set a nickname')
display(members)

In [None]:
#Print amount each member has said a certain word
#
# Counts, per sender, the messages whose text contains `word` as a
# case-sensitive substring (a message counts once, however often it
# repeats the word).

word = 'friend'

members = dict()

for message_dict in message_list:
    content_text = message_dict['content_text']
    if not content_text:
        continue
    if word in content_text:
        name = message_dict['sender']
        # Start from 0 and add this occurrence, so a sender's first match
        # counts as 1 (it was previously recorded as 0).
        members[name] = members.get(name, 0) + 1

print('Word frequency of: ' + word)
display(members)

In [None]:
#Print amount each word has been said
# (marked NOTWORKING by the author; the reason is not recorded here)

# Messages containing any of these phrases are left out of the count
# (presumably system-generated notices rather than chat -- confirm).
skip = ('the group.', 'the group photo.', 'set the emoji to', 'Click for video:', 'cleared the nickname for', 'cleared your nickname', 'points playing basketball.', 'Click for audio')
# Common filler words that are not counted (exact, case-sensitive match).
prepositions = ('i', 'I', 'the', 'to', 'you', 'it', 'a', 'is', 'and', 'u', 'my', "I'm", 'for', 'that', 'not', 'are', 'we', 'me', 'just', 'so', 'No', 'no', 'of', "don't", 'in', 'im', 'on')

words = {}

for message_dict in message_list:
    content_text = message_dict['content_text']
    # Nothing to count for empty messages or messages holding a skip phrase.
    if not content_text or any(x in content_text for x in skip):
        continue
    # Whitespace-separated tokens are counted as they appear in the text.
    for word in content_text.split():
        if word in prepositions:
            continue
        words[word] = words.get(word, 0) + 1

print('Word frequencies')
# Shown in ascending order of count, so the most frequent words come last.
display(dict(sorted(words.items(), key=lambda pair: pair[1])))

In [None]:
#Plot how many messages each listed member sent per month

plt.rcParams["figure.figsize"] = (16,5)

df = pd.DataFrame(message_list)

members = ['Robby Royston', 'Erik Porteous', 'Nick Cornell', 'Tristan Turigan', 'Daniel Kinahan', 'Alex Staszak', 'Ben Wood', 'Caelan Rae-Oulette', 'Theo Mohamed', 'Francois Jolicoeur']

# Month of every message in the full frame; used as the grouping key for
# each member's subset of rows.
message_month = df.timestamp.dt.to_period('M')

for member in members:
    member_messages = df[df['sender'] == member]
    monthly_counts = member_messages.groupby(message_month)['sender'].count()
    # ax ends up holding the axes returned by the last plot call.
    ax = monthly_counts.plot(x_compat=True)

# One major tick per month, labelled like '2021-Jan'.
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%b'))
# Legend labels are given in the same order as the members list.
ax.legend(members)

In [None]:
# Phrase groups, kept separate by what they refer to
# (presumably used to recognise system notices -- confirm against later cells).
nickname_strings = (
    'cleared the nickname for',
    'cleared your nickname',
    'set the nickname for',
    'set your nickname',
    'set his own nickname',
)
group_strings = (
    'the group.',
    'the group photo.',
    'set the emoji to',
    'named the group',
)
misc_strings = (
    'points playing basketball.',
    'Click for audio',
    'Click for video',
    'Plan created:',
)
# All phrases in one flat tuple: nickname, then group, then misc.
skip_strings = (*nickname_strings, *group_strings, *misc_strings)