In [1]:
# ********** User Inputs ****************************************************************** #
# Path of the exported Messenger conversation JSON to analyse.
file_name = "BenKnight/message.json"
# tz database name of the user's local timezone.
timezone = "America/New_York"
# First day of "dating year" 1, formatted YYYY-MM-DD.
anniversary_date = "2017-07-12"

# Import Statements

In [2]:
# ********** Import Statements ************************************************************ #
import pandas as pd
import numpy as np
import json
import pytz
import re

# Program Sketch

**Goal**: To systematically gather statistics from Facebook Messenger data — sentiment analysis, occurrence of specific words, and the length / quantity / complexity of conversations / messages / words per user — to extract a personal ‘lexicon’, and potentially to use machine learning to craft an “AI-built conversation” between the two parties.

**Super High Level**: JSON goes in, neat stuff gets found, data comes out.

**High Level Task List**:
1. Pull messenger data from Facebook and convert into a DataFrame.
2. Clean up the DataFrame so it can be interacted with easily.
3. Parse DataFrame to show interesting inferences, like:
    * frequency of words used over time.
    * personal vocabulary between two people (relationship lingua franca #thanksnick)
    * what does a 'conversation' look like?
    * AI-generated "conversation" between both people.
4. Visualize the interesting data in pixel/scarf-esque patterns.
5. Export the data in ~some format~ so it can feed into KnitForKnat.

---
**Functions Needed**:
1. Parsing Data
    * Setting the local timezone
    * Pulling the names
2. Splicing Data Sets
    * By user  (separate one person from another, perhaps group chats in future, use \*args)
    * By time  (hour, day, week, month, year for analysis, comparison)
    * By value (...)
3. Counts/Calculations (combined + per person):
    * How many words / messages / conversations were there?
    * How many times did person X say word W? (count)
    * How often did person X say word W compared to person Y? (ratio)
    * How often did person X say word W relative to all the words they say? (respective ratio)


# Functions

In [3]:
# ********** Functions ******************************************************************** #
# Convert file name of FB Conversation json to a DataFrame.
def convert_filename_to_df(file_name):
    """Load a Messenger conversation JSON file into a DataFrame.

    Parameters
    ----------
    file_name : str
        Path to the conversation JSON file.

    Returns
    -------
    pandas.DataFrame
        Built from the entries under the file's top-level ``'messages'`` key.
    """
    with open(file_name) as export_file:
        export = json.load(export_file)

    # Only the 'messages' entries are tabulated; every other top-level key
    # in the file is ignored.
    return pd.DataFrame(list(export['messages']))

In [4]:
# Set to local timezone of user
def utc_to_local(utc_dt):
    """Convert a UTC datetime into the timezone named by the global ``timezone``.

    Parameters
    ----------
    utc_dt : datetime-like
        A moment expressed in UTC. Its ``tzinfo`` is overwritten with UTC
        before conversion, whatever it was set to.

    Returns
    -------
    datetime-like
        The same moment as a timezone-aware value in the local zone.
    """
    target_zone = pytz.timezone(timezone)
    # Stamp the value as UTC first, then shift it into the target zone.
    aware_utc = utc_dt.replace(tzinfo=pytz.utc)
    return target_zone.normalize(aware_utc.astimezone(target_zone))

In [5]:
# 0 means 'before start of dating'
# 1 means 'first dating year'
# 2 means 'second dating year', and so on.

def determine_dating_year(date_col):
    """Bucket timestamps into "dating years" counted from ``anniversary_date``.

    Parameters
    ----------
    date_col : pandas.Series
        Timezone-aware timestamps (they are subtracted from a tz-aware
        anniversary, so naive values will not work).

    Returns
    -------
    pandas.Series
        Integer year numbers aligned with ``date_col``: 0 for anything before
        the anniversary, 1 for the first 365 days from it, 2 for the next 365,
        and so on.

    Notes
    -----
    Reads the module-level ``anniversary_date`` and ``timezone`` settings.
    """
    # Interpret the anniversary as local midnight in the configured timezone.
    # Localizing directly avoids a hard-coded UTC offset, which is only valid
    # for one timezone (and one DST state).
    dateyear_start = pd.Timestamp(anniversary_date).tz_localize(timezone)

    # Whole days elapsed since the anniversary. `.dt.days` floors, so any
    # moment before the anniversary yields a negative day count.
    days_elapsed = (date_col - dateyear_start).dt.days

    # A dating year is a fixed 365-day block (leap days are not accounted for).
    return (days_elapsed // 365) + 1

In [6]:
# Clean up the dataframe in order to parse going forward
def clean_dataframe(df, year_start='default'):
    """Return a cleaned, chronologically sorted copy of the raw message frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw messages; must contain ``timestamp_ms`` (epoch milliseconds) and
        ``content`` columns. The input frame is not modified.
    year_start : {'default', 'anniversary'}
        How ``date_year`` is computed: ``'default'`` uses the calendar year of
        the local date, ``'anniversary'`` delegates to
        ``determine_dating_year``.

    Returns
    -------
    pandas.DataFrame
        The sorted frame with ``content`` lowercased (missing values become
        ``''``) and three added columns: ``date`` (tz-aware, in the
        module-level ``timezone``), ``date_year`` and ``word_count``.

    Raises
    ------
    ValueError
        If ``year_start`` is not one of the supported modes.
    """
    if year_start not in ('default', 'anniversary'):
        raise ValueError(
            "year_start must be 'default' or 'anniversary', got %r" % (year_start,)
        )

    # sort_values returns a new frame (oldest message first), so the caller's
    # raw data stays untouched by the column assignments below.
    cleaned = df.sort_values('timestamp_ms')

    # Lowercase for easier parsing; entries without text become ''.
    cleaned['content'] = cleaned['content'].str.lower().fillna('')

    # Epoch milliseconds -> UTC -> local timezone, vectorized over the column.
    utc_dates = pd.to_datetime(cleaned['timestamp_ms'], unit='ms').dt.tz_localize('UTC')
    cleaned['date'] = utc_dates.dt.tz_convert(timezone)

    if year_start == 'default':
        cleaned['date_year'] = cleaned['date'].dt.year
    else:
        cleaned['date_year'] = determine_dating_year(cleaned['date'])

    # Number of \w+ runs in each message.
    cleaned['word_count'] = cleaned['content'].str.count(r'\w+')
    return cleaned

In [7]:
def person(name):
    """Return the rows of the global ``chat_df`` sent by ``name``.

    Parameters
    ----------
    name : str
        Value to match exactly against the ``sender_name`` column.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows, or ``None`` (after printing the available sender
        names) when ``name`` does not occur in the frame.

    Notes
    -----
    Assumes ``chat_df`` has already been created at module level.
    """
    sent_by_name = chat_df['sender_name'] == name
    if sent_by_name.any():
        return chat_df[sent_by_name]

    unique_names = chat_df['sender_name'].unique()
    # One pre-formatted string inside print(...) prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print('%s isn\'t in the DataFrame. \nTry one of these: %s'
          % (name, ', '.join(map(str, unique_names))))
    return None

# Parsing the Messenger Data

In [8]:
# Load the raw export, then derive the cleaned frame used by every later cell.
# Dating years are counted from the anniversary rather than the calendar year.
chat_df_raw = convert_filename_to_df(file_name)
chat_df = clean_dataframe(
    chat_df_raw,
    year_start='anniversary',
)

# Grouping By

In [11]:
# Persist the cleaned conversation as UTF-8 CSV in the working directory.
chat_df.to_csv(
    'BenKnightConvo.csv',
    encoding='utf-8',
)

In [27]:
# Great. Now we have a working DataFrame.
# What's the date range we're working with?
# A single pre-formatted string inside print(...) gives the same output under
# the Python 2 print statement and the Python 3 print function.
print('%s to %s' % (chat_df['date'].min().date(), chat_df['date'].max().date()))

2017-07-06 to 2018-07-30


In [68]:
# Aggregation spec for groupby(...).agg(s): count the non-null entries of
# every column, except word_count, which is summed.
# Index.difference is the explicit set difference; `columns - [...]` is not
# treated as a set operation by current pandas.
s = {x: 'count' for x in chat_df.columns.difference(['word_count'])}
s['word_count'] = 'sum'

s

{u'audio_files': 'count',
 u'call_duration': 'count',
 u'content': 'count',
 'date': 'count',
 'date_year': 'count',
 u'gifs': 'count',
 u'missed': 'count',
 u'photos': 'count',
 u'reactions': 'count',
 u'sender_name': 'count',
 u'share': 'count',
 u'sticker': 'count',
 u'timestamp_ms': 'count',
 u'type': 'count',
 u'videos': 'count',
 'word_count': 'sum'}

In [71]:
# Per-person totals within each dating year: group on (date_year, sender_name),
# apply the aggregation spec `s`, then order the columns by label.
(
    chat_df
    .groupby(['date_year', 'sender_name'])
    .agg(s)
    .sort_index(axis=1)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,audio_files,call_duration,content,date,date_year,gifs,missed,photos,reactions,sender_name,share,sticker,timestamp_ms,type,videos,word_count
date_year,sender_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,Ben Knight,0,0,219,219,219,0,0,0,0,219,0,0,219,219,0,3244
0,Cassie Beth,0,0,361,361,361,0,0,8,0,361,5,2,361,361,0,6069
1,Ben Knight,0,4,13621,13621,13621,139,4,231,91,13621,115,141,13621,13621,5,109548
1,Cassie Beth,6,13,26158,26158,26158,140,9,1714,11,26158,268,157,26158,26158,106,228105
2,Ben Knight,0,0,949,949,949,2,0,35,11,949,17,16,949,949,0,6972
2,Cassie Beth,5,1,1819,1819,1819,8,0,153,1,1819,17,14,1819,1819,7,13147


In [58]:
# Same (date_year, sender_name) aggregation as the cell above, run again.
(
    chat_df
    .groupby(['date_year', 'sender_name'])
    .agg(s)
    .sort_index(axis=1)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,audio_files,call_duration,content,date,date_year,gifs,missed,photos,reactions,sender_name,share,sticker,timestamp_ms,type,videos,word_count
date_year,sender_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,Ben Knight,0,0,219,219,219,0,0,0,0,219,0,0,219,219,0,3244
0,Cassie Beth,0,0,361,361,361,0,0,8,0,361,5,2,361,361,0,6069
1,Ben Knight,0,4,13621,13621,13621,139,4,231,91,13621,115,141,13621,13621,5,109548
1,Cassie Beth,6,13,26158,26158,26158,140,9,1714,11,26158,268,157,26158,26158,106,228105
2,Ben Knight,0,0,949,949,949,2,0,35,11,949,17,16,949,949,0,6972
2,Cassie Beth,5,1,1819,1819,1819,8,0,153,1,1819,17,14,1819,1819,7,13147


In [59]:
# Totals per sender across the whole conversation, columns ordered by label.
(
    chat_df
    .groupby(['sender_name'])
    .agg(s)
    .sort_index(axis=1)
)

Unnamed: 0_level_0,audio_files,call_duration,content,date,date_year,gifs,missed,photos,reactions,sender_name,share,sticker,timestamp_ms,type,videos,word_count
sender_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Ben Knight,0,4,14789,14789,14789,141,4,266,102,14789,132,157,14789,14789,5,119764
Cassie Beth,11,14,28338,28338,28338,148,9,1875,12,28338,290,173,28338,28338,113,247321


In [60]:
# Totals per dating year for both senders combined, columns ordered by label.
(
    chat_df
    .groupby(['date_year'])
    .agg(s)
    .sort_index(axis=1)
)

Unnamed: 0_level_0,audio_files,call_duration,content,date,date_year,gifs,missed,photos,reactions,sender_name,share,sticker,timestamp_ms,type,videos,word_count
date_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,0,580,580,580,0,0,8,0,580,5,2,580,580,0,9313
1,6,17,39779,39779,39779,279,13,1945,102,39779,383,298,39779,39779,111,337653
2,5,1,2768,2768,2768,10,0,188,12,2768,34,30,2768,2768,7,20119


In [None]:
"/Users/clebauer/Downloads/facebook-cassielebauer/messages/inbox"