# Import Statements

In [None]:
import pandas as pd
import json
from collections import Counter
import string
import os

# Program Sketch

**Goal**: To determine the unique set of words that two given people use together (that a given person rarely uses with other people).

**Super High Level**: Find all conversations, scrape one conversation's JSON for the base-case words, scrape the rest of the JSONs to determine the core set of common words, then pull the words at the tail of the frequency chart (1 or 2 standard deviations out?).

**High Level Task List**:
1. Pull messenger data for a given person and convert into a DataFrame.
2. Clean up the DataFrame so it can be interacted with easily.
3. Parse words in DataFrame for the user to create initial data set of words.
4. Repeat this process for 10 other people, to start.
5. Determine which words show up in 10/10, 9/10, etc. of the cases.
6. Test this on 10 other people, 100 other people, all cases (if time allows)

# Functions

In [None]:
def clean_dataframe(df):
    """Prepare a raw messages DataFrame for word parsing.

    Sorts the rows oldest-first by ``timestamp_ms``, guarantees a lowercase
    ``content`` column with no missing values, and adds a ``date`` column
    converted from ``timestamp_ms`` (epoch milliseconds).

    Args:
        df: DataFrame of messages; must contain a ``timestamp_ms`` column.

    Returns:
        The same DataFrame object, modified in place.
    """
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    df.sort_values('timestamp_ms', inplace=True)

    # A conversation without any text message has no 'content' column at
    # all; give it an empty one so later steps can rely on the column.
    if 'content' not in df.columns:
        df['content'] = ''

    # Lowercase for easier parsing, then blank out missing messages.
    # Assigning the result back (instead of an in-place fillna on the
    # selected column) makes sure the frame itself is updated.
    df['content'] = df['content'].str.lower().fillna('')

    df['date'] = pd.to_datetime(df['timestamp_ms'], unit='ms')
    return df

In [None]:
def convert_filename_to_df(file_name):
    """Read a JSON file and return its ``messages`` entries as a DataFrame.

    Args:
        file_name: path of a JSON file whose top level has a ``messages`` key.

    Returns:
        A DataFrame built from the items under ``messages``.
    """
    with open(file_name) as handle:
        payload = json.load(handle)

    # Each item under 'messages' becomes one row.
    return pd.DataFrame(list(payload['messages']))

In [None]:
def person(df, name):
    """Return only the rows of ``df`` that were sent by ``name``.

    Args:
        df: messages DataFrame with a ``sender_name`` column.
        name: sender name to match exactly.

    Returns:
        The rows whose ``sender_name`` equals ``name``. When ``name`` never
        appears, an empty DataFrame with the same columns as ``df``.
    """
    unique_names = df['sender_name'].unique()
    if name in unique_names:
        return df[df['sender_name'] == name]

    # Take the columns from the frame that was passed in. The previous
    # version read them from a notebook-level `test_df`, which fails when
    # that global does not exist and is wrong when it is a different frame.
    return pd.DataFrame(columns=df.columns)

In [None]:
def check_this(df, max_limit=100000000, min_limit=0):
    """Tell whether a conversation DataFrame should be kept.

    A conversation is rejected (``False``) when any of these holds:
      * 'Ben Knight' is one of its senders;
      * the number of messages sent by the notebook-level ``my_name`` is
        at or above ``max_limit``;
      * that number is below ``min_limit``.
    Otherwise it is accepted (``True``).
    """
    my_message_count = len(person(df, my_name))

    if 'Ben Knight' in set(df['sender_name']):
        return False
    if my_message_count >= max_limit:
        return False
    return not (my_message_count < min_limit)

In [None]:
_PUNCTUATION = frozenset(string.punctuation)
_WAYS = ('common', 'count', 'percent')


def _strip_punctuation(word):
    """Return ``word`` with every ASCII punctuation character removed."""
    cleaned = ''.join(ch for ch in word if ch not in _PUNCTUATION)
    # On Python 2 keep returning UTF-8 byte strings, as the original
    # encode-then-translate code did; on Python 3 plain str is returned.
    if str is bytes and not isinstance(cleaned, str):
        cleaned = cleaned.encode('UTF8')
    return cleaned


def generate_wordlist(file_name, name, stop_file=False, way='common'):
    """Count the words ``name`` wrote in the conversation stored at ``file_name``.

    The file is loaded with ``convert_filename_to_df``, cleaned with
    ``clean_dataframe`` and narrowed to ``name`` with ``person``. Messages
    containing "<first name> sent a" ("you sent a" when ``name`` equals the
    notebook-level ``my_name``) are skipped, as are words containing
    'http' or starting with 'www'. Punctuation is stripped from every
    remaining word and empty results are dropped.

    Args:
        file_name: path to the conversation's JSON file.
        name: sender whose words are counted.
        stop_file: currently unused; kept so existing calls keep working.
        way: shape of the result:
            'common'  -> ``Counter`` mapping word -> occurrences;
            'count'   -> list of ``(word, occurrences)`` tuples;
            'percent' -> list of ``(word, rate)`` tuples where rate is
                         occurrences per 1000 counted words, rounded to
                         5 decimals.

    Returns:
        See ``way``.

    Raises:
        ValueError: if ``way`` is not one of the three supported values
            (previously this silently returned ``None``).
    """
    if way not in _WAYS:
        raise ValueError(
            "way must be one of {}, got {!r}".format(', '.join(_WAYS), way))

    chat_df = clean_dataframe(convert_filename_to_df(file_name))
    my_chats = person(chat_df, name)

    # pick name_word. if it's you, use 'you'. else, use their first name
    name_word = 'you' if name == my_name else name.lower().split()[0]
    sent_str = '{} sent a'.format(name_word)

    # Filter with a plain list so a sender without any message simply
    # yields no words instead of tripping over empty-mask indexing.
    texts = [text for text in my_chats['content'] if sent_str not in text]
    raw_words = " ".join(texts).lower().split()

    cleaned_words = (
        _strip_punctuation(word)
        for word in raw_words
        if 'http' not in word and not word.startswith('www')
    )
    base_words = Counter(word for word in cleaned_words if word)

    if way == 'common':
        return base_words
    if way == 'count':
        return list(base_words.items())

    # way == 'percent': sum once instead of once per word.
    total = sum(base_words.values())
    return [(word, round(1000.0 * occurrences / total, 5))
            for word, occurrences in base_words.items()]
            

In [None]:
# Scratch cell: the steps of generate_wordlist run at notebook scope, so the
# intermediate results (unicode_list, sample_words, base_words, ...) stay
# available to the cells below.
# main_file and my_name are assigned in the "User Inputs" cell further down,
# so that cell has to be run first.
file_name = main_file
name = my_name
# stop_file and way are set here but not read anywhere in this cell.
stop_file=True
way='common'

# Load the conversation and normalise it (sorted, lowercase content, dates).
chat_df_raw = convert_filename_to_df(file_name)
chat_df = clean_dataframe(chat_df_raw)

# Keep only the rows sent by `name`.
my_chats = person(chat_df, name)

# pick name_word. if it's you, use 'you'. else, use their first name
name_word = name.lower().split()[0] if name != my_name else 'you' 
sent_str = '{} sent a'.format(name_word)

# Keep the text of every message that does not contain sent_str.
convos = my_chats[my_chats['content'].apply(lambda x: sent_str not in x)]['content']

# All messages joined into one string and split on whitespace.
unicode_list = " ".join(convos.str.lower()).split()

# Drop link-like words, UTF-8 encode the rest and delete punctuation.
# NOTE(review): translate(None, chars) is the Python 2 byte-string form;
# confirm which interpreter this notebook targets.
sample_words = [x.encode('UTF8').translate(None, string.punctuation) for x in unicode_list if ('http' not in x) and (not x.startswith('www'))]

# Word -> occurrences, ignoring words that became empty once stripped.
base_words = Counter([w for w in sample_words if w != ''])

In [None]:
# Same cleaning as sample_words, but stop words are dropped first (compared
# before punctuation is stripped).
# NOTE(review): `stops` is not assigned in any cell visible here (only
# `stops2` is) - confirm where it is loaded.
# `x in series` tests the Series *index*, not its values, so the words are
# collected into a set and membership is tested against that.
stop_words = set(stops['word'])
sample_words2 = [x.encode('UTF8').translate(None, string.punctuation) for x in unicode_list if (x not in stop_words) and ('http' not in x) and (not x.startswith('www'))]

In [None]:
# Count the cleaned words, skipping blanks and anything in the stop list.
# The stop words are collected into a set once, so each membership test is
# a hash lookup instead of a scan of the whole column for every word.
stop_words2 = set(stops2['word'].values)
base_words3 = Counter([w for w in sample_words if (w != '') and (w not in stop_words2)])

In [None]:
# Load the stop-word list written by the export cell near the end of the
# notebook. keep_default_na=False stops read_csv from turning words such as
# 'null' or 'nan' into missing values, which would drop them from the list.
stops2 = pd.read_csv('stop_file2.csv', keep_default_na=False)

In [None]:
# The 100 most frequent words (with their counts) that my_name wrote in the
# main conversation.
# NOTE(review): stops2 is passed as stop_file, but generate_wordlist never
# reads that argument, so no stop words are filtered out here.
x = generate_wordlist(main_file, my_name, stops2).most_common(100)

# User Inputs

In [None]:
# The notebook owner's sender name, the conversation partner under study,
# and that conversation's JSON file.
my_name = 'Cassie Beth'
main_name = 'Ben Knight'
main_file = 'BenKnight/message.json'

# Folder that is listed later to find each conversation's message.json.
# On Python 2 the builtin input() evaluates what is typed as an expression,
# so a plain path would fail; use raw_input there and input() on Python 3.
try:
    _read_line = raw_input
except NameError:
    _read_line = input
inbox_url = _read_line("Where are the files located")

# Parsing the Messenger Data

In [None]:
# pull main dataset
# main_df: the messages of the main conversation exactly as loaded (not
# passed through clean_dataframe).
main_df = convert_filename_to_df(main_file)
# main_data: word counts for what main_name wrote in that conversation
# (generate_wordlist is called with its default way='common').
main_data = generate_wordlist(main_file, main_name)

In [None]:
# For each conversation folder under inbox_url, load its message.json and
# tally how many conversations check_this accepts ('good') or rejects ('bad').
#for num_limit in range(0,1001,50):
min_limit = 500
#max_limit = 500
count = {'good':0,'bad':0}

# os.listdir() returns entries in arbitrary order, so slicing off "the first
# one" with [1:] can drop a real conversation. Iterate in sorted order for
# repeatable runs and skip any entry that has no message.json (presumably
# what the [1:] was meant to get rid of, e.g. a stray non-folder entry).
for human in sorted(os.listdir(inbox_url)):
    file_name = '/'.join([inbox_url, human, 'message.json'])
    if not os.path.isfile(file_name):
        continue
    #print file_name
    test_df = convert_filename_to_df(file_name)
    if check_this(test_df, min_limit=min_limit):
        count['good'] += 1
        #print '{}, len: {}'.format(human, len(test_df))
    else:
        count['bad'] += 1
        #print 'Bad: {}'.format(human)

#    print 'With message limit {}, {}'.format(num_limit, count)

In [None]:
# type(generate_wordlist(file_name, my_name)), type(generate_wordlist(file_name, my_name).most_common(5))

In [None]:
# Build full_tbl: one row per word, one column per accepted conversation,
# holding my_name's per-1000-words rate for that word in that conversation.
#for num_limit in range(0,1001,50):
min_limit = 500
#max_limit = 500
count = {'good':0,'bad':0}
full_tbl = pd.DataFrame(columns=['word'])
#mc_count = 100 .most_common(mc_count)

# os.listdir() order is arbitrary, so the old [1:] slice could skip a real
# conversation; iterate in sorted order and skip entries without a
# message.json instead.
for human in sorted(os.listdir(inbox_url)):
    file_name = '/'.join([inbox_url, human, 'message.json'])
    if not os.path.isfile(file_name):
        continue
    #print file_name
    test_df = convert_filename_to_df(file_name)
    if not check_this(test_df, min_limit=min_limit):
        count['bad'] += 1
        continue
    count['good'] += 1

    # way='percent' already returns a list of (word, rate) tuples.
    test_words = generate_wordlist(file_name, my_name, way='percent')
    # NOTE(review): the column is named after the folder name up to its
    # first '_'; two folders sharing that prefix would collide - confirm
    # the prefixes are unique.
    test_tbl = pd.DataFrame(test_words, columns=['word', human.split('_')[0]])
    full_tbl = pd.merge(full_tbl, test_tbl, on='word', how='outer')
    #print '{}, len: {}'.format(human, len(test_df))

# Report the limit that was actually used (num_limit only exists in the
# commented-out sweep above) together with the tallies kept in the loop.
# The parenthesised form works as a statement on Python 2 and a call on 3.
print('With message limit {}, {}'.format(min_limit, count))

In [None]:
# Move the 'word' column into the index and treat a word that is absent
# from a conversation as a rate of 0.
full_tbl = full_tbl.set_index('word').fillna(0)

In [None]:
# Despite its name, 'total' is the row-wise mean: the average of each word's
# values across all columns currently in full_tbl.
full_tbl['total'] = full_tbl.mean(axis=1)

In [None]:
# Highest average rate first. DataFrame.sort() no longer exists in pandas;
# sort_values() is the replacement.
full_tbl = full_tbl.sort_values('total', ascending=False)

In [None]:
# Write only the words of stop_tbl (assigned in the cell below) to
# stop_file.csv, as a single 'word' column with a header row.
# to_csv returns None when it is given a path, so xx ends up as None.
xx = stop_tbl.reset_index()[['word']].to_csv('stop_file.csv', header=True, index=False)

In [None]:
# Stop words: rows whose 'total' is above the mean of the 'total' column.
# Alternative cut-off that was tried: mean + full_tbl['total'].std()
stop_tbl = full_tbl.loc[
    full_tbl['total'] > full_tbl['total'].mean()
]

In [None]:
# Write only the words of stop_tbl to stop_file2.csv, as a single 'word'
# column with a header row.
# to_csv returns None when it is given a path, so x2 ends up as None.
x2 = stop_tbl.reset_index()[['word']].to_csv('stop_file2.csv', header=True, index=False)

In [None]:
# Fun Facts of Learning:
#
# # number of "words" I typed to Ben for the first year we talked.
# len([x.translate(None, string.punctuation) for x in sample_words])
# > 231,132
#
# # number of unique words with punctuation
# len(set(sample_words))
# > 24,259
#
# # number of unique words, no punctuation!
# len(set([x.translate(None, string.punctuation) for x in sample_words]))
# > 14,176

# Fun Facts:
# strip out punctuation
# strip out http links

# think about:
# \xc3\xa3\xc2\xa1
# 3 instead of <3
# misspellings
# long one sentence, many words
