# How to make a Facebook Chatbot that talks like you

First we need to clean up the data that we have. For that we'll need Pandas and NumPy.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the exported message history from 'threads.csv' into a DataFrame.
# Later cells read its 'thread', 'sender' and 'message' columns.
df = pd.read_csv('threads.csv')
# Preview the first rows to check that the file parsed as expected.
df.head()

Now let's take a look at the metadata.

In [None]:
# Summary statistics for the loaded message table.
#
# Series.nunique() ignores missing values, so a NaN in 'thread' or 'sender'
# cannot break the count; sorting mixed str/float values (as np.unique does on
# an object column) raises TypeError on Python 3.
total_messages = len(df.index)
thread_count = int(df['thread'].nunique())
sender_count = int(df['sender'].nunique())
# Number of rows (messages) whose thread is labelled 'Facebook User'.
facebook_user_rows = int((df['thread'] == 'Facebook User').sum())
# Number of rows whose sender is the literal placeholder 'Unknown'.
unknown_sender_rows = int((df['sender'] == 'Unknown').sum())

print('{:,} messages'.format(total_messages))
print('{:,} threads (not counting unknown)'.format(thread_count))
print('{:,} unique users (not counting unknown)'.format(sender_count))
print('{:,} threads with people you have blocked'.format(facebook_user_rows))
print('{:,} messages that lack senders'.format(unknown_sender_rows))

**Good looks!** Next let's spy on the senders and make sure everything looks good. You'll notice that I look at the top 11 instead of the top 10--this is because the **top** sender will be me since I participate in _all_ my convos!

In [None]:
# Top 10 people I talk to.
# Eleven rows are taken because the most frequent sender is expected to be me.
messages_per_sender = df.groupby('sender').size()
z = messages_per_sender.sort_values(ascending=False).head(11)
z.index.tolist()

In [None]:
# Top 20 threads: take the 23 largest threads by message count, then drop the
# ones at ranks 6, 15 and 17 (0-based), which were hand-picked for exclusion.
thread_sizes = df.groupby('thread').size()
tt10 = thread_sizes.sort_values(ascending=False).head(23).index
skipped_ranks = {6, 15, 17}
top10threads = [
    name
    for rank, name in enumerate(tt10.values)
    if rank not in skipped_ranks
]
print(top10threads)
len(top10threads)

You'll notice that above I messed with the array a little to skip some threads. This is because some of the threads that showed up were group chats with tons of spam and very low signal--plus, most of the replies aren't me, and there isn't consistency in speakers. As a result, I clean those out.

**Next, we'll aggregate the messages so that the senders alternate. This simply requires us to combine consecutive messages from the same sender.**

In [None]:
# Build `msgs`: for every selected thread, merge runs of consecutive messages
# from the same sender into one turn and keep only the rows flagged as usable.
#
# Positional column indices into each row of `thread_df.values`.
# NOTE(review): SENDER and MESSAGE assume the CSV column order -- confirm
# against df.columns. USE addresses the 'use' flag column, appended last below.
SENDER = 1
MESSAGE = 3
USE = -1

dots = dict()    # sender -> count of messages blanked for being too short
photos = dict()  # sender -> count of messages blanked as image references
kept_rows = []   # one array of flagged rows per thread, concatenated at the end

# Group once up front instead of rebuilding the groupby for every thread.
threads_by_name = df.groupby('thread')

for thread in top10threads:
    # Take an explicit copy and assign the filled columns back. Calling
    # fillna(inplace=True) on a column selected from a frame is a chained
    # operation: it warns on recent pandas and has no effect on the frame
    # under copy-on-write, which would leave NaN messages in place.
    thread_df = threads_by_name.get_group(thread).copy()
    thread_df['message'] = thread_df['message'].fillna('')
    thread_df['sender'] = thread_df['sender'].fillna('')
    thread_df['use'] = 0
    convo = thread_df.values
    for i, msg in enumerate(convo):
        # The first row has no predecessor to merge with or to flag.
        if i == 0:
            continue
        body = msg[MESSAGE]
        if (len(body.replace('.', '').replace(' ', '')) < 6
                and 'yes' not in body and 'no' not in body):
            # Too short to be informative (unless it contains 'yes'/'no'):
            # count it and blank it out.
            dots[msg[SENDER]] = dots.get(msg[SENDER], 0) + 1
            convo[i][MESSAGE] = ''
            continue
        elif 'image reference' in body:
            # Image placeholder: count it and blank it out.
            photos[msg[SENDER]] = photos.get(msg[SENDER], 0) + 1
            convo[i][MESSAGE] = ''
            continue
        elif msg[SENDER] == convo[i-1][SENDER]:
            # Same sender as the previous row: carry the accumulated text
            # forward so the last row of a run holds the whole turn.
            if convo[i-1][MESSAGE]:
                convo[i][MESSAGE] = (convo[i-1][MESSAGE] + '. ' + body).replace('\n', '. ')
        else:
            # Sender changed: the previous row ends a turn, so flag it.
            if convo[i-1][MESSAGE]:
                convo[i-1][USE] = 1
    # NOTE(review): a row is only flagged when the *next* row has a different
    # sender, so the final turn of each thread is never flagged. A blanked row
    # also stops the text before it from being carried forward or flagged.
    # Left unchanged here; confirm whether this is intended.
    data = convo[convo[:, USE] == 1]
    print("Retrieved message history, shape %s" % str(data.shape))
    kept_rows.append(data)

# Single concatenation; stays None when no thread was processed.
msgs = np.concatenate(kept_rows, axis=0) if kept_rows else None

# Per-sender tallies of blanked messages, largest count first.
dot_lines = ['%s: %d' % (x, y) for x, y in sorted(dots.items(), key=lambda kv: kv[1])[::-1]]
photo_lines = ['%s: %d' % (x, y) for x, y in sorted(photos.items(), key=lambda kv: kv[1])[::-1]]
print('Dot distribution: \n' + '\n'.join(dot_lines))
print('Photos distribution: \n' + '\n'.join(photo_lines))

Now that we have our data, let's write it out to a text file.

In [None]:
# Report how many aggregated rows ended up in the dataset.
message_count = msgs.shape[0]
print("Built dataset of {:,} messages".format(message_count))

In [None]:
# Pull the message-text column (position 3 of each row) out of the dataset
# and preview the first ten entries.
MESSAGE_COLUMN = 3
text = msgs[:, MESSAGE_COLUMN]
text[:10]

In [None]:
import os
from tqdm import tqdm

In [None]:
# Dump every aggregated message to 'conversationData.txt', appending a newline
# after each one; tqdm shows progress while writing.
with open('conversationData.txt', 'w') as out_file:
    for message in tqdm(text):
        out_file.write(message + '\n')

In [None]:
# Sanity-check the dump: the file must hold at least one line per message.
# It may hold more, since a message that itself contains a newline spans
# several lines in the file.
with open('conversationData.txt', 'r') as f:
    # Stream the file to count lines instead of loading them all into a list.
    a, b = sum(1 for _ in f), len(text)
    assert a >= b, "`conversationData.txt` contains %d messages but there are at least %d messages in dataset!" % (a, b)

In [None]:
def cnk(l, n):
    """Yield consecutive slices of ``l`` that are each exactly ``n`` items long.

    Any trailing items that do not fill a complete chunk are dropped.
    """
    complete_chunks = len(l) // n
    for chunk_no in range(complete_chunks):
        start = chunk_no * n
        yield l[start:start + n]
# Pair up consecutive entries of the first 200000 messages as
# (message, reply) tuples and save the resulting array.
MAX_MESSAGES = 200000
message_reply_tuples = [tuple(chunk) for chunk in cnk(text[:MAX_MESSAGES], 2)]
pairs = np.array(message_reply_tuples)
np.save('conversationDictionary.npy', pairs)

In [None]:
# Reload the saved array and confirm its shape matches what was built in memory.
reloaded = np.load('conversationDictionary.npy')
assert reloaded.shape == pairs.shape, "didnt save correctly"
print(pairs.shape)

And now we're done! Head over to the training notebook.