# Model Building

The goal of this notebook is to clean the data from the raw Facebook chat dump and build a model to predict which of my friends is most likely to have made a given statement.

In [None]:
import re

In [2]:
# List the working directory to confirm the chat dump file is present.
!ls -l

total 2164
-rw-rw-r-- 1 ubuntu ubuntu 2206374 Aug  7 20:35 cleaned1.txt
-rw-rw-r-- 1 ubuntu ubuntu      51 Aug  7 20:31 environment.yml
-rw-rw-r-- 1 ubuntu ubuntu     917 Aug  7 20:34 Facebook Machine Learning.ipynb


In [5]:
# Read the whole chat dump into memory as a single string.
# The context manager closes the file handle as soon as the read finishes
# (a bare open(...).read() leaves that to the garbage collector), and the
# explicit encoding keeps decoding independent of the machine's locale.
# NOTE(review): assumes cleaned1.txt is UTF-8 encoded -- confirm.
with open("cleaned1.txt", encoding="utf-8") as dump_file:
    raw_data = dump_file.read()

In [8]:
# Peek at the first 500 characters to see the markup that wraps each message.
print(raw_data[0:500])

<div class="message"><div class="message_header"><span class="user">Chris Dailey</span><span class="meta">Thursday, April 2, 2015 at 7:03pm EDT</span></div></div><p>a) training can make you resistant to torture, but everyone breaks except for the .01% who can beat it</p><div class="message"><div class="message_header"><span class="user">Chris Dailey</span><span class="meta">Thursday, April 2, 2015 at 7:02pm EDT</span></div></div><p>well</p><div class="message"><div class="message_header"><span c


In [11]:
# Remove every opening <div class="message"> tag from the dump. The tag is a
# fixed string, so a plain string replacement removes exactly the same text.
# Only the opening tag is removed; closing </div> tags are left in place.
cleaned_data = raw_data.replace('<div class="message">', "")

In [12]:
# Same 500-character peek, now with the opening message wrappers removed.
print(cleaned_data[0:500])

<div class="message_header"><span class="user">Chris Dailey</span><span class="meta">Thursday, April 2, 2015 at 7:03pm EDT</span></div></div><p>a) training can make you resistant to torture, but everyone breaks except for the .01% who can beat it</p><div class="message_header"><span class="user">Chris Dailey</span><span class="meta">Thursday, April 2, 2015 at 7:02pm EDT</span></div></div><p>well</p><div class="message_header"><span class="user">Matthew Lotz</span><span class="meta">Thursday, Apr


In [27]:
# Cut the text at every message header tag; each piece after a header holds
# one message's user span, meta span and body. The delimiter is a fixed
# string, so str.split produces the same pieces as the regex split.
messages = cleaned_data.split('<div class="message_header">')

In [28]:
# Element 0 of the split is whatever precedes the first header (an empty
# string when the data starts with a header), not a message, so drop it.
# The list is re-derived from cleaned_data rather than sliced in place so
# this cell is idempotent: re-running it no longer discards one more real
# message each time.
messages = re.split("<div class=\"message_header\">", cleaned_data)[1:]
# I guess Facebook only stores the last 10k messages in a group chat.
len(messages)

10000

In [30]:
# Keep the first ten messages as a small sample for trying out the parsing.
messages_sample = messages[0:10]

In [31]:
# Display the sample so the structure of one split-out message is visible.
messages_sample

['<span class="user">Chris Dailey</span><span class="meta">Thursday, April 2, 2015 at 7:03pm EDT</span></div></div><p>a) training can make you resistant to torture, but everyone breaks except for the .01% who can beat it</p>',
 '<span class="user">Chris Dailey</span><span class="meta">Thursday, April 2, 2015 at 7:02pm EDT</span></div></div><p>well</p>',
 '<span class="user">Matthew Lotz</span><span class="meta">Thursday, April 2, 2015 at 7:02pm EDT</span></div></div><p>not much of an officer, than</p>',
 '<span class="user">Chris Dailey</span><span class="meta">Thursday, April 2, 2015 at 7:02pm EDT</span></div></div><p>I would have immediately agreed that there are five lights</p>',
 '<span class="user">Matthew Lotz</span><span class="meta">Thursday, April 2, 2015 at 6:59pm EDT</span></div></div><p>At the end, when Picard is being released, he screams it at the guy before hobbling away</p>',
 '<span class="user">Matthew Lotz</span><span class="meta">Thursday, April 2, 2015 at 6:58pm ED

In [36]:
# Trial run of the (user, body) extraction on the small sample.
#
# The pattern is compiled once instead of being re-specified on every
# iteration. Compared with a plain greedy pattern:
#   * re.DOTALL lets "." match newlines, so a body that spans several lines
#     still matches instead of failing the search;
#   * the user and meta parts are non-greedy, so they stop at the first
#     closing tag and cannot run on into the message body.
# Group 1 is the user name, group 2 is the message body; the timestamp in
# the "meta" span is matched but not captured.
MESSAGE_PATTERN = re.compile(
    r'<span class="user">(.*?)</span><span class="meta">.*?</span></div></div><p>(.*)</p>',
    re.DOTALL,
)

for message in messages_sample:
    parts = MESSAGE_PATTERN.search(message)
    if parts:
        print(parts.groups())

('Chris Dailey', 'a) training can make you resistant to torture, but everyone breaks except for the .01% who can beat it')
('Chris Dailey', 'well')
('Matthew Lotz', 'not much of an officer, than')
('Chris Dailey', 'I would have immediately agreed that there are five lights')
('Matthew Lotz', 'At the end, when Picard is being released, he screams it at the guy before hobbling away')
('Matthew Lotz', 'He keeps offering nicer and nicer things if only Picard will agree that there are 5 lights')
('Matthew Lotz', 'by torturing him everytime he says that there are four lights (which there are)')
('Matthew Lotz', 'the scary dude is trying to break Picard')
('Matthew Lotz', 'maybe even a two-parter')
('Matthew Lotz', "It's a whole episode")


In [52]:
# Build the parallel lists used for modelling:
#   labels[i] -- name of the user who sent message i
#   docs[i]   -- text of message i
#
# The pattern is compiled once, outside the loop. re.DOTALL lets "." match
# newlines so bodies that span several lines are kept rather than failing
# the search, and the non-greedy user/meta parts stop at the first closing
# tag so they cannot run on into the body. The timestamp in the "meta" span
# is matched but not captured.
MESSAGE_PATTERN = re.compile(
    r'<span class="user">(.*?)</span><span class="meta">.*?</span></div></div><p>(.*)</p>',
    re.DOTALL,
)

labels = []
docs = []
# Messages that did not fit the expected markup are kept here for
# inspection instead of being dropped without a trace.
unparsed_messages = []

for message in messages:
    match = MESSAGE_PATTERN.search(message)
    if match is None:
        unparsed_messages.append(message)
        continue
    author, text = match.groups()
    # Appended together so the two lists stay aligned index-for-index.
    labels.append(author)
    docs.append(text)


In [53]:
# First 20 extracted user names.
labels[:20]

['Chris Dailey',
 'Chris Dailey',
 'Matthew Lotz',
 'Chris Dailey',
 'Matthew Lotz',
 'Matthew Lotz',
 'Matthew Lotz',
 'Matthew Lotz',
 'Matthew Lotz',
 'Matthew Lotz',
 'Benjamin McCall',
 'Benjamin McCall',
 'Chris Dailey',
 'Chris Dailey',
 'Benjamin McCall',
 'Matthew Lotz',
 'Matthew Lotz',
 'Matthew Lotz',
 'Matthew Lotz',
 'Benjamin McCall']

In [54]:
# First 20 extracted message bodies (same order as labels).
docs[:20]

['a) training can make you resistant to torture, but everyone breaks except for the .01% who can beat it',
 'well',
 'not much of an officer, than',
 'I would have immediately agreed that there are five lights',
 'At the end, when Picard is being released, he screams it at the guy before hobbling away',
 'He keeps offering nicer and nicer things if only Picard will agree that there are 5 lights',
 'by torturing him everytime he says that there are four lights (which there are)',
 'the scary dude is trying to break Picard',
 'maybe even a two-parter',
 "It's a whole episode",
 '',
 'Matt. We need to make Chris watch TNG',
 "less dramatically, presumably because I haven't seen this scene",
 "I would've also said four",
 'no, there are five',
 'LIGHTS!',
 'FOUR',
 'ARE',
 'THERE',
 'Matt....... how many lights do you see']

In [57]:
# Show the first 20 (user, message) pairs side by side to check alignment.
for author, text in zip(labels[:20], docs[:20]):
    print((author, text))

('Chris Dailey', 'a) training can make you resistant to torture, but everyone breaks except for the .01% who can beat it')
('Chris Dailey', 'well')
('Matthew Lotz', 'not much of an officer, than')
('Chris Dailey', 'I would have immediately agreed that there are five lights')
('Matthew Lotz', 'At the end, when Picard is being released, he screams it at the guy before hobbling away')
('Matthew Lotz', 'He keeps offering nicer and nicer things if only Picard will agree that there are 5 lights')
('Matthew Lotz', 'by torturing him everytime he says that there are four lights (which there are)')
('Matthew Lotz', 'the scary dude is trying to break Picard')
('Matthew Lotz', 'maybe even a two-parter')
('Matthew Lotz', "It's a whole episode")
('Benjamin McCall', '')
('Benjamin McCall', 'Matt. We need to make Chris watch TNG')
('Chris Dailey', "less dramatically, presumably because I haven't seen this scene")
('Chris Dailey', "I would've also said four")
('Benjamin McCall', 'no, there are five')
(