# Whatsapp chat bot

In [65]:
def split_text(filename):
  '''Split file contents by newline.'''

  chat = open(filename)
  chat_text = chat.read()
  return chat_text.splitlines()

In [66]:
# Raw chat data
raw_chat = split_text('/Users/guybrett-robertson/Documents/data/whatsapp_chats/alison_chat.txt')
print(raw_chat[0:10])

['[04/05/2020, 11:51:07] Alison: \u200eMessages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.', '[04/05/2020, 11:51:07] Alison: Hey! Happy Monday!', '[04/05/2020, 14:03:58] Guy: Salut!', '[04/05/2020, 14:04:06] Guy: Happy Monday to you 😁 x', '[04/05/2020, 14:09:20] Guy: How are you?', '[04/05/2020, 15:26:34] Alison: Fine :) how are you?', '[04/05/2020, 15:27:48] Alison: Btw I was wondering what was that quote about the bourgeoisie on your profile? Haha', '[04/05/2020, 17:25:06] Guy: I’m well thanks! Just finished work and about to work out. I’ve started watching tv whole exercising and it’s great 😄', '[04/05/2020, 17:25:11] Guy: while*', '[04/05/2020, 17:25:45] Guy: Haha overthrowing the bourgeoisie is phrase that’s kinda thrown around by particularly left-leaning people']


In [67]:
n = len(raw_chat)
print(n)

26220


In [68]:
# List of messages from me
me_chat = []
# Corresponding list of responses to my messages
friend_chat = []

previous_row = [None, None]

# Iterate through all messages
for i in range(n):
    row = raw_chat[i]
    # Check that the row is not empty
    if len(row) != 0:
        # Check that the row is valid, excludes picture messages, etc.
        if row[0] == '[':
            # Remove time stamp
            row = row.split('] ')[1]
            # Specifies who sent the message
            row = row.split(': ')[0:2]
            # Only keep last message in a string of message from me, and my friend's first response to this
            if previous_row[0] == 'Guy' and row[0] != 'Guy':
                me_chat.append(previous_row[1])
                friend_chat.append(row[1])
            previous_row = row

In [69]:
n = len(me_chat)
print(n)

5349


In [70]:
for i in range(10):
    print('Me:', me_chat[i])
    print('Friend:', friend_chat[i])
    print()

Me: How are you?
Friend: Fine :) how are you?

Me: But it lets you know where I stand
Friend: What do you work in? I don't think I've asked 🙂

Me: There isn’t much choice haha
Friend: Oooo very interesting! How do you see the economy of the country going? The way I see it is quite pessimistic so I'd love to have your insights 😆

Me: When you say “once I’m back home”, do you mean Avignon?
Friend: Damnit 😅

Me: Yeah bouldering is just like climbing except without the ropes and harnesses, so you don’t go nearly as high
Friend: Ah so I've actually done that in London but called it the wrong way

Me: It’s so fun isn’t it? I had only been doing it for about 6 months before the lockdown
Friend: I know how hard it is so total respect haha it is really fun though

Me: Yeah it’s super hard 😅 always amazing when someone half my size can climb things I can’t
Friend: Haha 🤣 yeah it's very impressive!

Me: For sure, and I was just starting to get good when I had to stop 😤
Friend: Did you find a plac

In [71]:
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec

In [82]:
stop_words = stopwords.words('english')
stop_words = []

In [83]:
tagged_data = []
for i in range(n):
    message = word_tokenize(me_chat[i].lower())
    tagged_message = [word for word in message if word not in stop_words]
    tagged_data.append(TaggedDocument(words=tagged_message, tags = [i]))

In [84]:
print(tagged_data[0:5])

[TaggedDocument(words=['how', 'are', 'you', '?'], tags=[0]), TaggedDocument(words=['but', 'it', 'lets', 'you', 'know', 'where', 'i', 'stand'], tags=[1]), TaggedDocument(words=['there', 'isn', '’', 't', 'much', 'choice', 'haha'], tags=[2]), TaggedDocument(words=['when', 'you', 'say', '“', 'once', 'i', '’', 'm', 'back', 'home', '”', ',', 'do', 'you', 'mean', 'avignon', '?'], tags=[3]), TaggedDocument(words=['yeah', 'bouldering', 'is', 'just', 'like', 'climbing', 'except', 'without', 'the', 'ropes', 'and', 'harnesses', ',', 'so', 'you', 'don', '’', 't', 'go', 'nearly', 'as', 'high'], tags=[4])]


In [85]:
max_epochs = 100
vec_size = 100
alpha = 0.025

In [86]:
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

In [87]:
model.build_vocab(tagged_data)

In [88]:
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("doc2vec.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [55]:
#model= Doc2Vec.load("doc2vec.model")

In [127]:
test_data = 'How\'s it going?'
test_data = word_tokenize(test_data.lower())
print(test_data)
test_data = [word for word in test_data if word not in stop_words]
test_vector = model.infer_vector(test_data, epochs=1000)
similar_doc = model.docvecs.most_similar([test_vector])[0][0]
print(similar_doc)
print(me_chat[similar_doc])
print(friend_chat[similar_doc])

['how', "'s", 'it', 'going', '?']
1283
How’s it going? X
Bonjour toi! :)


In [90]:
test_data

['how', 'are', 'you', '?']

In [91]:
raw_chat

['[04/05/2020, 11:51:07] Alison: \u200eMessages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.',
 '[04/05/2020, 11:51:07] Alison: Hey! Happy Monday!',
 '[04/05/2020, 14:03:58] Guy: Salut!',
 '[04/05/2020, 14:04:06] Guy: Happy Monday to you 😁 x',
 '[04/05/2020, 14:09:20] Guy: How are you?',
 '[04/05/2020, 15:26:34] Alison: Fine :) how are you?',
 '[04/05/2020, 15:27:48] Alison: Btw I was wondering what was that quote about the bourgeoisie on your profile? Haha',
 '[04/05/2020, 17:25:06] Guy: I’m well thanks! Just finished work and about to work out. I’ve started watching tv whole exercising and it’s great 😄',
 '[04/05/2020, 17:25:11] Guy: while*',
 '[04/05/2020, 17:25:45] Guy: Haha overthrowing the bourgeoisie is phrase that’s kinda thrown around by particularly left-leaning people',
 '[04/05/2020, 17:26:20] Guy: There isn’t really much overthrowing going on while I’m stuck at home!',
 '[04/05/2020, 17:26:39] Guy: But it l

In [63]:
similar_doc = model.docvecs.most_similar([test_vector])[0][0]
print(similar_doc)

135


In [64]:
print(me_chat[similar_doc])
print(friend_chat[similar_doc])

Just
You've gotten way better as you've got older


In [126]:
from gensim.models.doc2vec import Doc2Vec

model= Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
test_data = word_tokenize("where should we go tonight?".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)


# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.docvecs['1'])

V1_infer [ 0.02786537 -0.15617043  0.09614474 -0.15351231  0.0056464   0.14709063
 -0.10719717 -0.20774604  0.25283945 -0.11388876  0.08232043 -0.02487154
 -0.18636256  0.06143482 -0.02890896 -0.07912687 -0.08651228  0.03683301
 -0.07879587 -0.02497978]
[('795', 0.7551185488700867), ('279', 0.7240581512451172), ('269', 0.7106707096099854), ('276', 0.7081595659255981), ('2604', 0.7076005935668945), ('1993', 0.703026533126831), ('90', 0.6891791820526123), ('2951', 0.6830059289932251), ('250', 0.6760359406471252), ('274', 0.6604530215263367)]
[ 2.6063082  -0.80071914  4.493682   -3.5328135   0.60080296  4.171874
  1.9757626  -0.9834134   1.4936322   1.0053214   1.4235753  -3.937983
 -0.03474484 -2.4155326   3.5768118  -1.1464969  -0.5091724   0.68507594
 -1.0770615   1.4298849 ]


In [128]:
similar_doc = model.docvecs.most_similar([v1])
print(similar_doc[0][0])

1379


In [132]:
me_chat[795]

'Becky is gonna go to Old Street and Jake and I will hang with you'

In [133]:
friend_chat[795]

'No 😩 now I feel terrible'