In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import string, os 
import re
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from scipy import spatial

In [2]:
# reading dataset
df = pd.read_csv('../input/chatbot-dataset-topical-chat/topical_chat.csv')
df

Unnamed: 0,conversation_id,message,sentiment
0,1,Are you a fan of Google or Microsoft?,Curious to dive deeper
1,1,Both are excellent technology they are helpfu...,Curious to dive deeper
2,1,"I'm not a huge fan of Google, but I use it a...",Curious to dive deeper
3,1,Google provides online related services and p...,Curious to dive deeper
4,1,"Yeah, their services are good. I'm just not a...",Curious to dive deeper
...,...,...,...
188373,8628,"Wow, it does not seem like that long. Since I...",Surprised
188374,8628,"I havent seen that episode, I might google it...",Curious to dive deeper
188375,8628,I don't think I have either. That's an insane...,Curious to dive deeper
188376,8628,"I did, my little brother used to love Thomas ...",Happy


In [3]:
# dropping duplicates
df.drop_duplicates(subset=['conversation_id', 'message'], inplace=True)

In [4]:
df.count()

conversation_id    188336
message            188336
sentiment          188336
dtype: int64

In [5]:
# basic preprocessing
def process(text):
    text = text.lower().replace('\n', ' ').replace('-', ' ').replace(':', ' ').replace(',', '') \
          .replace('"', '').replace("...", ".").replace("..", ".").replace("!", ".").replace("?", "").replace(";", ".").replace(":", " ")

    #text = "".join(v for v in text if v not in string.punctuation).lower()
    #text = text.encode("utf8").decode("ascii",'ignore')

    text = " ".join(text.split())
    #text+="<eos>"
    return text

In [6]:
df.message = df.message.apply(process)

In [7]:
df.head()

Unnamed: 0,conversation_id,message,sentiment
0,1,are you a fan of google or microsoft,Curious to dive deeper
1,1,both are excellent technology they are helpful...,Curious to dive deeper
2,1,i'm not a huge fan of google but i use it a lo...,Curious to dive deeper
3,1,google provides online related services and pr...,Curious to dive deeper
4,1,yeah their services are good. i'm just not a f...,Curious to dive deeper


In [8]:
# Vectorize the data.
input_texts = []
target_texts = []

for conversation_index in tqdm(range(df.shape[0])):
    
    if conversation_index == 0:
        continue
        
    input_text = df.iloc[conversation_index - 1]
    target_text = df.iloc[conversation_index]
    
    """
    if "." in question:
        if question.index(".") <= len(question):
            question = question.split(".")[len(question.split(".")) - 1].strip()
            
            
    
    if "." in answer:
        if answer.index(".") <= len(answer):
            answer = answer.split('.')[0].strip()
    
    answer = "\t" + answer + "\n"
    """
    if input_text.conversation_id == target_text.conversation_id:
        
        input_text = input_text.message
        target_text = target_text.message
        
        if len(input_text.split()) > 2 and \
            len(target_text.split()) > 0 and \
            len(input_text.split()) < 50 and \
            len(target_text.split()) < 10 and \
            input_text and \
            target_text:
                
            input_texts.append(input_text)
            target_texts.append(target_text)

100%|██████████| 188336/188336 [01:30<00:00, 2078.86it/s]


In [9]:
print(len(input_texts))
print(len(target_texts))

21861
21861


In [10]:
#saving
with open('input_texts.pickle', 'wb') as handle:
    pickle.dump(input_texts, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
#saving
with open('target_texts.pickle', 'wb') as handle:
    pickle.dump(target_texts, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
pip install --user annoy

Note: you may need to restart the kernel to use updated packages.


In [12]:
#tokenizing data

input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(input_texts) 
input_sequences = input_tokenizer.texts_to_sequences(input_texts)

target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts(target_texts) 
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

In [13]:
def generate_padded_sequences(sequences):
    max_sequence_len = max([len(x) for x in sequences])
    sequences = np.array(tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_sequence_len, padding='pre'))
    return sequences, max_sequence_len

In [14]:
input_total_words = len(input_tokenizer.word_index) + 1
input_sequences, input_max_sequence_len = generate_padded_sequences(input_sequences)

In [15]:
# using annoy algorithm

from annoy import AnnoyIndex
import random

inputAnnoyIndex = AnnoyIndex(input_max_sequence_len, 'angular')  # Length of item vector that will be indexed
for i, row in enumerate(input_sequences):
    inputAnnoyIndex.add_item(i, row)

inputAnnoyIndex.build(100) # 100 trees
inputAnnoyIndex.save('input_annoy.ann')

True

In [16]:
#inputAnnoyIndex = AnnoyIndex(input_max_sequence_len, 'angular')
#inputAnnoyIndex.load('input_annoy.ann') # super fast, will just mmap the file
#print(u.get_nns_by_item(0, 100)) # will find the 1000 nearest neighbors

In [17]:
query_index = 7
neighbor_index, distances = inputAnnoyIndex.get_nns_by_item(query_index, 10, include_distances=True)

print(input_texts[query_index])
print("\n-------------Nearest Neighbours-------------\n")
for i in range(len(neighbor_index)):
    print(input_texts[neighbor_index[i]], "--------- neighbor_index -", neighbor_index[i], ", distance -", distances[i])

hi. do you like to dance

-------------Nearest Neighbours-------------

hi do you like to dance --------- neighbor_index - 5446 , distance - 0.0
hi do you like to travel --------- neighbor_index - 9930 , distance - 0.029285771772265434
hi do you like to travel --------- neighbor_index - 10582 , distance - 0.029285771772265434
absolutely cool. great chat. go cowboys. --------- neighbor_index - 20024 , distance - 0.06294306367635727
yep and it is derived from the latin word congressus. --------- neighbor_index - 13936 , distance - 0.07048922032117844
hi there do you like dramas --------- neighbor_index - 11006 , distance - 0.08256812393665314
are you sure hip doesn't already have an entry --------- neighbor_index - 17554 , distance - 0.09979437291622162
true but when else do you hear about qatar. --------- neighbor_index - 12723 , distance - 0.10811832547187805
yeah that is really cool i didnt know that for the longest. --------- neighbor_index - 19131 , distance - 0.1142072081565857
i e

In [18]:
a = [-1] * 5
a[3] = 34
a

[-1, -1, -1, 34, -1]

In [19]:
"""input_similarity_matrix = list()

for i in tqdm(range(len(input_texts))):
    neighbor_index, distances = inputAnnoyIndex.get_nns_by_item(i, len(input_texts), include_distances=True)
    
    input_similarity_row = [-1] * len(input_texts)
    for index in range(len(neighbor_index)):
        j = neighbor_index[index]
        input_similarity_row[j] = distances[index]
            
    input_similarity_matrix.append(input_similarity_row)
"""

'input_similarity_matrix = list()\n\nfor i in tqdm(range(len(input_texts))):\n    neighbor_index, distances = inputAnnoyIndex.get_nns_by_item(i, len(input_texts), include_distances=True)\n    \n    input_similarity_row = [-1] * len(input_texts)\n    for index in range(len(neighbor_index)):\n        j = neighbor_index[index]\n        input_similarity_row[j] = distances[index]\n            \n    input_similarity_matrix.append(input_similarity_row)\n'

In [20]:
# generating similarity matrix for input texts
with open('input_similarity_matrix.txt', 'w') as input_similarity_matrix_file:
    for i in tqdm(range(len(input_texts))):
        neighbor_index, distances = inputAnnoyIndex.get_nns_by_item(i, len(input_texts), include_distances=True)

        input_similarity_row = [-1] * len(input_texts)
        for index in range(len(neighbor_index)):
            j = neighbor_index[index]
            input_similarity_row[j] = distances[index]
        
        
        input_similarity_matrix_file.write(str(input_similarity_row))
        if i != len(input_texts) - 1:
            input_similarity_matrix_file.write("\n")
            

input_similarity_matrix_file.flush()
input_similarity_matrix_file.close()

100%|██████████| 21861/21861 [1:45:59<00:00,  3.44it/s]


ValueError: I/O operation on closed file.

In [21]:
target_total_words = len(target_tokenizer.word_index) + 1
target_sequences, target_max_sequence_len = generate_padded_sequences(target_sequences)

targetAnnoyIndex = AnnoyIndex(target_max_sequence_len, 'angular')  # Length of item vector that will be indexed
for i, row in enumerate(target_sequences):
    targetAnnoyIndex.add_item(i, row)

targetAnnoyIndex.build(100) # 100 trees
targetAnnoyIndex.save('target_annoy.ann')

query_index = 7
neighbor_index, distances = targetAnnoyIndex.get_nns_by_item(query_index, 10, include_distances=True)

print(target_texts[query_index])
print("\n-------------Nearest Neighbours-------------\n")
for i in range(len(neighbor_index)):
    print(target_texts[neighbor_index[i]], "--------- neighbor_index -", neighbor_index[i], ", distance -", distances[i])

i love to dance a lot. how about you

-------------Nearest Neighbours-------------

i love to dance a lot. how about you --------- neighbor_index - 7 , distance - 0.0
sure. i mostly like netflix. how about you --------- neighbor_index - 8818 , distance - 0.09169900417327881
i know a little i guess. how about you. --------- neighbor_index - 12472 , distance - 0.1020965501666069
no i haven't. i heard it was good. --------- neighbor_index - 1940 , distance - 0.1167769581079483
sure. what kind of music do you like --------- neighbor_index - 14184 , distance - 0.16140127182006836
so true. what kinds of entertainment do you like. --------- neighbor_index - 6861 , distance - 0.17580711841583252
that is nuts. that 70s show was great. --------- neighbor_index - 20996 , distance - 0.17930185794830322
i even have old 78 and 33 1/3 albums --------- neighbor_index - 4943 , distance - 0.1824990063905716
i have not watched that show. is it good --------- neighbor_index - 9091 , distance - 0.182571172

In [22]:
# generating similarity matrix for target
with open('target_similarity_matrix.txt', 'w') as target_similarity_matrix_file:
    for i in tqdm(range(len(target_texts))):
        neighbor_index, distances = targetAnnoyIndex.get_nns_by_item(i, len(target_texts), include_distances=True)

        target_similarity_row = [-1] * len(target_texts)
        for index in range(len(neighbor_index)):
            j = neighbor_index[index]
            target_similarity_row[j] = distances[index]
        
        
        target_similarity_matrix_file.write(str(target_similarity_row))
        if i != len(target_texts) - 1:
            target_similarity_matrix_file.write("\n")
            

target_similarity_matrix_file.flush()
target_similarity_matrix_file.close()

100%|██████████| 21861/21861 [2:14:52<00:00,  2.70it/s]


ValueError: I/O operation on closed file.