In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import string, os 
import re
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import gc

In [2]:
# reading dataset
df = pd.read_csv('../input/chatbot-dataset-topical-chat/topical_chat.csv')
df

Unnamed: 0,conversation_id,message,sentiment
0,1,Are you a fan of Google or Microsoft?,Curious to dive deeper
1,1,Both are excellent technology they are helpfu...,Curious to dive deeper
2,1,"I'm not a huge fan of Google, but I use it a...",Curious to dive deeper
3,1,Google provides online related services and p...,Curious to dive deeper
4,1,"Yeah, their services are good. I'm just not a...",Curious to dive deeper
...,...,...,...
188373,8628,"Wow, it does not seem like that long. Since I...",Surprised
188374,8628,"I havent seen that episode, I might google it...",Curious to dive deeper
188375,8628,I don't think I have either. That's an insane...,Curious to dive deeper
188376,8628,"I did, my little brother used to love Thomas ...",Happy


In [3]:
df.drop_duplicates(subset=['conversation_id', 'message'], inplace=True)

In [4]:
df.count()

conversation_id    188336
message            188336
sentiment          188336
dtype: int64

In [5]:
# basic preprocessing
def process(text):
    text = text.lower().replace('\n', ' ').replace('-', ' ').replace(':', ' ').replace(',', '') \
          .replace('"', '').replace("...", ".").replace("..", ".").replace("!", ".").replace("?", "").replace(";", ".").replace(":", " ")

    #text = "".join(v for v in text if v not in string.punctuation).lower()
    #text = text.encode("utf8").decode("ascii",'ignore')

    text = " ".join(text.split())
    #text+="<eos>"
    return text

In [6]:
df.message = df.message.apply(process)

In [7]:
df.head()

Unnamed: 0,conversation_id,message,sentiment
0,1,are you a fan of google or microsoft,Curious to dive deeper
1,1,both are excellent technology they are helpful...,Curious to dive deeper
2,1,i'm not a huge fan of google but i use it a lo...,Curious to dive deeper
3,1,google provides online related services and pr...,Curious to dive deeper
4,1,yeah their services are good. i'm just not a f...,Curious to dive deeper


In [8]:
# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

for conversation_index, row in tqdm(df.iterrows(), total=df.shape[0]):
    
    if conversation_index == 0:
        continue
        
    if len(input_texts) > 5000:
        continue
        
    input_text = df.iloc[conversation_index - 1]
    target_text = df.iloc[conversation_index]
    
    """
    if "." in question:
        if question.index(".") <= len(question):
            question = question.split(".")[len(question.split(".")) - 1].strip()
            
            
    
    if "." in answer:
        if answer.index(".") <= len(answer):
            answer = answer.split('.')[0].strip()
    
    answer = "\t" + answer + "\n"
    """
    if input_text.conversation_id == target_text.conversation_id:
        
        input_text = input_text.message
        target_text = target_text.message
        
        if len(input_text.split()) > 2 and \
            len(target_text.split()) > 0 and \
            len(input_text.split()) < 10 and \
            len(target_text.split()) < 10 and \
            input_text and \
            target_text:
                
            input_texts.append(input_text)
            target_texts.append(target_text)

            for char in input_text:
                if char not in input_characters:
                    input_characters.add(char)
            for char in target_text:
                if char not in target_characters:
                    target_characters.add(char)

100%|██████████| 188336/188336 [01:17<00:00, 2421.76it/s] 


In [9]:
print(len(input_texts))
print(len(target_texts))

5001
5001


In [10]:
#saving
with open('input_texts.pickle', 'wb') as handle:
    pickle.dump(input_texts, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
#saving
with open('target_texts.pickle', 'wb') as handle:
    pickle.dump(target_texts, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

#saving
with open('input_characters.pickle', 'wb') as handle:
    pickle.dump(input_characters, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
#saving
with open('target_characters.pickle', 'wb') as handle:
    pickle.dump(target_characters, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
pip install sent2vec

Collecting sent2vec
  Downloading sent2vec-0.2.0-py3-none-any.whl (5.2 kB)
Installing collected packages: sent2vec
Successfully installed sent2vec-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
from scipy import spatial
from sent2vec.vectorizer import Vectorizer
import time


start = time.process_time()
input_vectorizer = Vectorizer()
print("input_texts length - " + str(len(input_texts)))
gc.collect()
input_vectorizer.bert(input_texts)
input_vectors_bert = input_vectorizer.vectors
print("input_vectors_bert length - " + str(len(input_vectors_bert)))
print("input texts done in", time.process_time() - start, "seconds")

start = time.process_time()
target_vectorizer = Vectorizer()
print("target_texts length - " + str(len(target_texts)))
gc.collect()
target_vectorizer.bert(target_texts)
target_vectors_bert = target_vectorizer.vectors
print("target_vectors_bert length - " + str(len(target_vectors_bert)))
print("target texts done", time.process_time() - start, "seconds")

input_texts length - 5001


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

input_vectors_bert length - 5001
input texts done in 166.571231895 seconds
target_texts length - 5001
target_vectors_bert length - 5001
target texts done 166.63865307799998 seconds


In [13]:
def find_distance(index, vectors_bert):
    dist_list = list()
    for sentence_vector in vectors_bert:
        dist_list.append((spatial.distance.cosine(vectors_bert[index], sentence_vector)))
        
    dist_list = np.array(dist_list)
    return dist_list

In [14]:
query_index = 28
print(input_texts[query_index], "->", target_texts[query_index])
print()

nearest_k = 10
input_distance_list = find_distance(query_index, input_vectors_bert)
target_distance_list = find_distance(query_index, target_vectors_bert)

for similar_index in input_distance_list.argsort()[:nearest_k]:
    if query_index!=similar_index:
        print(input_texts[similar_index], "->", target_texts[similar_index])
        
answers = set()
for similar_index in target_distance_list.argsort():
    
    if similar_index < len(target_texts):
    
        if target_texts[similar_index].strip() != target_texts[query_index].strip():
            answers.add(target_texts[similar_index].strip())

        if len(answers) > 10:
            break

print()
print("Similar to query answer")
print(answers)

yeah it seems. it is owned by warner bros -> yeah they seem to have their hand on everything

true. did you know they own imdb -> yes since 1998 they just seem to be everywhere
nice. what's your favorite horror film -> insidious the last key this film is my favorite.
yeah for sure. have you seen the original version -> i think what i saw was the original.
that's really cool. do you like entertainment -> sure. do you play video games
yeah crazy. do you watch the nba -> i do. do you know who reggie miller is
yeah. do you remember the website neopets -> i do not whats that
nice. did you ever see the movie transformers -> some. which one
oh wow. you love the gory stuff -> sure
yeah. do you know who owns the site -> yes it is actually warner bros

Similar to query answer
{"i didn't know that what is it about", 'yeah i guess that makes sense quite a bit', 'yes since 1998 they just seem to be everywhere', 'yeah i know but they keep topping the charts', 'yes they have it in their code of condu

In [15]:
"""input_similarity_matrix = np.empty((len(input_vectors_bert), len(input_vectors_bert)))
for i in tqdm(range(len(input_vectors_bert))):
    input_distance_list = find_distance(i, input_vectors_bert)
    for j in range(len(input_vectors_bert)):
        if i==j:
            input_similarity_matrix[i, j] = 0
        elif input_similarity_matrix[j, i]:
            input_similarity_matrix[i, j] = input_similarity_matrix[j, i]
        else:
            input_similarity_matrix[i, j] = input_distance_list[j]
"""

'input_similarity_matrix = np.empty((len(input_vectors_bert), len(input_vectors_bert)))\nfor i in tqdm(range(len(input_vectors_bert))):\n    input_distance_list = find_distance(i, input_vectors_bert)\n    for j in range(len(input_vectors_bert)):\n        if i==j:\n            input_similarity_matrix[i, j] = 0\n        elif input_similarity_matrix[j, i]:\n            input_similarity_matrix[i, j] = input_similarity_matrix[j, i]\n        else:\n            input_similarity_matrix[i, j] = input_distance_list[j]\n'

In [16]:
# https://docs.python.org/3/library/multiprocessing.html#sharing-state-between-processes
from multiprocessing import Process, Value, Array
import psutil
import multiprocessing as mp

# https://stackoverflow.com/questions/9754034/can-i-create-a-shared-multiarray-or-lists-of-lists-object-in-python-for-multipro
# https://stackoverflow.com/questions/7894791/use-numpy-array-in-shared-memory-for-multiprocessing/7908612#7908612

from multiprocessing import Process, Value, Array
import ctypes as c
import math

In [17]:
# https://stackoverflow.com/questions/5784389/using-100-of-all-cores-with-the-multiprocessing-module/35371568
def fill_similarity_matrix(batch_num, total_batches, similarity_matrix, vectors_bert):
    
    start_index = batch_num * math.floor(len(similarity_matrix)/total_batches)    
    if batch_num + 1 == total_batches:
        end_index = len(similarity_matrix)
    else:
        end_index = start_index + math.floor(len(similarity_matrix)/total_batches)
        
    desc = "Batch " + str(batch_num + 1) + " of " + str(total_batches) + ". Starting at " + str(start_index) + " End at " + str(end_index)
    #print(desc)
    for i in tqdm(range(start_index, end_index), desc=desc):
        distance_list = find_distance(i, vectors_bert)
        for j in range(len(vectors_bert)):
            if i==j:
                similarity_matrix[i][j] = 0
            elif similarity_matrix[j][i]:
                similarity_matrix[i][j] = similarity_matrix[j][i]
            else:
                similarity_matrix[i][j] = distance_list[j]


def split_fill_similarity_matrix(name, vectors_bert):          
    procs = list()
    n_cpus = psutil.cpu_count()

    shared_similarity_matrix = [Array(c.c_double, len(vectors_bert)) for _ in range(len(vectors_bert))]

    for cpu in range(n_cpus):
        p = Process(target=fill_similarity_matrix, args=(cpu, n_cpus, shared_similarity_matrix, vectors_bert))
        p.start()
        procs.append(p)

    for p in procs:
        p.join()
        print('processes joined')

    with open(name + '.txt', 'w') as f:
        for i in tqdm(range(len(shared_similarity_matrix)), desc="writing to file"):
            f.write(str(list(shared_similarity_matrix[i])))
            if i != len(shared_similarity_matrix) - 1:
                f.write("\n")
                
        print("done writing to file")

In [18]:
split_fill_similarity_matrix('input_similarity_matrix', input_vectors_bert)

Batch 3 of 4. Starting at 2500 End at 3750: 100%|██████████| 1250/1250 [16:40<00:00,  1.25it/s]
Batch 4 of 4. Starting at 3750 End at 5001: 100%|██████████| 1251/1251 [16:41<00:00,  1.25it/s]
Batch 1 of 4. Starting at 0 End at 1250: 100%|██████████| 1250/1250 [16:41<00:00,  1.25it/s]


processes joined


Batch 2 of 4. Starting at 1250 End at 2500: 100%|██████████| 1250/1250 [16:42<00:00,  1.25it/s]
writing to file:   0%|          | 8/5001 [00:00<01:04, 76.84it/s]

processes joined
processes joined
processes joined


writing to file: 100%|██████████| 5001/5001 [01:02<00:00, 80.46it/s]


done writing to file


In [19]:
import ast

# https://stackoverflow.com/questions/1894269/how-to-convert-string-representation-of-list-to-a-list
def convert_file_to_matrix(file_path):
    matrix_file = open(file_path, "r")
    matrix_file = matrix_file.read()
    matrix_file = matrix_file.split("\n")

    matrix = list()
    for row in matrix_file:
        matrix.append(ast.literal_eval(row))

    return matrix

In [20]:
#print(convert_file_to_matrix("./input_similarity_matrix.txt"))

In [21]:
split_fill_similarity_matrix('target_similarity_matrix', target_vectors_bert)

Batch 4 of 4. Starting at 3750 End at 5001: 100%|██████████| 1251/1251 [17:16<00:00,  1.21it/s]
Batch 3 of 4. Starting at 2500 End at 3750: 100%|██████████| 1250/1250 [17:18<00:00,  1.20it/s]
Batch 2 of 4. Starting at 1250 End at 2500: 100%|██████████| 1250/1250 [17:23<00:00,  1.20it/s]
Batch 1 of 4. Starting at 0 End at 1250: 100%|██████████| 1250/1250 [17:25<00:00,  1.20it/s]
writing to file:   0%|          | 9/5001 [00:00<00:57, 86.60it/s]

processes joined
processes joined
processes joined
processes joined


writing to file: 100%|██████████| 5001/5001 [01:01<00:00, 81.87it/s]


done writing to file
