In [3]:
import numpy as np
import pandas as pd
import re
import string

In [4]:
# Read lines from the dataset, as the seperators are special characters hence we do not use pandas
lines = open('movie_lines.txt', encoding ='utf-8', errors='ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding ='utf-8', errors='ignore').read().split('\n')
conversations[:10]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']"]

In [5]:
# Creating a dict that maps each line to id
id2line = {line.strip().split(' +++$+++ ')[0]: line.strip().split(' +++$+++ ')[-1] for line in lines }
id2line

{'L1045': 'They do not!',
 'L1044': 'They do to!',
 'L985': 'I hope so.',
 'L984': 'She okay?',
 'L925': "Let's go.",
 'L924': 'Wow',
 'L872': "Okay -- you're gonna need to learn how to lie.",
 'L871': 'No',
 'L870': 'I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869': 'Like my fear of wearing pastels?',
 'L868': 'The "real you".',
 'L867': 'What good stuff?',
 'L866': "I figured you'd get to the good stuff eventually.",
 'L865': 'Thank God!  If I had to hear one more story about your coiffure...',
 'L864': "Me.  This endless ...blonde babble. I'm like, boring myself.",
 'L863': 'What crap?',
 'L862': 'do you listen to this crap?',
 'L861': 'No...',
 'L860': 'Then Guillermo says, "If you go any lighter, you\'re gonna look like an extra on 90210."',
 'L699': 'You always been this selfish?',
 'L698': 'But',
 'L697': "Then that's all you had to say.",
 'L696': 'Well, no...',
 'L695': "You never wanted to go out with 'me, did y

In [6]:
# Creating a list of conversations containgin only line Ids
conversation_ids = [conversation.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","").split(",") for conversation in conversations[:-1]]
conversation_ids[:10]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366']]

In [7]:
#seperate questions and answers
questions = []
answers = []
for conversation in conversation_ids:
    for tuple in list(zip(conversation, conversation[1:])):
        questions.append(id2line[tuple[0]])
        answers.append(id2line[tuple[1]])        

In [8]:
# Simple alternative way of performing the above operation
# questions = []
# answers = []
# for conversation in conversation_ids:
#     for i in range(len(conversation)-1):
#         questions.append(id2line[conversation[i]])
#         answers.append(id2line[conversation[i+1]])

In [9]:
def cleanText(data):
    data = data.lower()
    data = re.sub(r"i'm", "i am", data)
    data = re.sub(r"he's", "he is", data)
    data = re.sub(r"she's", "she is", data)
    data = re.sub(r"that's", "that is", data)
    data = re.sub(r"what's", "what is", data)
    data = re.sub(r"\'ll", "will", data)
    data = re.sub(r"\'ve", "have", data)
    data = re.sub(r"\'re", "are", data)
    data = re.sub(r"\'d", "would", data)
    data = re.sub(r"won't", "will not", data)
    data = re.sub(r"can't", "cannot", data)
    data = re.sub(r"-", " ", data) #added seperately as we need to replace this with space for seperation
    data = data.translate(str.maketrans("", "", string.punctuation)) #Remove punctuations
    data = " ".join(data.split()) #Remove multiple white spaces
    return data

In [10]:
questionsCleaned = [cleanText(text) for text in questions] 
answersCleaned = [cleanText(text) for text in answers] 


In [11]:
# Create a Dict which maps each word to its number of occurences
from collections import Counter,OrderedDict
questions2count = dict(Counter(' '.join(questionsCleaned).split()))
answers2count = dict(Counter(' '.join(answersCleaned).split()))


In [20]:
# Filter out least frequent words
threshold = 10
filteredQuestions2Count = dict(filter(lambda elem: elem[1] > threshold, questions2count.items()))
filteredAnswers2Count = dict(filter(lambda elem: elem[1] > threshold, answers2count.items()))

tokens = ['<PAD>','<EOS>', '<OUT>', '<SOS>']

# assign unique id to each word
questionInt2Words = dict(enumerate([*filteredQuestions2Count]+tokens))
answerInt2Words = dict(enumerate([*filteredAnswers2Count]+tokens))

# inverse Dict for answerWords2int
answerWords2int = {v:k for k,v in answerInt2Words.items()}

questionWords2int = {v:k for k,v in questionInt2Words.items()}

In [21]:
# adding end of string token to every answer
answersCleaned = [answer + ' <EOS>' for answer in answersCleaned]

In [24]:
# Translate all quetions and answers to int and replace filtered out words
questionsToInt = []
for question in questionsCleaned:
    ints = []
    for word in question.split():
        if word not in questionWords2int:
            ints.append(questionWords2int['<OUT>'])
        else:
            ints.append(questionWords2int[word])
    questionsToInt.append(ints)
    
answersToInt = []
for answer in answersCleaned:
    ints = []
    for word in answer.split():
        if word not in answerWords2int:
            ints.append(answerWords2int['<OUT>'])
        else:
            ints.append(answerWords2int[word])
    answersToInt.append(ints)

In [25]:
questionsToInt

[[0,
  1,
  2,
  3,
  4,
  8320,
  8320,
  5,
  6,
  8320,
  7,
  8,
  9,
  10,
  8320,
  11,
  12,
  13,
  14,
  15,
  8320,
  16],
 [17, 18, 19, 20, 21, 22, 8320, 23, 24, 25, 26, 22, 27],
 [28, 15, 8320, 5, 8320, 5, 8320, 29, 30],
 [31, 32, 33, 34, 24, 25, 35, 36, 37, 25, 38, 39, 16],
 [40, 40, 41, 42, 43, 1, 44, 45, 46, 47, 48],
 [49],
 [15,
  50,
  25,
  49,
  18,
  51,
  52,
  15,
  53,
  54,
  46,
  55,
  8320,
  56,
  54,
  57,
  42,
  58,
  18,
  59,
  60,
  61,
  62,
  63],
 [64],
 [8320,
  65,
  62,
  66,
  67,
  68,
  69,
  70,
  71,
  62,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  62,
  80,
  81,
  54,
  76,
  82,
  83],
 [84, 23, 85, 1, 86, 87, 88, 46, 89],
 [8320, 90, 8320, 3, 25, 42, 91],
 [92, 93, 31, 94, 95, 15, 8320],
 [18,
  96,
  97,
  67,
  98,
  99,
  67,
  100,
  24,
  101,
  18,
  97,
  67,
  98,
  102,
  103,
  79,
  104,
  15,
  105,
  106,
  7,
  99,
  107,
  63,
  108,
  109,
  110,
  79,
  111,
  18,
  45,
  112,
  113,
  42,
  114,
  115,
  67,
  11