In [1]:
import re
import os
import random
import pandas as pd
from collections import Counter

In [2]:
base_dir = 'data/cornel_movie'
lines_dir = os.path.join(base_dir, 'movie_lines.txt')
convs_dir = os.path.join(base_dir, 'movie_conversations.txt')

In [3]:
lines = open(lines_dir, encoding='utf-8', errors='ignore').read().strip().split('\n')
conv_lines = open(convs_dir, encoding='utf-8', errors='ignore').read().strip().split('\n')

In [4]:
lines[:10]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.",
 'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow',
 "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.",
 'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No',
 'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?']

In [5]:
conv_lines[:10]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']"]

In [6]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

In [7]:
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = clean_text(_line[4]).split()

In [24]:
len(id2line)

304713

In [8]:
convs = []
for line in conv_lines:
    convs.append(line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","").split(","))

In [9]:
lengths = list(map(len, id2line.values()))
lengths = pd.DataFrame(lengths, columns=['counts'])
lengths.describe()

Unnamed: 0,counts
count,304713.0
mean,10.986466
std,12.405579
min,0.0
25%,4.0
50%,7.0
75%,14.0
max,555.0


In [23]:
min_length = 2
max_length = 20

clean_questions = []
clean_answers = []
total_cnt = 0
for conv in convs:
    for i in range(len(conv) - 1):
        cur_q = id2line[conv[i]]
        cur_a = id2line[conv[i+1]]
        total_cnt += 1
        if min_length <= len(cur_q) <= max_length and min_length <= len(cur_a) <= max_length:
            clean_questions.append(cur_q)
            clean_answers.append(cur_a)

In [22]:
for i in range(5):
    print(clean_questions[i])
    print(clean_answers[i])
    print()

['well', 'i', 'thought', 'we', 'would', 'start', 'with', 'pronunciation', 'if', 'that', 'is', 'okay', 'with', 'you']
['not', 'the', 'hacking', 'and', 'gagging', 'and', 'spitting', 'part', 'please']

['not', 'the', 'hacking', 'and', 'gagging', 'and', 'spitting', 'part', 'please']
['okay', 'then', 'how', 'about', 'we', 'try', 'out', 'some', 'french', 'cuisine', 'saturday', 'night']

['you', 'are', 'asking', 'me', 'out', 'that', 'is', 'so', 'cute', 'that', 'is', 'your', 'name', 'again']
['forget', 'it']

['gosh', 'if', 'only', 'we', 'could', 'find', 'kat', 'a', 'boyfriend']
['let', 'me', 'see', 'what', 'i', 'can', 'do']

["c'esc", 'ma', 'tete', 'this', 'is', 'my', 'head']
['right', 'see', 'you', 'are', 'ready', 'for', 'the', 'quiz']



In [11]:
print(len(clean_questions))
print(len(clean_answers))

138335
138335


In [12]:
# Compare the number of lines we will use with the total number of lines.
print("# of questions:", len(clean_questions))
print("# of answers:", len(clean_answers))
print("% of data used: {}%".format(round(len(clean_questions)/total_cnt,3)*100))

# of questions: 138335
# of answers: 138335
% of data used: 62.4%


In [13]:
vocabs = {}
data = []
for line in clean_questions + clean_answers:
    data.extend(line)
counter = Counter(data)
len(counter)

45618

In [14]:
count_pairs = counter.most_common()
print(count_pairs[5000:5050])

[('somewhat', 18), ('butler', 18), ('cannon', 18), ('pulls', 18), ('cliff', 18), ('absence', 18), ('violation', 18), ('vanished', 18), ('rep', 18), ('dentist', 18), ('effects', 18), ('reception', 18), ('leather', 18), ('ronald', 18), ('creative', 18), ('believing', 18), ('risky', 18), ('graduated', 18), ('gracious', 18), ('entertain', 18), ('interrogation', 18), ('sandra', 18), ('picnic', 18), ("lady's", 18), ('queens', 18), ('mademoiselle', 18), ('que', 18), ('stink', 18), ('bean', 18), ("mike's", 18), ('fucks', 18), ('constant', 18), ('float', 18), ('adopted', 18), ('possessed', 18), ('budget', 18), ('legend', 18), ('farther', 18), ('journal', 18), ('inner', 18), ('excitement', 18), ('crippled', 18), ('memphis', 18), ('sack', 18), ('resist', 18), ("how'm", 18), ('fence', 18), ('chimera', 18), ('natalie', 18), ('gravy', 18)]


In [15]:
print(count_pairs[8000:8050])

[('mo', 10), ('plaza', 10), ('scrape', 10), ('sew', 10), ('udo', 10), ("'way", 10), ('peed', 10), ('meurice', 10), ('skeptical', 10), ('utheyu', 10), ('braslow', 10), ('carolyn', 10), ('corky', 10), ('surgical', 10), ('scooter', 10), ('layton', 10), ('length', 10), ('swicker', 10), ('medium', 10), ('millie', 10), ('braces', 10), ('spooky', 10), ('relieve', 10), ('80', 10), ('jeanlouis', 10), ("brynner's", 10), ('quaint', 10), ('sessions', 10), ('harris', 10), ('twofifty', 10), ('committing', 10), ("number's", 10), ('peking', 10), ('jealousy', 10), ('homeless', 10), ('consumer', 10), ('meeks', 10), ('nuwanda', 10), ('doucet', 10), ('lynn', 10), ('lingerie', 10), ('blender', 10), ('depinto', 10), ('stuffed', 10), ('excessive', 10), ('skywalker', 10), ('vegetarian', 10), ('miniature', 10), ('barber', 10), ('satch', 10)]


In [16]:
vocabs = ['<SOS>', '<EOS>', '<UNK>']
count_pairs = counter.most_common(8000-len(vocabs))
words, _ = list(zip(*count_pairs))
vocabs += words
len(vocabs)

8000

In [17]:
word_to_id = dict(zip(vocabs, range(len(vocabs))))

In [18]:
question_to_id = []
for question in clean_questions:
    question_to_id.append([word_to_id[x] if x in word_to_id else word_to_id['<UNK>'] for x in question])
    
answer_to_id = []
for answer in clean_answers:
    answer = ['<SOS>'] + answer + ['<EOS>']
    answer_to_id.append([word_to_id[x] if x in word_to_id else word_to_id['<UNK>'] for x in answer])

In [20]:
for i in range(5):
    print(question_to_id[i])
    print(answer_to_id[i])
    print()

[53, 4, 128, 21, 35, 306, 38, 2, 54, 12, 6, 100, 38, 3]
[0, 7, 5, 2, 18, 2, 18, 6646, 396, 140, 1]

[7, 5, 2, 18, 2, 18, 6646, 396, 140]
[0, 100, 88, 43, 37, 21, 230, 49, 82, 1036, 2, 1521, 150, 1]

[3, 13, 474, 16, 49, 12, 6, 45, 985, 12, 6, 28, 165, 164]
[0, 294, 9, 1]

[1741, 54, 127, 21, 77, 148, 3869, 10, 1022]
[0, 123, 16, 68, 14, 4, 52, 11, 1]

[2, 889, 2, 24, 6, 30, 355]
[0, 57, 68, 3, 13, 354, 27, 5, 2, 1]



In [19]:
total_words = 0
unk_words = 0
for line in question_to_id + answer_to_id:
    for word in line:
        total_words += 1
        if word == word_to_id['<UNK>']:
            unk_words += 1
            
unk_ratio = round(unk_words/total_words,4)*100
    
print("Total number of words:", total_words)
print("Number of times <UNK> is used:", unk_words)
print("Percent of words that are <UNK>: {}%".format(round(unk_ratio,3)))

Total number of words: 2472868
Number of times <UNK> is used: 93386
Percent of words that are <UNK>: 3.78%
