In [1]:
import pandas as pd
import os
import re
import unicodedata

In [2]:
# Directory name of the corpus under ../data, and the label later given to the vocabulary.
corpus = "movie_corpus"
corpus_name = "movie_corpus"
# Path to the pre-formatted corpus file; each line holds tab-separated sentences
# (see the sample output of this cell).
datafile = os.path.join("..", "data", corpus,  "formatted_movie_lines.txt")

# Preview the first ten raw lines. The file is opened in binary mode so that
# str(line) prints the bytes repr (b"..."), which keeps \t and \n visible.
with open(datafile, "rb") as file:
    lines = file.readlines()
    for line in lines[:10]:
        print(str(line), "\n")
    


b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n" 

b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n" 

b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n" 

b"You're asking me out.  That's so cute. What's your name again?\tForget it.\n" 

b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\n" 

b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\n" 

b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\n" 

b'Why?\tUnsolved myster

In [3]:
# Build the vocabulary

In [4]:
# Indices reserved for the three special tokens that every vocabulary starts with.
PAD_token = 0
SOS_token = 1
EOS_token = 2


class Vocabulary():
    """Word <-> index mapping with per-word occurrence counts.

    Indices 0-2 are reserved for the special tokens "PAD", "SOS" and "EOS";
    regular words receive consecutive indices starting at 3, in the order
    they are first added.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        # Set to True by the first call to trim(); later calls are no-ops.
        self.trimmed = False
        self._reset()

    def _reset(self):
        """Reset all mappings so that only the special tokens remain."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {
            PAD_token: "PAD",
            SOS_token: "SOS",
            EOS_token: "EOS",
        }
        # Counts the three special tokens already present in index2word.
        self.num_words = 3

    def addWord(self, w):
        """Register one occurrence of ``w``, assigning it a new index if unseen."""
        if w in self.word2index:
            self.word2count[w] += 1
            return
        self.word2index[w] = self.num_words
        self.word2count[w] = 1
        self.index2word[self.num_words] = w
        self.num_words += 1

    def addSentence(self, sent):
        """Add every word of ``sent`` to the vocabulary.

        The sentence is split on single spaces, so an empty string or
        consecutive spaces produce the empty string as a word.
        """
        for word in sent.split(" "):
            self.addWord(word)

    def trim(self, min_cnt):
        """Drop every word that occurs fewer than ``min_cnt`` times.

        Surviving words are re-indexed consecutively from 3 in their original
        insertion order, and keep their occurrence counts. Only the first call
        has an effect; once ``self.trimmed`` is set, further calls return
        immediately.
        """
        if self.trimmed:
            return
        self.trimmed = True

        kept_counts = {
            word: count
            for word, count in self.word2count.items()
            if count >= min_cnt
        }

        # Rebuild the mappings from scratch so the indices stay contiguous.
        self._reset()
        for word, count in kept_counts.items():
            self.addWord(word)
            # addWord() starts a new word at 1; restore the real frequency so
            # the information used for trimming is not lost.
            self.word2count[word] = count
            
            
            

In [5]:
# Load data


In [6]:
def unicodeToAscii(s):
    """Return ``s`` with its combining marks removed.

    The string is decomposed with Unicode NFD normalization and every
    character in category 'Mn' is dropped. Characters that have no such
    decomposition pass through unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)
def cleanString(s):
    """Normalize a raw sentence into space-separated lowercase tokens.

    Steps, in order: lowercase and strip the input, remove combining marks
    via unicodeToAscii, put a space in front of each '.', '!' or '?',
    replace every run of characters other than letters and '.', '!', '?'
    with a single space, then collapse whitespace and strip the ends.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def readVocs(datafile, corpus_name):
    """Read the corpus file and return an empty vocabulary plus cleaned pairs.

    Args:
        datafile: Path to a UTF-8 text file; each line is split on tabs.
        corpus_name: Name given to the new Vocabulary.

    Returns:
        A tuple ``(voc, pairs)`` where ``voc`` is a fresh Vocabulary (no
        words added yet) and ``pairs`` is a list with one entry per line,
        each entry being the list of that line's tab-separated fields after
        cleanString has been applied.
    """
    # Use a context manager so the file handle is closed promptly instead of
    # being left to the garbage collector.
    with open(datafile, encoding='utf-8') as f:
        lines = f.read().strip().split("\n")

    pairs = [[cleanString(s) for s in l.split("\t")]
             for l in lines]

    voc = Vocabulary(corpus_name)
    return voc, pairs
    


In [7]:
# To keep training manageable, use only short sentences
def filterPair(p, max_length):
    """Return True when both sentences of ``p`` have fewer than ``max_length`` words.

    Words are counted by splitting on single spaces. The second sentence is
    only inspected when the first one passes.
    """
    first_len = len(p[0].split(" "))
    if not first_len < max_length:
        return False
    second_len = len(p[1].split(" "))
    return second_len < max_length

def filterPairs(pairs, max_length):
    """Return the pairs accepted by filterPair, in their original order."""
    short_pairs = []
    for candidate in pairs:
        if filterPair(candidate, max_length):
            short_pairs.append(candidate)
    return short_pairs



In [8]:
def loadData(corpus, corpus_name, datafile, max_length):
    """Read the corpus, keep the short pairs and fill the vocabulary from them.

    Args:
        corpus: Not used inside this function.
        corpus_name: Name passed on to readVocs for the vocabulary.
        datafile: Path of the corpus file passed on to readVocs.
        max_length: Length cutoff passed on to filterPairs.

    Returns:
        A tuple ``(voc, pairs)``: the populated vocabulary and the pairs that
        passed the length filter. Progress counts are printed along the way.
    """
    voc, all_pairs = readVocs(datafile, corpus_name)
    total = len(all_pairs)
    print(f"{total}  Sentence pairs")

    short_pairs = filterPairs(all_pairs, max_length)
    remaining = len(short_pairs)
    print(f"{remaining}  Sentence pairs after trimming")

    # Feed both sides of every surviving pair into the vocabulary.
    for pair in short_pairs:
        for side in (0, 1):
            voc.addSentence(pair[side])

    print(f"{voc.num_words}  distinct words in vocabilary")
    return voc, short_pairs

# Length cutoff handed to loadData: only pairs whose two sentences both have
# fewer than this many space-separated words are kept.
max_length = 10
# Build the vocabulary and the length-filtered sentence pairs from `datafile`.
voc, pairs = loadData(corpus, corpus_name, datafile, max_length)


221282  Sentence pairs
64271  Sentence pairs after trimming
18008  distinct words in vocabilary


In [9]:
# Print the last ten entries of `pairs` as a quick sanity check of the cleaning.
print("Example pairs")
for pair in pairs[-10:]:
    print(pair)

Example pairs
['four', 'three minutes to go !']
['three minutes to go !', 'yes .']
['another fifteen seconds to go .', 'do something ! stall them !']
['yes sir name please ?', 'food !']
['food !', 'do you have a reservation ?']
['do you have a reservation ?', 'food ! !']
['grrrhmmnnnjkjmmmnn !', 'franz ! help ! lunatic !']
['what o clock is it mr noggs ?', 'eleven o clock my lorj']
['stuart ?', 'yes .']
['yes .', 'how quickly can you move your artillery forward ?']


In [10]:
# Remove rare words to reduce model complexity and training time, given that the dataset is small


In [11]:
def removeRareWords(voc, all_pairs, minimum):
    """Trim the vocabulary and drop every pair that uses a removed word.

    Args:
        voc: Vocabulary object; ``voc.trim(minimum)`` is called on it and its
            ``word2index`` mapping is then used for membership checks.
        all_pairs: Sized sequence of pairs; items 0 and 1 of each pair are
            space-separated sentences.
        minimum: Count threshold forwarded to ``voc.trim``.

    Returns:
        A new list with the pairs whose words are all present in
        ``voc.word2index`` after trimming, in their original order. A summary
        line with the before/after counts and the kept percentage is printed.
    """
    voc.trim(minimum)
    # Read the mapping only after trim(), which may replace it.
    known_words = voc.word2index

    def _all_known(sentence):
        # Stops at the first unknown word.
        return all(word in known_words for word in sentence.split(" "))

    # `and` short-circuits, so the second sentence is skipped once the first fails.
    pairs_to_keep = [
        p for p in all_pairs
        if _all_known(p[0]) and _all_known(p[1])
    ]

    total = len(all_pairs)
    # Guard against ZeroDivisionError when there are no input pairs.
    kept_pct = 100*len(pairs_to_keep)/total if total else 0.0
    print(f"Trimmed from {total} pairs to {len(pairs_to_keep)} {kept_pct}")
    return  pairs_to_keep
        
    

In [12]:
# Words that occur fewer than this many times are dropped from the vocabulary.
minimum_count = 3
# Trim the vocabulary, then keep only the pairs whose words all survived.
pairs = removeRareWords(voc, pairs, minimum_count)

Trimmed from 64271 pairs to 53165 82.72004481025657
