In [1]:
import os
import torch
import matplotlib.pyplot as plt
%matplotlib auto

Using matplotlib backend: Qt5Agg


In [2]:
def read_data_nmt():
    data_dir=r'F:\study\ml\DataSet\fra-eng'
    with open(os.path.join(data_dir,'fra.txt'),'r',encoding='utf-8') as f:
        return f.read()

In [3]:
raw_text=read_data_nmt()
print((raw_text[:75]))

Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !



In [4]:
def preprocess_nmt(text):
    def no_space(char,prev_char):
        return char in set(',.!?') and prev_char !=' '
    
    text=text.replace('\u202f', ' ').replace('\xa0',' ').lower()
    out=[' '+char if i>0 and no_space(char,text[i-1]) else char for i,char in enumerate(text) ]
    return ''.join(out)

In [5]:
text=preprocess_nmt(raw_text)
print(text[:80])

go .	va !
hi .	salut !
run !	cours !
run !	courez !
who ?	qui ?
wow !	ça alors !


In [6]:
def tokenize_nmt(text,num_examples=None):
    source,target=[],[]
    for i ,line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts=line.split('\t')
        if len(parts)==2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source,target 
    

        

In [7]:
source,target=tokenize_nmt(text)

In [8]:
source[:6],target[:6]

([['go', '.'],
  ['hi', '.'],
  ['run', '!'],
  ['run', '!'],
  ['who', '?'],
  ['wow', '!']],
 [['va', '!'],
  ['salut', '!'],
  ['cours', '!'],
  ['courez', '!'],
  ['qui', '?'],
  ['ça', 'alors', '!']])

In [13]:
def show_list_len_pair_hist(legend,xlabel,ylabel,xlist,ylist):
    plt.figure(figsize=(6,8))
    _,_,patches=plt.hist([[len(l) for l in xlist],[len(l) for l in  ylist]])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    for patch0,patch1 in zip(patches[0].patches,patches[1].patches):
        patch0.set_hatch('\\')
        patch1.set_hatch('/')
    plt.legend(legend)
    
    

In [14]:
show_list_len_pair_hist(['source','target'],'# tokens per sequcen','count',source,target)

In [15]:
sorted(source,reverse=True)[:2]

[['zurich',
  'is',
  'considered',
  'to',
  'be',
  'a',
  'major',
  'financial',
  'hub',
  '.'],
 ['zoos', 'are', 'like', 'prisons', 'for', 'animals', '.']]

In [16]:
a=[]
for i in source:
    a.append(len(i))

In [17]:
sorted(a,reverse=True)[:2]

[51, 48]

In [1]:
class Vocab:
    def __init__(self,tokens=None,min_freq=0,reserved_tokens=None):
        if tokens is None:
            tokens=[]
        if reserved_tokens is None:
            reserved_tokens=[]
        counter=count_corpus(tokens)
        self._token_freqs=sorted(counter.items(),key=lambda x:x[1],reverse=True)
        self.idx_to_token=['<unk>']+reserved_tokens
        self.token_to_idx={token:idx for idx,token in enumerate(self.idx_to_token)}
        self.idx_to_token,self.token_to_idx=[],dict()
        for token,freq in self._token_freqs:
            if freq<min_freq:
                break;
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token]=len(self.idx_to_token)-1
    
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self,tokens):
        if not isinstance(tokens,(list,tuple)):
            return self.token_to_idx.get(tokens,self.unk)
        return [self.__getitem__(token) for token in tokens]
    
    def to_tokens(self,indices):
        if not isinstance(indices,(list,tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
    
    @property
    def unk(self):
        return 0;
    
    @property
    def token_freqs(self):
        return self._token_freqs;
    

In [2]:
import collections

In [3]:
def count_corpus(tokens):
    if len(tokens)==0 or isinstance(tokens[0],list):
        tokens=[token for line in tokens for token in line]
    return collections.Counter(tokens)

In [21]:
src_vocab=Vocab(source,min_freq=2,reserved_tokens=['<pad>', '<bos>', '<eos>'])

In [22]:
len(src_vocab)

10008

In [23]:
def truncate_pad(line,num_steps,padding_token):
    if len(line)>num_steps:
        return line[:num_steps]
    return line + [padding_token]*(num_steps-len(line))


In [24]:
truncate_pad(src_vocab[source[0]],10,src_vocab['<pad>'])

[43, 0, 0, 0, 0, 0, 0, 0, 0, 0]