In [25]:
import pandas as pd
df=pd.read_csv('/kaggle/input/iwslt2017-en-zh/test.csv')
en_list=df['en'].tolist()
zh_list=df['zh'].tolist()
en_list[:5],zh_list[:5],len(en_list)

(['Several years ago here at TED, Peter Skillman  introduced a design challenge  called the marshmallow challenge.',
  "And the idea's pretty simple:  Teams of four have to build the tallest free-standing structure  out of 20 sticks of spaghetti,  one yard of tape, one yard of string  and a marshmallow.",
  'The marshmallow has to be on top.',
  "And, though it seems really simple, it's actually pretty hard  because it forces people  to collaborate very quickly.",
  'And so, I thought this was an interesting idea,  and I incorporated it into a design workshop.'],
 ['几年前，在TED大会上， Peter Skillman 介绍了一个设计挑战 叫做“棉花糖挑战”',
  '是个非常简单的主意 要求一组四人的团队搭建一个独立的最高建筑 材料是20根意大利面条 一段胶带，一段绳子 一块棉花糖',
  '棉花糖必须放在最上面',
  '这虽然看似简单，其实并不容易 因为它要求人们 迅速地合作',
  '我觉得这是个有趣的主意 我把它放到了设计专题讨论会上'],
 8549)

In [26]:
en_zh=[]
for en,zh in zip(en_list,zh_list):
    new_en=[]
    for word in en.split(' '):
        word=word.replace('.', '').replace(',','').lower()
        if word:
            new_en.append(word)
    en_zh.append((new_en,list(zh)))

print(en_zh[1])

(['and', 'the', "idea's", 'pretty', 'simple:', 'teams', 'of', 'four', 'have', 'to', 'build', 'the', 'tallest', 'free-standing', 'structure', 'out', 'of', '20', 'sticks', 'of', 'spaghetti', 'one', 'yard', 'of', 'tape', 'one', 'yard', 'of', 'string', 'and', 'a', 'marshmallow'], ['是', '个', '非', '常', '简', '单', '的', '主', '意', ' ', '要', '求', '一', '组', '四', '人', '的', '团', '队', '搭', '建', '一', '个', '独', '立', '的', '最', '高', '建', '筑', ' ', '材', '料', '是', '2', '0', '根', '意', '大', '利', '面', '条', ' ', '一', '段', '胶', '带', '，', '一', '段', '绳', '子', ' ', '一', '块', '棉', '花', '糖'])


In [27]:
en_words=set()
zh_words=set()

for s in en_zh:
    for char in s[0]:
        en_words.add(char)
    for char in s[1]:
        if char:
            zh_words.add(char)
len(zh_words),len(en_words)

(2999, 12632)

In [28]:
en_wl=['sos','eos','pad']+list(en_words)
zh_wl=['sos','eso','pad']+list(zh_words)

pad_id=2
en2id={}
zh2id={}

for i,w in enumerate(en_wl):
    en2id[w]=i
for i,w in enumerate(zh_wl):
    zh2id[w]=i

In [29]:
import random
random.shuffle(en_zh)
dl=len(en_zh)
train_set=en_zh[:int(dl*0.8)]
test_set=en_zh[int(dl*0.8):]


In [30]:
import torch
batch_size=16
data_workers=8


In [31]:
class MyDataSet(torch.utils.data.Dataset):
    def __init__(self,examples):
        self.examples=examples
    def __len__(self):
        return len(self.examples)
    def __getitem__(self,index):
        en,zh=self.examples[index]
        ## 获取句子长度
        l1=len(en)
        l2=len(zh)
        
        return en,l1,zh,l2,index

In [32]:
## collate_fn：即用于collate的function，
## 用于对一个batch数据进行规整
def the_collate_fn(batch):
    batch_size=len(batch)
    en_list=[]
    zh_list=[]
    en_maxlen=0
    zh_maxlen=0
    for b in batch:
        en_maxlen=max(en_maxlen,b[1])
        zh_maxlen=max(zh_maxlen,b[3])
        
    
    for x in batch:
        l=[0]
        for i in range(en_maxlen):
            if i<x[1]:
                l.append(en2id[x[0][i]])
            else:
                l.append(pad_id)
        l.append(1)
        en_list.append(l)
    
    for x in batch:
        l=[0]
        for i in range(zh_maxlen):
            if i<x[3]:
                l.append(zh2id[x[2][i]])
            else:
                l.append(pad_id)
        l.append(1)
        zh_list.append(l)     
    indexs =[b[4] for b in batch]
    ## RNN将batch换到第二个维度
    en_tensor=torch.LongTensor(en_list).swapaxes(0,1)
    zh_tensor=torch.LongTensor(zh_list).swapaxes(0,1)   
    return en_tensor,zh_tensor,indexs
    

In [33]:
train_dataset=MyDataSet(train_set)
test_dataset=MyDataSet(test_set)
print(train_dataset.__getitem__(0))
train_data_loader=torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    collate_fn=the_collate_fn
)

test_data_loader=torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    collate_fn=the_collate_fn
)

(['now', 'this', 'is', 'a', 'direct', 'conflict', 'between', 'the', 'experiencing', 'self', 'and', 'the', 'remembering', 'self'], 14, ['现', '在', '这', '是', '经', '验', '自', '我', '和', '记', '忆', '自', '我'], 13, 0)


In [34]:
examples=iter(train_data_loader)
test=next(examples)
test[0].shape,test[1].shape,test[2]

(torch.Size([73, 16]),
 torch.Size([127, 16]),
 [1436,
  5060,
  4527,
  2409,
  3771,
  5536,
  4023,
  6718,
  3133,
  4111,
  24,
  4619,
  3404,
  4363,
  2427,
  4040])

In [35]:
from torch import nn
class Encoder(nn.Module):
    def __init__(self,input_dim,emb_dim,hid_dim,n_layers,dropout):
        super().__init__()
        self.hid_dim=hid_dim
        self.n_layers=n_layers
        self.embedding=nn.Embedding(input_dim,emb_dim)
        self.rnn=nn.LSTM(emb_dim,hid_dim,n_layers,dropout=dropout)
        self.dropout=nn.Dropout(dropout)
    def forward(self,en_sentence):
        ## en_sentence(sentence_len,batch_Size)
        embedded=self.dropout(self.embedding(en_sentence))
        ## embedded (len,batch_size,emb_dim)
        outputs,(hidden,cell)=self.rnn(embedded)
        ## outputs(sentence_len,batch_size,hid_dim) 最顶层RNN的隐状态
        ## hidden(n_layers,batch_size,hid_dim)
        ## cell(n_layers,batch_size,hid_dim)
        return hidden,cell

encoder=Encoder(1000,100,128,2,0.5)
test_input=torch.ones((15,8),dtype=torch.long)
hidden,cell=encoder(test_input)
print(hidden.shape)
        

torch.Size([2, 8, 128])


In [36]:
class Decoder(nn.Module):
    def __init__(self,output_dim,emb_dim,hid_dim,n_layers,dropout):
        super().__init__()
        self.output_dim=output_dim
        self.hid_dim=hid_dim
        self.n_layers=n_layers
        self.embedding=nn.Embedding(output_dim,emb_dim)
        
        self.rnn=nn.LSTM(emb_dim,hid_dim,n_layers,dropout=dropout)
        self.fc_out=nn.Linear(hid_dim,output_dim)
        self.dropout=nn.Dropout(dropout)
    def forward(self,input,hidden,cell):
        ## input(batch,) 单个时间步数据
        input=input.unsqueeze(0)
        embedded=self.dropout(self.embedding(input))
        output,(hidden,cell)=self.rnn(embedded,(hidden,cell))
        ## output(batch_size,output_dim)
        ## hidden(n_layers,batch_size,hidden_dim)
        
        prediction=self.fc_out(output.squeeze(0))
        
        return prediction,hidden,cell
    
decoder=Decoder(1000,100,128,2,0.5)
test_input=torch.ones(8,dtype=torch.long)
prediction,hidden,cell=decoder(test_input,hidden,cell)
print(prediction.shape,hidden.shape)

torch.Size([8, 1000]) torch.Size([2, 8, 128])


In [38]:
class Seq2Seq(nn.Module):
    def __init__(self,
                 input_word_count,output_word_count,encode_dim,
                 decode_dim,hidden_dim,n_layers,encode_dropout,decode_dropout,device
                ):
        super().__init__()
        self.encoder=Encoder(input_word_count,encode_dim,hidden_dim,n_layers,encode_dropout)
        self.decoder=Decoder(output_word_count,decode_dim,hidden_dim,n_layers,decode_dropout)
        self.device=device
        
    def forward(self,en,zh,teacher_forcing_ratio=1):
        if zh is not None:
            batch_size=zh.shape[1]
            zh_len=zh.shape[0]#时间步
            zh_vocab_size=self.decoder.output_dim
            ## 存放输出结果
            outputs=torch.zeros(zh_len,batch_size,zh_vocab_size).to(self.device)
            
            ## encoder输出作为decoder第一个隐藏层输入
            hidden,cell=self.encoder(en)
            input=zh[0,:]
            for t in range(1,zh_len):
                output,hidden,cell=self.decoder(input,hidden,cell)
                outputs[t]=output
                ## 以一定概率使用techer_force
                teacher_force=random.random()<teacher_forcing_ratio
                top1=output.argmax(1)
                input=zh[t] if teacher_force else top1
        else:
            batch_size=en.shape[1]
            zh_vocab_size=self.decoder.output_dim
            
            l=[]
            hidden,cell=self.encoder(en)
            input=en[0,:]
            while True:## 直到输出结果为结束符才停止
                output,hidden,cell=self.decoder(input,hidden,cell)
                l.append(output)
                top1=output.argmax(1)
                if top1==1 or len(l)>50:
                    return l
                input=top1
                
        return outputs
import numpy as np
test_input=torch.LongTensor(np.random.randint(1,1000,size=(15,1)))
seq2seq=Seq2Seq(1000,1000,100,100,128,128,0.5,0.5,test_input.device)
output_list=seq2seq(test_input,None)
len(output_list),output_list[0].shape

(51, torch.Size([1, 1000]))

In [64]:
source_word_count=len(en_wl)
target_word_count=len(zh_wl)
encode_dim=50
decode_dim=50
hidden_dim=64
n_layers=1

encode_dropout=0.5
decode_dropout=0.5
device=torch.device('cuda:0' if torch.cuda.is_available else 'cpu')
model=Seq2Seq(
    source_word_count,target_word_count,encode_dim,decode_dim,hidden_dim,
    n_layers,encode_dropout,decode_dropout,device
).to(device)
def init_weight(m):
    if isinstance(m,nn.Linear):
            nn.init.xavier_normal_(m.weight)
            nn.init.constant_(m.bias,0)
    elif isinstance(m,nn.BatchNorm1d):
        nn.init.constant_(m.weight,1)
        nn.init.constant_(m.bias,0)
model.apply(init_weight)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(12635, 50)
    (rnn): LSTM(50, 64, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(3002, 50)
    (rnn): LSTM(50, 64, dropout=0.5)
    (fc_out): Linear(in_features=64, out_features=3002, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [65]:
from torch import optim
optimizer=optim.Adam(model.parameters(),lr=5e-4)
criterion=nn.CrossEntropyLoss(ignore_index=pad_id)


In [50]:
from tqdm import tqdm
def train(model,data_loader,optimizer,criterion,clip):
    model.train()
    epoch_loss=0
    for i,batch in enumerate(tqdm(data_loader)):
        source=batch[0].to(device)
        target=batch[1].to(device)
        optimizer.zero_grad()
        output=model(source,target)
        
        output_dim=output.shape[-1]#词表大小
        output=output[1:].view(-1,output_dim)
        target=target[1:].reshape(-1)
        loss=criterion(output,target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),clip)
        optimizer.step()
        epoch_loss+=loss.item()
    return epoch_loss/len(data_loader)

def evaluate(model,data_loader,criterion):
    model.eval()
    epoch_loss=0
    for i,batch in enumerate(data_loader):
        source=batch[0].to(device)
        target=batch[1].to(device)
        with torch.no_grad():
            output=model(source,target,0)#不用Teacher Forcing
        output_dim=output.shape[-1]
        output=output[1:].view(-1,output_dim)
        target=target[1:].reshape(-1)
        loss=criterion(output,target)
        epoch_loss+=loss.item()
    return epoch_loss/len(data_loader)

In [66]:
import math
import time

epochs=10
clip=1
best_valid_loss=float('inf')
for epoch in range(epochs):
    train_loss=train(model,train_data_loader,optimizer,criterion,clip)
    valid_loss=evaluate(model,test_data_loader,criterion)
    if valid_loss<best_valid_loss:
        best_valid_loss=valid_loss
        torch.save(model.state_dict(),'tutl-model.pt')
    print(f'Train loss: {train_loss:.3f} {math.exp(train_loss):7.3f}',)
    print(f'Valid loss: {valid_loss:.3f} {math.exp(valid_loss):7.3f}',)
    sample=translate('what is your name')
    print(f'epoch: {epoch} sample:{sample}')

100%|██████████| 428/428 [00:39<00:00, 10.71it/s]


Train loss: 6.224 504.899
Valid loss: 6.041 420.466
epoch: 0 sample: 的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的


100%|██████████| 428/428 [00:40<00:00, 10.61it/s]


Train loss: 5.820 336.815
Valid loss: 6.098 445.149
epoch: 1 sample:我们的的 我的 我的的 我的 我们的的 我的的 我的的 我的的 我的的 我的的 我的的 我的的 我的的


100%|██████████| 428/428 [00:39<00:00, 10.78it/s]


Train loss: 5.675 291.482
Valid loss: 6.185 485.522
epoch: 2 sample:我们的的。eso


100%|██████████| 428/428 [00:40<00:00, 10.59it/s]


Train loss: 5.551 257.374
Valid loss: 6.306 547.798
epoch: 3 sample:我们是我们是我们的人的。eso


100%|██████████| 428/428 [00:40<00:00, 10.67it/s]


Train loss: 5.453 233.391
Valid loss: 6.378 588.497
epoch: 4 sample:我们是我们是我们的人的。eso


100%|██████████| 428/428 [00:40<00:00, 10.64it/s]


Train loss: 5.371 215.096
Valid loss: 6.453 634.785
epoch: 5 sample:我们是我们的人的人。eso


100%|██████████| 428/428 [00:40<00:00, 10.64it/s]


Train loss: 5.295 199.420
Valid loss: 6.422 615.383
epoch: 6 sample:我们的人们是我们的人的。eso


 13%|█▎        | 55/428 [00:05<00:40,  9.23it/s]


KeyboardInterrupt: 

In [67]:
def translate(en_sentence):
    words=[]
    for word in en_sentence.strip().split(' '):
        words.append(word.replace('.','').replace(',','').lower())
    ids=[0]
    for w in words:
        ids.append(en2id[w])
    ids.append(1)
    source=torch.tensor(ids).unsqueeze(0).swapaxes(0,1).to(device)
    model.eval()
    with torch.no_grad():
        output=model(source,None,0)
    target=[]
    for x in output:
        target.append(zh_wl[x.argmax(1).cpu().item()])
    return ''.join(target)
    

In [68]:
model=Seq2Seq(
    source_word_count,target_word_count,encode_dim,decode_dim,hidden_dim,
    n_layers,encode_dropout,decode_dropout,device
).to(device)
model.load_state_dict(torch.load('tutl-model.pt'))
result=translate('what is your name')
result

' 的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的'

In [69]:
result=translate('Sequence to Sequence Learning with Neural Networks')
result

'                                                   '