In [None]:
!pip install zhon



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time
import string
import zhon

In [None]:
from zhon.hanzi import punctuation

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise keep everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Scratch cell: quick shape experiments with transpose, Embedding, GRU and unsqueeze.
# `c` is reused by the Encoder / Attention / Decoder smoke-test cells further down.
a = torch.arange(24).reshape(2, 3, 4)
b = a.permute(1, 0, 2)
c = torch.arange(12).reshape(3, 4)
embf = nn.Embedding(num_embeddings=12, embedding_dim=10)
d = embf(c)
d1 = d.permute(1, 0, 2)

# The GRU is not batch_first, so d is consumed as (seq_len=3, batch=4, features=10).
gru = nn.GRU(input_size=10, hidden_size=7, bidirectional=True)
x, y = gru(d)
print(x.shape, y.shape)

# Indexing with None inserts a size-1 axis at that position, like unsqueeze does.
x = c[None]
y = c[:, None]
z = c[:, :, None]
print(x.shape, y.shape, z.shape)

torch.Size([3, 4, 14]) torch.Size([2, 4, 7])
torch.Size([1, 3, 4]) torch.Size([3, 1, 4]) torch.Size([3, 4, 1])


In [None]:
# Load the three corpus splits as lists of raw lines (newline still attached).
# datapre() treats even-indexed lines as Chinese and odd-indexed lines as English.
# The encoding is pinned so that reading the Chinese text does not depend on the
# platform's default locale (assumes the corpus files are UTF-8 - TODO confirm).
with open('train.txt', 'r', encoding='utf-8') as file:
  train = file.readlines()
with open('test.txt', 'r', encoding='utf-8') as file:
  test = file.readlines()
with open('dev.txt', 'r', encoding='utf-8') as file:
  dev = file.readlines()

In [None]:


# English-side clean-up, step 1: detach punctuation that is glued to words.
# (Step 2, lower-casing, is done by datapre.)
def splitw(d):
  """Split every token of a sentence into words and single punctuation marks.

  Args:
    d: list of whitespace-separated tokens of one sentence.

  Returns:
    A new list in which leading and trailing ASCII punctuation characters of each
    token are separate one-character items, in their original left-to-right order.
    A token that is a contiguous substring of string.punctuation (any single mark,
    but also e.g. '()'), or that equals '...' or '..', is collapsed to its first
    character. Punctuation inside a word (e.g. "don't") is left untouched.
  """
  res = []
  for w in d:
    if not w:
      # '' is a substring of every string, so it would reach w[0] below and crash.
      continue
    if w in string.punctuation or w == '...' or w == '..':
      res.append(w[0])
      continue
    trailing = []
    while w and w[0] in string.punctuation:
      res.append(w[0])
      w = w[1:]
    while w and w[-1] in string.punctuation:
      trailing.append(w[-1])
      w = w[:-1]
    if w:
      res.append(w)
    # trailing was collected right-to-left; reverse it to restore sentence order.
    res.extend(reversed(trailing))

  return res
def datapre(data, maxen=50, rp=True):
  """Turn raw corpus lines into padded, delimited token lists.

  Args:
    data: list of text lines; even-indexed lines are the Chinese sentences and
      odd-indexed lines the matching English sentences, both already separated
      into tokens by whitespace.
    maxen: maximum number of English tokens (after punctuation handling). Pairs
      whose English side is longer are dropped; the rest are padded to this length.
    rp: if True (default) punctuation tokens are removed from both sides;
      otherwise every punctuation token is replaced by '.'.

  Returns:
    (enfinal, zhfinal): two parallel lists of token lists. English rows have
    length maxen + 2; Chinese rows have length (longest kept Chinese sentence) + 2.
    Each row is laid out as ['<SOS>', tokens..., '<padding>'..., '<EOS>'], i.e.
    '<EOS>' comes after the padding.
  """
  # str.split() already discards the line terminator. Slicing off the last
  # character instead would eat a real character from a final line that has no
  # trailing newline.
  tokenised = [line.split() for line in data]
  datazh = tokenised[0::2]
  dataen = tokenised[1::2]
  dataen2 = [[w.lower() for w in splitw(d)] for d in dataen]

  # Remove all punctuation by default; otherwise normalise every mark to '.'.
  if rp:
    dataen3 = [[w for w in d if w not in string.punctuation] for d in dataen2]
    datazh1 = [[w for w in d if w not in punctuation] for d in datazh]
  else:
    dataen3 = [["." if w in string.punctuation else w for w in d] for d in dataen2]
    datazh1 = [["." if w in punctuation else w for w in d] for d in datazh]

  # Drop over-long pairs. The limit must be maxen (not a fixed 50), otherwise a
  # smaller maxen leaves rows longer than the padded width and the result is ragged.
  idx = [i for i, d in enumerate(dataen3) if len(d) <= maxen]
  dataen4 = [dataen3[i] for i in idx]
  datazh2 = [datazh1[i] for i in idx]
  # default=0 keeps an empty selection from raising in max().
  maxzh = max((len(d) for d in datazh2), default=0)
  enpad = [d + ['<padding>'] * (maxen - len(d)) for d in dataen4]
  zhpad = [d + ['<padding>'] * (maxzh - len(d)) for d in datazh2]

  # Wrap every sentence in start / end markers.
  enfinal = [['<SOS>'] + d + ['<EOS>'] for d in enpad]
  zhfinal = [['<SOS>'] + d + ['<EOS>'] for d in zhpad]

  return enfinal, zhfinal


In [None]:
# Convert token sequences to id tensors.
tr_en, tr_zh = datapre(train)
ts_en, ts_zh = datapre(test)
# Both vocabularies are built over train + test, so every test token has an id.
enlist = [tok for sent in tr_en + ts_en for tok in sent]
zhlist = [tok for sent in tr_zh + ts_zh for tok in sent]
# sorted() pins the token -> id assignment. Iterating a set of strings directly
# gives a different order in every interpreter run, which would make ids (and any
# saved weights) unusable across sessions.
en_id2char = sorted(set(enlist))
zh_id2char = sorted(set(zhlist))
en_vocab_size = len(en_id2char)
zh_vocab_size = len(zh_id2char)
en_char2id = {c:i for i,c in enumerate(en_id2char)}
zh_char2id = {c:i for i,c in enumerate(zh_id2char)}

trid_en, tsid_en =[[en_char2id[c] for c in d] for d in tr_en], [[en_char2id[c] for c in d] for d in ts_en]
trid_zh, tsid_zh =[[zh_char2id[c] for c in d] for d in tr_zh], [[zh_char2id[c] for c in d] for d in ts_zh]

# Rows have equal length within each split (datapre pads them), so they stack
# into 2-D integer tensors of shape (num_sentences, sentence_length).
trid_en, tsid_en = torch.tensor(trid_en, device=device), torch.tensor(tsid_en, device=device)
trid_zh, tsid_zh = torch.tensor(trid_zh, device=device), torch.tensor(tsid_zh, device=device)

In [None]:
# Encoder
# An earlier hand-written seq2seq turned (batch_size, num_steps) inputs into
# time-major form through one-hot encoding; transpose(0, 1) is the neater way.

class Encoder(nn.Module):
  """Single-layer bidirectional GRU encoder over embedded token ids."""

  def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
    """
    Args:
      input_dim: size of the source vocabulary.
      emb_dim: embedding width.
      enc_hid_dim: GRU hidden size per direction.
      dec_hid_dim: width of the state handed to the decoder.
      dropout: dropout probability applied to the embeddings.
    """
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.gru = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
    self.dense = nn.Linear(2 * enc_hid_dim, dec_hid_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, inputs):
    """Encode a batch of id sequences.

    Args:
      inputs: integer tensor of shape (batch_size, num_steps).

    Returns:
      outputs: (num_steps, batch_size, 2 * enc_hid_dim), the GRU output at every step.
      final_state: (batch_size, dec_hid_dim), the initial decoder state.
    """
    embedded = self.embedding(inputs)
    # The GRU is time-major: (num_steps, batch_size, emb_dim).
    embed = self.dropout(embedded).transpose(0, 1)
    # state is (2, batch_size, enc_hid_dim): one final hidden state per direction.
    outputs, state = self.gru(embed)
    forward_last, backward_last = state[0], state[1]
    # Concatenate both directions, project to the decoder width, squash with tanh.
    merged = torch.cat((forward_last, backward_last), dim=1)
    final_state = torch.tanh(self.dense(merged))
    return outputs, final_state

  

In [None]:
# Smoke-test the encoder on the toy id tensor `c` from the scratch cell.
print(c.shape)
c = c.long()
testenc = Encoder(input_dim=12, emb_dim=7, enc_hid_dim=11, dec_hid_dim=17, dropout=0.2)
x, y = testenc(c)
print(x.shape, y.shape)

torch.Size([3, 4])
torch.Size([4, 3, 22]) torch.Size([3, 17])


In [None]:
# Attention
# Inputs are the encoder outputs enc_outputs (num_steps, batch_size, enc_hid_dim * 2)
# and the previous decoder state s (batch_size, dec_hid_dim).
# For s(t-1) we score every encoder step h(j); the resulting weights a(t, j) decide
# how the context for step t is mixed from the encoder outputs.
# The score is additive (concatenate, linear layer, tanh, project to a scalar).

class Attention(nn.Module):
  """Additive attention returning one weight per encoder step."""

  def __init__(self, enc_hid_dim, dec_hid_dim):
    """
    Args:
      enc_hid_dim: encoder GRU hidden size per direction (outputs are 2x this).
      dec_hid_dim: decoder hidden size.
    """
    super().__init__()
    # Two stages: mix [h(j); s(t-1)] into dec_hid_dim features, then v maps them to a score.
    self.attn = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim, bias=False)
    self.v = nn.Linear(dec_hid_dim, 1, bias=False)

  def forward(self, s, enc_outputs):
    """Compute attention weights.

    Args:
      s: previous decoder state, (batch_size, dec_hid_dim).
      enc_outputs: all encoder outputs, (num_steps, batch_size, enc_hid_dim * 2).

    Returns:
      Tensor of shape (batch_size, num_steps); each row is a softmax distribution.
    """
    num_steps = enc_outputs.shape[0]
    # Broadcast s along a new step axis: (batch_size, num_steps, dec_hid_dim).
    # expand() creates a view; torch.cat below materialises the data once.
    srepeat = s.unsqueeze(1).expand(-1, num_steps, -1)
    # Batch-major encoder outputs so they line up with srepeat.
    enco_t = enc_outputs.transpose(0, 1)
    et = torch.tanh(self.attn(torch.cat((enco_t, srepeat), dim=2)))
    # self.v(et) is (batch_size, num_steps, 1); drop the trailing axis.
    attention = self.v(et).squeeze(2)
    return torch.softmax(attention, dim=1)


In [None]:
# Smoke-test the attention module on the toy id tensor `c`.
print(c.shape)
c = c.long()
testenc = Encoder(input_dim=12, emb_dim=7, enc_hid_dim=11, dec_hid_dim=17, dropout=0.2)
enc_outputs, es = testenc(c)
# The encoder's final state stands in for the decoder state; the widths match.
testatt = Attention(enc_hid_dim=11, dec_hid_dim=17)
x = testatt(es, enc_outputs)
print(x.shape)

torch.Size([3, 4])
torch.Size([3, 4])


In [None]:
# Decoder
# Unlike the encoder, the decoder cannot consume all time steps at once: the input
# of each step depends on the hidden state produced by the previous step.
# The previous state attends over the encoder outputs; the resulting context is
# joined with the embedding of the previous token to form the GRU input.
# The step output is a linear layer over [context; GRU output; token embedding].

class Decoder(nn.Module):
  """One-step GRU decoder driven by an attention module."""

  def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
    """
    Args:
      output_dim: size of the target vocabulary (number of output scores).
      emb_dim: embedding width.
      enc_hid_dim: encoder hidden size per direction (context width is 2x this).
      dec_hid_dim: decoder GRU hidden size.
      dropout: dropout probability applied to the token embedding.
      attention: module called as attention(s, enc_outputs), returning
        (batch_size, num_steps) weights.
    """
    super().__init__()
    self.output_dim = output_dim
    self.embedding = nn.Embedding(output_dim, emb_dim)
    # GRU input = attention context (2 * enc_hid_dim) joined with the token embedding.
    self.gru = nn.GRU(2 * enc_hid_dim + emb_dim, dec_hid_dim)
    self.attention = attention
    self.dense = nn.Linear(2 * enc_hid_dim + dec_hid_dim + emb_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, dec_input, s, enc_outputs):
    """Run a single decoding step.

    Args:
      dec_input: previous target token ids, (batch_size,).
      s: previous decoder state, (batch_size, dec_hid_dim).
      enc_outputs: encoder outputs, (num_steps, batch_size, enc_hid_dim * 2).

    Returns:
      output: (batch_size, output_dim) unnormalised scores for the next token.
      state: (batch_size, dec_hid_dim) state to feed into the next step.
    """
    # Weights as (batch_size, 1, num_steps) so bmm can mix the encoder steps.
    attn_weights = self.attention(s, enc_outputs).unsqueeze(1)
    enc_batch_major = enc_outputs.transpose(0, 1)
    # Context as (1, batch_size, enc_hid_dim * 2): a length-1 sequence for the GRU.
    context = torch.bmm(attn_weights, enc_batch_major).transpose(0, 1)

    # Token embedding as (1, batch_size, emb_dim), again time-major for the GRU.
    token_emb = self.embedding(dec_input.unsqueeze(1))
    embed = self.dropout(token_emb).transpose(0, 1)

    step_input = torch.cat((context, embed), dim=2)
    gru_output, gru_hid = self.gru(step_input, s.unsqueeze(0))

    # Drop the length-1 time axis from each piece before the output projection.
    features = torch.cat(
      (context.squeeze(0), gru_output.squeeze(0), embed.squeeze(0)), dim=1)
    output = self.dense(features)

    return output, gru_hid.squeeze(0)



In [None]:
# Smoke-test the decoder on the toy id tensor `c`.
print(c.shape)
c = c.long()
testenc = Encoder(input_dim=12, emb_dim=7, enc_hid_dim=11, dec_hid_dim=17, dropout=0.2)
enc_outputs, es = testenc(c)
# The encoder's final state stands in for the decoder state; the widths match.
testatt = Attention(enc_hid_dim=11, dec_hid_dim=17)
testdec = Decoder(output_dim=19, emb_dim=7, enc_hid_dim=11, dec_hid_dim=17, dropout=0.2, attention=testatt)
# One token id per batch row (batch_size is c.shape[0]).
dec_input = torch.arange(c.shape[0])
dec_output, ds = testdec(dec_input, es, enc_outputs)
print(dec_output.shape, ds.shape)

torch.Size([3, 4])
torch.Size([3, 19]) torch.Size([3, 17])


In [None]:
# Seq2seq model
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

  def __init__(self, encoder, decoder):
    """
    Args:
      encoder: module called as encoder(data_en), returning (enc_outputs, state).
      decoder: module called as decoder(dec_input, state, enc_outputs), returning
        (scores, new_state).
    """
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, data_en, data_zh, teacher_rate=0.5):
    """Decode data_zh.shape[1] - 1 steps.

    Args:
      data_en: source ids, (batch_size, source_len).
      data_zh: target ids, (batch_size, target_len); column 0 is the first decoder
        input and the remaining columns are used only for teacher forcing.
      teacher_rate: probability, drawn independently per step, of feeding the
        ground-truth token instead of the model's own argmax prediction.

    Returns:
      Tensor of shape (target_len - 1, batch_size, vocab) with the decoder scores
      of every step. The start token has no corresponding output row.
    """
    num_steps = data_zh.shape[1]
    enc_outputs, s = self.encoder(data_en)
    dec_input = data_zh[:, 0]

    outputs = []
    for i in range(1, num_steps):
      dec_output, s = self.decoder(dec_input, s, enc_outputs)
      outputs.append(dec_output)
      use_teacher = random.random() < teacher_rate
      dec_input = data_zh[:, i] if use_teacher else dec_output.argmax(1)
    return torch.stack(outputs)


In [None]:
# Smoke-test the full model on the first six training pairs.
x, y = trid_en[:6], trid_zh[:6]
emb_dim, enc_hid_dim, dec_hid_dim = 256, 512, 512
dropout = 0.5
attention = Attention(enc_hid_dim=enc_hid_dim, dec_hid_dim=dec_hid_dim)
encoder = Encoder(en_vocab_size, emb_dim, enc_hid_dim, dec_hid_dim, dropout)
encoder = encoder.to(device)
# `attention` becomes a sub-module of the decoder, so .to(device) moves it as well.
decoder = Decoder(zh_vocab_size, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention)
decoder = decoder.to(device)
model = Seq2Seq(encoder, decoder).to(device)
z = model(x, y)
print(z.shape, y.shape)

torch.Size([87, 6, 16136]) torch.Size([6, 88])


In [None]:
# Vocabulary sizes, Chinese first and English second.
print(f'{zh_vocab_size} {en_vocab_size}')

16136 11356


In [None]:
# Helpers that render predictions in a readable form.
def sentence(lis):
  """Join tokens into one string after dropping trailing '<padding>' / '<EOS>' markers.

  Args:
    lis: list of token strings.

  Returns:
    The concatenation of all tokens up to and including the last one that is
    neither '<padding>' nor '<EOS>'. Markers in the middle are kept. Returns ''
    for an empty list or a list made up only of those markers.
  """
  j = len(lis) - 1
  # The j >= 0 guard stops the scan at the start of the list; without it an empty
  # or all-marker list walks off the front and raises IndexError.
  while j >= 0 and lis[j] in ('<padding>', '<EOS>'):
    j -= 1
  return ''.join(lis[:j+1])
def translate(model, tri, tsi):
  """Print the model's greedy translation of one training and one test sentence.

  Args:
    model: seq2seq model called as model(source_ids, target_ids, teacher_rate).
    tri: row index into the training tensors trid_en / trid_zh.
    tsi: row index into the test tensors tsid_en / tsid_zh.

  With teacher_rate 0 the target tensor only supplies the start token and the
  number of decoding steps, so the reference translation is never fed to the
  decoder.
  """
  tren, tsen = trid_en[tri].unsqueeze(0), tsid_en[tsi].unsqueeze(0)
  trzh, tszh = trid_zh[tri].unsqueeze(0), tsid_zh[tsi].unsqueeze(0)

  # Decode with dropout disabled and without building a graph, then put the model
  # back into whatever mode the caller had it in.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      tro = model(tren, trzh, 0)
      # The second argument must be the Chinese tensor: passing the English one
      # would start decoding from an English-vocabulary id and use the English
      # sentence length as the step count.
      tso = model(tsen, tszh, 0)
  finally:
    model.train(was_training)

  # (steps, 1, vocab) -> (steps, vocab) -> most likely id per step.
  tro1, tso1 = tro.squeeze(1).argmax(1), tso.squeeze(1).argmax(1)
  tro2 = [zh_id2char[i] for i in tro1.tolist()]
  tso2 = [zh_id2char[i] for i in tso1.tolist()]
  print('train sentence:', sentence(tro2))
  print('test sentence:', sentence(tso2))


In [None]:
# Start training.
num_epochs = 20
batch_size = 128
# NOTE(review): '<padding>' targets are part of this loss (no ignore_index), so the
# perplexity printed below also counts padding positions.
loss = nn.CrossEntropyLoss()

# Note: the encoder and decoder use the same emb_dim.
emb_dim = 256
enc_hid_dim = 512
dec_hid_dim = 512
dropout = 0.5
attention = Attention(enc_hid_dim, dec_hid_dim)
encoder = Encoder(en_vocab_size, emb_dim, enc_hid_dim, dec_hid_dim, dropout).to(device)
decoder = Decoder(zh_vocab_size, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention).to(device)
model = Seq2Seq(encoder, decoder).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# One fixed train / test sentence is translated after every epoch to watch progress.
tri, tsi = 20, 20
print('correct train',sentence(tr_zh[tri]))
print('correct test', sentence(ts_zh[tsi]))

# The dataset and loader do not change between epochs, so build them once;
# shuffle=True still draws a fresh order each time the loader is iterated.
dataset = torch.utils.data.TensorDataset(trid_en, trid_zh)
train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
for epoch in range(num_epochs):
  # Make sure dropout is active even if an evaluation helper switched modes.
  model.train()
  start, loss_sum, n = time.time(), 0.0, 0
  for tren, trzh in train_iter:
    # outputs: (target_len - 1, batch, vocab)
    outputs = model(tren, trzh)
    ov = outputs.view(-1, outputs.shape[-1])
    # outputs has no row for the start token, so drop it from the targets too.
    y = trzh[:, 1:]
    # Time-major flattening so the targets line up with the rows of ov.
    yt = y.transpose(0, 1).contiguous().view(-1)
    l = loss(ov, yt.long())
    # The usual three steps: clear gradients, backpropagate, update.
    optimizer.zero_grad()
    l.backward()
    optimizer.step()

    # l is a per-token mean; weight it by batch size for the epoch average.
    loss_sum += l.item() * trzh.shape[0]
    n += trzh.shape[0]
  try:
    perplexity = math.exp(loss_sum / n)
  except OverflowError:
    perplexity = float('inf')
  print("epoch:{} perplexity:{} time:{}".format(epoch, perplexity, time.time() - start))
  translate(model, tri, tsi)



correct train <SOS>高雄市有一间音乐pub发生了枪击案5名男子和被害人有恩怨今天凌晨进入这间pub之后开枪示警嫌犯还拿起了酒罐和椅子殴打了被害人之后离去
correct test <SOS>香港足球总会主席康保局介绍说这两名球员分别是快译通队的后卫冯希志和门将肖国基
epoch:0 perplexity:14.675365485626488 time:62.239134550094604
train sentence: 他的在的的的的的的的的的的的
test sentence: 的的的的的的的的
epoch:1 perplexity:8.177351049292026 time:61.207768201828
train sentence: 新华社的的的的的的的的的的的的的的的的的
test sentence: 在是的的的的的的的的
epoch:2 perplexity:7.509777240235295 time:61.540637254714966
train sentence: 新华社的的在的的的的的的的的的的的的的的的的的的的
test sentence: 在在的的的的的的的的的的的的的
epoch:3 perplexity:7.0865406215482345 time:61.851070165634155
train sentence: 新华社北京１０月的的的的的的的的的的的的的的的的的的的
test sentence: 在的的的的的的的的的的的的
epoch:4 perplexity:6.7612953763845915 time:61.857831716537476
train sentence: 新华社在在和中国的的的的的和的的和和的的的的的的和的的的
test sentence: 在的的的的的的的的的的的
epoch:5 perplexity:6.283706426377945 time:61.818536043167114
train sentence: 新华社北京１１月电记者马晓霖的和和和和和和和和和和和和和和和和和和和和和和和和和和和和和
test sentence: 在的的的的的的的和和中国的的
epoch:6 perplexity:5.87985388254682 time:61.821486949920654
tra

In [None]:
def random_translate(model, n):
  """Print n randomly chosen reference / prediction pairs.

  Each round draws one index within the range of the test set and uses that same
  index for both the training and the test sentence.
  """
  for _ in range(n):
    last_index = len(ts_en) - 1
    x = random.randint(0, last_index)
    reference_train = sentence(tr_zh[x])
    print('correct train:', reference_train)
    reference_test = sentence(ts_zh[x])
    print('correct test:', reference_test)
    translate(model, x, x)
    print('\n')
random_translate(model, 5)

# As the samples show, the fit on the training set is passable, while the
# quality on the test set is considerably worse.

correct train: <SOS>科什图尼察对美国之音的塞尔维亚语部表示反对派同所有的有关的国家机构建立了联系包括警方一个新的民主政府正在形成
correct test: <SOS>著名学者戴逸先生说外国传教士在鸦片战争中跟着侵略军大批的涌进中国来其外国的侵略和传教事业从一开始就密切的联系在一起
train sentence: 科什图尼察表示最近的塞尔维亚语塞尔维亚语塞尔维亚语的塞尔维亚语塞尔维亚语的反对派的一个新政府正在形成的一个新政府正在形成
test sentence: 有表示这名的的的的在在美国和的的名巴以巴以巴以冲突中的的近4000的近


correct train: <SOS>新公司的股票在昨天早盘稍微下跌到18.31英镑之后逐渐回升到发售价18.43英镑
correct test: <SOS>而台北市警方上午就进行了一场防抢演习强势的警力希望枭枭不要轻举妄动
train sentence: 新政府的下跌的的的下跌的18.31英镑新英镑英镑新英镑
test sentence: 从海上的的的的一方并且声称的一个而的的一个小时


correct train: <SOS>死伤人员家属情绪稳定
correct test: <SOS>副总统律师团今天将前往台北地院去递状控告新新闻诽谤
train sentence: 死伤人员伤亡情绪在情绪的情绪
test sentence: 法官的将对俄对俄要求恢复的的将引发了


correct train: <SOS>结果谁也无法预料
correct test: <SOS>来自上海的黄闻涛在男子f12级三级跳远比赛中以14米16的好成绩打破残奥会记录获得金牌
train sentence: 反正没看过
test sentence: 和的的的的的的的的不足不足的的的的的的的的一批的的一批典型


correct train: <SOS>搞卫生工作量也不小那也是太太的事
correct test: <SOS>泰国警方在与逃犯谈判后允许他们带着人质乘坐1辆轻型卡车逃跑警方一直紧随其后最终顺利击毙逃犯结束了21小时的人质危机
train sentence: 而的工作量也不可太太的太太
test sentence: 从后后后在后后的后在一名男子的一名男子的一一个小时的一的一场斗争


