### 1. 引入包和依赖

In [8]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import jieba
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
# PATH = '/content/drive/My Drive/colab_envir/NLP_standard/L5/model/'
# Base directory for the corpus and saved model files
# (presumably a mounted Google Drive folder in Colab — confirm for your setup).
PATH = '/content/drive/My Drive/colab_envir/NLP_standard/L5/'
# Full path of the raw corpus file `cmn.txt` inside PATH.
data_path = PATH + 'cmn.txt'
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
!pip install zhconv

### 2. 中文处理

In [None]:
import pandas as pd
import zhconv

# Read the raw corpus as a headerless table and name its three columns.
PATH = '/content/drive/My Drive/colab_envir/NLP_standard/L5/'
data_path = PATH + 'cmn.txt'
df = pd.read_table(data_path, header=None)
df.columns = ['inputs', 'targets', 'comments']


# Convert the two text columns to a list of (input, target) pairs.
input_texts = df.inputs.values.tolist()
target_texts = df.targets.values.tolist()
pairs = list(zip(input_texts, target_texts))

# Write one "<input>\t<target converted with zhconv to 'zh-hans'>" line per pair.
# The file is opened with mode 'w' (not 'a'): appending would add a complete
# extra copy of the corpus every time this cell is re-run.
with open(PATH+'english-simplified.txt', 'w', encoding='utf-8') as f:
  for source_text, target_text in pairs:
    f.write(source_text+'\t'+zhconv.convert(target_text, 'zh-hans')+'\n')

In [9]:
# Vocabulary index reserved for the start-of-sentence marker ('SOS' in Lang.index2word).
SOS_token = 0
# Vocabulary index reserved for the end-of-sentence marker ('EOS' in Lang.index2word).
EOS_token = 1

class Lang:
  """Vocabulary of one language: word<->index maps plus per-word counts."""

  def __init__(self, name):
    """Create an empty vocabulary called `name`; indices 0 and 1 are pre-assigned."""
    self.name = name
    self.word2index = {}
    self.word2count = {}
    # Indices 0 and 1 are taken by the sentence markers, so real words start at 2.
    self.index2word = {0:'SOS', 1:'EOS'}
    self.n_words = 2

  def addSentence(self, sentence):
    """Register every token obtained by splitting `sentence` on single spaces."""
    for token in sentence.split(' '):
      self.addWord(token)

  def addWord(self, word):
    """Give `word` the next free index, or increment its count if already known."""
    if word in self.word2index:
      self.word2count[word] += 1
      return
    new_index = self.n_words
    self.word2index[word] = new_index
    self.word2count[word] = 1
    self.index2word[new_index] = word
    self.n_words = new_index + 1

def unicodeToAscii(s):
  """Return `s` NFD-normalised with every character of category 'Mn' removed."""
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)


def normalizeString_eng(s):
  """Lower-case, strip and accent-fold `s`, then reduce it to letters and . ! ?

  Each of . ! ? gets a space inserted in front of it, and every run of other
  non-letter characters is replaced by a single space.
  """
  text = unicodeToAscii(s.lower().strip())
  # Detach sentence punctuation so it becomes a separate space-delimited token.
  text = re.sub(r'([.!?])', r' \1', text)
  # Anything that is not an ASCII letter or . ! ? collapses to one space.
  return re.sub(r'[^a-zA-Z.!?]+', r' ', text)
def process_chn_sentence(s):
  """Segment the first line of `s` with jieba and join the tokens with spaces.

  Only the text before the first newline is used, and all spaces are removed
  from it before segmentation.
  """
  first_line = s.split('\n')[0]
  compact = first_line.replace(' ', '')
  tokens = jieba.cut(compact)
  return ' '.join(tokens)
def readLangs(lang1, lang2, reverse=False):
  """Load the parallel corpus `PATH + '<lang1>-<lang2>.txt'` into sentence pairs.

  Each line must hold at least two tab-separated fields. The first field is
  normalised with normalizeString_eng and the second is segmented with
  process_chn_sentence, regardless of the language names passed in.

  Args:
    lang1: name of the language in the first column (also used in the file name).
    lang2: name of the language in the second column (also used in the file name).
    reverse: when true, swap the two sentences of every pair and swap which
      name is used for the input and the output vocabulary.

  Returns:
    (input_lang, output_lang, pairs): two empty Lang vocabularies and the list
    of two-element sentence pairs.

  Raises:
    IndexError: if a line contains no tab character.
  """
  print('Reading lines...')

  # Use a context manager so the file handle is closed once the text is read.
  with open(PATH+'%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

  pairs = []
  for line in lines:
    # Split each line once instead of once per column.
    fields = line.split('\t')
    pairs.append([normalizeString_eng(fields[0]), process_chn_sentence(fields[1])])

  if reverse:
    pairs = [list(reversed(p)) for p in pairs]
    input_lang = Lang(lang2)
    output_lang = Lang(lang1)
  else:
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

  return input_lang, output_lang, pairs

# Exclusive upper bound on the number of space-separated tokens a sentence may
# have to pass filterPair; also the default `max_length` of the attention
# decoder, train() and evaluate().
MAX_LENGTH = 20

# English sentence prefixes. No active code in this file uses this tuple.
eng_prefixes = (
    'i am', 'i m',
    'he is', 'he s',
    'she is', 'she s',
    'you are', 'you re',
    'we are', 'we re',
    'they are', 'they re'
)

def filterPair(p):
  """Return True when both sentences of `p` have fewer than MAX_LENGTH tokens.

  Tokens are counted by splitting on single spaces.
  """
  if len(p[0].split(' ')) >= MAX_LENGTH:
    return False
  # Disabled extra condition kept for reference: and p[1].startswith(eng_prefixes)
  return len(p[1].split(' ')) < MAX_LENGTH

def PairFilter(pairs):
  """Return the pairs accepted by filterPair, in their original order."""
  return list(filter(filterPair, pairs))


def prepareData(lang1, lang2, reverse=False):
  """Read the corpus, drop over-long pairs and build both vocabularies.

  Progress and the final vocabulary sizes are printed along the way.

  Returns:
    (input_lang, output_lang, pairs) where `pairs` holds only the pairs kept
    by PairFilter and both Lang objects contain the words of those pairs.
  """
  input_lang, output_lang, all_pairs = readLangs(lang1, lang2, reverse)
  print('Read %s sentence pairs' % len(all_pairs))

  pairs = PairFilter(all_pairs)
  print('Trimmed to %s sentence pairs' % len(pairs))

  print('Counting words...')
  for kept_pair in pairs:
    source_sentence, target_sentence = kept_pair[0], kept_pair[1]
    input_lang.addSentence(source_sentence)
    output_lang.addSentence(target_sentence)

  print('Counted words:')
  for lang in (input_lang, output_lang):
    print(lang.name, lang.n_words)
  return input_lang, output_lang, pairs



# Build the vocabularies and filtered sentence pairs from
# PATH + 'english-simplified.txt' (first column as input language, second as
# output language), then print one randomly chosen pair as a sanity check.
input_lang, output_lang, pairs = prepareData('english', 'simplified', False)
print(random.choice(pairs))
# process_chn_sentence('就像马克·诺弗勒早期演唱的歌曲《金钱无用》一样，绝大多数的人依然高呼赞成“金钱无用论”。')


Building prefix dict from the default dictionary ...


Reading lines...


Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.974 seconds.
Prefix dict has been built successfully.


Read 66225 sentence pairs
Trimmed to 66156 sentence pairs
Counting words...
Counted words:
english 6226
simplified 10832
['his lectures are very long .', '他 的 讲座 很长 。']


### 3. 定义EncoderRNN DecoderRNN AttentionDecoderRNN

In [10]:
class EncoderRNN(nn.Module):
  """Encoder that embeds one token index and feeds it through a GRU step."""

  def __init__(self, input_size, hidden_size):
    """Create an embedding of `input_size` entries and a GRU, both `hidden_size` wide."""
    super().__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(input_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size)

  def forward(self, input, hidden):
    """Return the GRU's (output, hidden) for the token index in `input`."""
    # The embedding is reshaped to (1, 1, -1) before it is handed to the GRU.
    step_input = self.embedding(input).view(1, 1, -1)
    return self.gru(step_input, hidden)

  def initHidden(self):
    """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
    return torch.zeros(1, 1, self.hidden_size, device=device)

In [11]:
class DecoderRNN(nn.Module):
  """Decoder without attention: embedding -> ReLU -> GRU -> linear -> log-softmax."""

  def __init__(self, hidden_size, output_size):
    """Create the layers for a vocabulary of `output_size` and width `hidden_size`."""
    super().__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(output_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input, hidden):
    """Run one step; return (log-probabilities over the vocabulary, new hidden state)."""
    # The embedding is reshaped to (1, 1, -1) and passed through ReLU before the GRU.
    step_input = F.relu(self.embedding(input).view(1, 1, -1))
    gru_out, gru_hidden = self.gru(step_input, hidden)
    log_probs = self.softmax(self.out(gru_out[0]))
    return log_probs, gru_hidden

  def initHidden(self):
    """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
    return torch.zeros(1, 1, self.hidden_size, device=device)

In [12]:
class AttentionDecoderRNN(nn.Module):
  """GRU decoder step with attention over a fixed-length buffer of encoder outputs.

  Args:
    hidden_size: width of the embedding, the GRU state and the encoder outputs.
    output_size: size of the output vocabulary.
    dropout_p: dropout probability applied to the embedded input token.
    max_length: number of encoder positions attended over. It is the output
      width of the `attn` layer, so the `encoder_outputs` given to `forward`
      must have exactly this many rows.
  """
  def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
    super(AttentionDecoderRNN, self).__init__()
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.dropout_p = dropout_p
    # Record the value actually used to size `attn` (the constructor argument),
    # not the module-level constant, so the attribute stays truthful for
    # non-default lengths.
    self.max_length = max_length

    self.embedding = nn.Embedding(output_size, hidden_size)
    self.attn = nn.Linear(2 * hidden_size, max_length)
    self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size)
    self.dropout = nn.Dropout(dropout_p)
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, input, hidden, encoder_outputs):
    """Run one decoding step.

    Args:
      input: tensor holding the index of the previous output token (it is
        embedded and reshaped to (1, 1, hidden_size)).
      hidden: previous decoder hidden state, shape (1, 1, hidden_size).
      encoder_outputs: encoder outputs for the input sequence stored in a
        buffer of shape (max_length, hidden_size), one row per position.

    Returns:
      (output, gru_hidden, attn_weights): log-probabilities over the output
      vocabulary with shape (1, output_size), the new hidden state with shape
      (1, 1, hidden_size), and the attention weights with shape (1, max_length).
    """
    embedding = self.embedding(input).view(1, 1, -1)
    embedded = self.dropout(embedding)
    # embedded: (1, 1, hidden_size)

    # Attention scores are computed from the embedded token and the previous
    # hidden state only: concatenated (1, 2 * hidden_size) -> (1, max_length).
    embedded_prevHidden_concat = torch.cat((embedded[0], hidden[0]), dim=1)
    attn = self.attn(embedded_prevHidden_concat)
    attn_weights = F.softmax(attn, dim=1)
    # attn_weights: (1, max_length)

    # torch.bmm multiplies (b, n, m) by (b, m, p) into (b, n, p); here
    # (1, 1, max_length) x (1, max_length, hidden_size) -> (1, 1, hidden_size),
    # i.e. the attention-weighted sum of the encoder outputs.
    attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

    # Merge the embedded token with the attended context: (1, 2 * hidden_size).
    embedded_attnApplied_concat = torch.cat((embedded[0], attn_applied[0]), dim=1)

    embedded_attn = F.relu(self.attn_combine(embedded_attnApplied_concat))
    # embedded_attn: (1, hidden_size)

    gru_out, gru_hidden = self.gru(embedded_attn.unsqueeze(0), hidden)
    # gru_out and gru_hidden: (1, 1, hidden_size)

    output = F.log_softmax(self.out(gru_out[0]), dim=1)
    # output: (1, output_size)

    return output, gru_hidden, attn_weights

  def initHidden(self):
    """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
    return torch.zeros(1, 1, self.hidden_size, device=device)

### 4. 训练函数定义

In [15]:
def indexesFromSentence(lang, sentence):
  """Return the `lang.word2index` entry of every space-separated token in `sentence`.

  Raises:
    KeyError: if a token is not present in `lang.word2index`.
  """
  tokens = sentence.split(' ')
  vocabulary = lang.word2index
  return [vocabulary[token] for token in tokens]

def tensorFromSentence(lang, sentence):
  """Return `sentence` as a long tensor of shape (n_tokens + 1, 1) on `device`.

  The token indices come from indexesFromSentence and EOS_token is appended.
  """
  token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
  column = torch.tensor(token_ids, dtype=torch.long, device=device)
  return column.view(-1, 1)

def tensorsFromPair(pair):
  """Return (input_tensor, target_tensor) for a sentence pair.

  pair[0] is encoded with the global input_lang and pair[1] with output_lang.
  """
  source_sentence, target_sentence = pair[0], pair[1]
  return (
      tensorFromSentence(input_lang, source_sentence),
      tensorFromSentence(output_lang, target_sentence),
  )

# Probability that a training example is decoded with teacher forcing.
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
  """Run one optimisation step on a single (input, target) tensor pair.

  The input is encoded token by token, then the decoder is unrolled over the
  target. With probability `teacher_forcing_ratio` the ground-truth token is
  fed back at each step; otherwise the decoder's own top prediction is fed
  back and decoding stops early once it predicts EOS_token.

  Returns:
    The summed loss divided by the target length, as a Python float.
  """
  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  source_length = input_tensor.size(0)
  target_length = target_tensor.size(0)

  # Encode: row `position` of the buffer receives the encoder output for that
  # input token; rows beyond the input length stay zero.
  encoder_hidden = encoder.initHidden()
  encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
  for position in range(source_length):
    step_output, encoder_hidden = encoder(input_tensor[position], encoder_hidden)
    encoder_outputs[position] = step_output[0, 0]

  # Decode, starting from SOS and the encoder's final hidden state.
  decoder_input = torch.tensor([[SOS_token]], device=device)
  decoder_hidden = encoder_hidden

  use_teacher_forcing = random.random() < teacher_forcing_ratio

  loss = 0
  for step in range(target_length):
    decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
    loss += criterion(decoder_output, target_tensor[step])

    if use_teacher_forcing:
      # Feed the ground-truth token as the next input.
      decoder_input = target_tensor[step]
      continue

    # Feed the model's own best guess, detached from the graph.
    decoder_input = decoder_output.topk(1)[1].squeeze().detach()
    if decoder_input.item() == EOS_token:
      break

  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item() / target_length



### 5. Helper Function

In [36]:
import time
import math


import matplotlib.pyplot as plt
# plt.switch_backend('agg')
# plt.rcParams['font.sans-serif']=['SimHei']
# plt.rcParams['axes.unicode_minus'] = False

# Font configuration for matplotlib so that plot labels can contain Chinese text.
plt.rcParams['font.sans-serif']=['SimHei'] # display Chinese labels
plt.rcParams['font.serif'] = ['KaiTi']
plt.rcParams['axes.unicode_minus'] = False
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot the sequence `points` with y-axis major ticks at multiples of 0.2."""
    # plt.subplots() creates its own figure; calling plt.figure() first would
    # leave an additional empty figure behind on every call.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

def asMinute(s):
  """Format a duration of `s` seconds as '<minutes>m <seconds>s'."""
  minutes = math.floor(s / 60)
  seconds = s - minutes * 60
  return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
  """Return '<elapsed> (- <remaining>)' for a task started at time `since`.

  `percent` is the completed fraction of the task; the remaining time is
  extrapolated as elapsed / percent - elapsed.
  """
  elapsed = time.time() - since
  estimated_total = elapsed / (percent)
  remaining = estimated_total - elapsed
  return '%s (- %s)' % (asMinute(elapsed), asMinute(remaining))

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
  """Train on `n_iters` randomly drawn sentence pairs using SGD and NLLLoss.

  Every `print_every` iterations a progress line with the average loss is
  printed; every `plot_every` iterations the average loss is recorded, and the
  recorded values are plotted with showPlot at the end.
  """
  started_at = time.time()
  plot_losses = []
  print_loss_total = 0
  plot_loss_total = 0

  encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
  decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
  # All training examples are sampled (with replacement) before training starts.
  training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]

  criterion = nn.NLLLoss()
  for step, (input_tensor, target_tensor) in enumerate(training_pairs, 1):
    loss = train(input_tensor, target_tensor, encoder,
                  decoder, encoder_optimizer, decoder_optimizer, criterion)
    print_loss_total += loss
    plot_loss_total += loss

    if step % print_every == 0:
      print_loss_avg = print_loss_total / print_every
      print_loss_total = 0
      progress = step / n_iters
      print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress), step, progress * 100, print_loss_avg))

    if step % plot_every == 0:
      plot_losses.append(plot_loss_total / plot_every)
      plot_loss_total = 0

  showPlot(plot_losses)

def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
  """Greedily translate `sentence` with gradients disabled.

  Returns:
    (decoded_words, attentions): the predicted words (ending with '<EOS>' when
    the decoder predicted EOS_token within `max_length` steps) and the
    attention weights of the executed decoding steps, one row per step.
  """
  with torch.no_grad():
    input_tensor = tensorFromSentence(input_lang, sentence)
    source_length = input_tensor.size()[0]

    # Encode the sentence token by token into a (max_length, hidden_size) buffer.
    encoder_hidden = encoder.initHidden()
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for position in range(source_length):
      step_output, encoder_hidden = encoder(input_tensor[position], encoder_hidden)
      encoder_outputs[position] += step_output[0, 0]

    # Decode, starting from SOS and the encoder's final hidden state.
    decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
    decoder_hidden = encoder_hidden

    decoded_words = []
    decoder_attentions = torch.zeros(max_length, max_length)

    for di in range(max_length):
      decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
      decoder_attentions[di] = decoder_attention.data

      best_index = decoder_output.data.topk(1)[1]
      if best_index.item() == EOS_token:
        decoded_words.append('<EOS>')
        break
      decoded_words.append(output_lang.index2word[best_index.item()])

      # The predicted token becomes the next decoder input.
      decoder_input = best_index.squeeze().detach()

    # Keep only the attention rows of the steps that actually ran.
    return decoded_words, decoder_attentions[:di + 1]

def evaluateRandomly(encoder, decoder, n=5):
  """Print source ('>'), reference ('=') and model output ('<') for `n` random pairs."""
  for _ in range(n):
    sample = random.choice(pairs)
    source_sentence, reference = sample[0], sample[1]
    print('>', source_sentence)
    print('=', reference)
    output_words, _ = evaluate(encoder, decoder, source_sentence)
    print('<', ' '.join(output_words))
    print('')

### 6. 正式训练

In [None]:
# Initialise fresh encoder/decoder models and train them from scratch.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttentionDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

In [None]:
# Continue training the already-trained models with a smaller learning rate.
# train() puts both modules back into training mode (this re-enables dropout).
encoder1.train()
attn_decoder1.train()
trainIters(encoder1, attn_decoder1, 20000, print_every=5000, learning_rate=0.0005)

In [22]:
# Save the model weights (state dicts) under PATH + 'model/'.
torch.save(encoder1.state_dict(), PATH+'model/encoder_140000.pth')
torch.save(attn_decoder1.state_dict(), PATH+'model/attn_decoder_140000.pth')

In [None]:
# Instantiate the models with the same sizes that were used for training.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttentionDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# Load the saved weights. map_location=device remaps the stored tensors onto
# the current device, so a checkpoint saved on a GPU also loads in a CPU-only
# session.
encoder1.load_state_dict(torch.load(PATH+'model/encoder_140000.pth', map_location=device))
attn_decoder1.load_state_dict(torch.load(PATH+'model/attn_decoder_140000.pth', map_location=device))

# Switch to evaluation mode (disables dropout) and test on random pairs.
encoder1.eval()
attn_decoder1.eval()
evaluateRandomly(encoder1, attn_decoder1, n=10)


### 7. 用单个句子生成翻译输出并打印注意力

In [None]:
def showAttention(input_sentence, output_words, attentions):
    """Draw `attentions` as a matrix image labelled with input and output words.

    The x axis is labelled with the space-separated words of `input_sentence`
    followed by '<EOS>', the y axis with `output_words`.
    """
    # Figure with a single axes, the attention image and its colorbar.
    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(attentions.numpy(), cmap='bone')
    figure.colorbar(heatmap)

    # Tick labels; both lists start with an empty label.
    column_labels = [''] + input_sentence.split(' ') + ['<EOS>']
    row_labels = [''] + output_words
    axes.set_xticklabels(column_labels, rotation=90)
    axes.set_yticklabels(row_labels)

    # One major tick per unit on both axes so every label is shown.
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Normalise `input_sentence`, translate it, print both and plot the attention.

    Uses the global models encoder1 and attn_decoder1.
    """
    normalized = normalizeString_eng(input_sentence)
    output_words, attentions = evaluate(encoder1, attn_decoder1, normalized)
    print('input =', normalized)
    print('output =', ' '.join(output_words))
    showAttention(normalized, output_words, attentions)



# Translate one sample sentence and plot its attention weights.
evaluateAndShowAttention("i don t plan to stay very long.")