<a href="https://colab.research.google.com/github/chongzicbo/nlp-ml-dl-notes/blob/master/pytorch_tutorials/pytorch_03%EF%BC%9A%E4%BD%BF%E7%94%A8nn_Transformer%E5%92%8CTorchText%E8%BF%9B%E8%A1%8CSeq2Seq%E5%BB%BA%E6%A8%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder,TransformerEncoderLayer

In [0]:
class TransformerModel(nn.Module):
  """Language model: token embedding -> positional encoding -> Transformer encoder stack -> linear layer.

  Args:
    n_token: vocabulary size; sizes both the embedding table and the output projection.
    ninp: embedding / model dimension (passed to the encoder layers as ``d_model``).
    nhead: number of attention heads in each encoder layer.
    nhid: width of the feed-forward sublayer in each encoder layer.
    nlayers: number of stacked encoder layers.
    dropout: dropout probability used by the positional encoding and the encoder layers.
  """
  def __init__(self,n_token,ninp,nhead,nhid,nlayers,dropout=0.5):
    super(TransformerModel,self).__init__()
    self.model_type='Transformer'
    # Causal mask cache; built lazily in forward() for the current length and device.
    self.src_mask=None
    self.pos_encoder=PositionalEncoding(ninp,dropout)
    encoder_layers=TransformerEncoderLayer(d_model=ninp,nhead=nhead,dim_feedforward=nhid,dropout=dropout)
    self.transformer_encoder=TransformerEncoder(encoder_layer=encoder_layers,num_layers=nlayers)
    self.encoder=nn.Embedding(n_token,ninp)
    self.ninp=ninp
    self.decoder=nn.Linear(ninp,n_token)

    self.init_weights()

  def _generate_square_subsequent_mask(self,sz):
    """Return a (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
    # Entries above the diagonal correspond to later positions, which must stay hidden.
    blocked=torch.tril(torch.ones(sz,sz))==0
    return torch.zeros(sz,sz).masked_fill(blocked,float('-inf'))

  def init_weights(self):
    """Uniformly initialise the embedding and output weights in [-0.1, 0.1]; zero the output bias."""
    initrange=0.1
    self.encoder.weight.data.uniform_(-initrange,initrange)
    self.decoder.bias.data.zero_()
    self.decoder.weight.data.uniform_(-initrange,initrange)

  def forward(self,src):
    """Map token ids to per-position vocabulary scores.

    Args:
      src: tensor of token ids whose first dimension is the sequence position.

    Returns:
      The encoder output passed through the final linear layer, i.e. one score
      per vocabulary entry for every input position.
    """
    seq_len=len(src)
    # Rebuild the cached mask when the length changes OR when the input lives on a
    # different device than the cached mask (a same-length batch on another device
    # would otherwise reuse a mask on the wrong device).
    if (self.src_mask is None
        or self.src_mask.size(0)!=seq_len
        or self.src_mask.device!=src.device):
      self.src_mask=self._generate_square_subsequent_mask(seq_len).to(src.device)

    src=self.encoder(src)*math.sqrt(self.ninp) # word embedding, scaled by sqrt(ninp)
    src=self.pos_encoder(src) # word embedding + positional encoding
    output=self.transformer_encoder(src,self.src_mask) # src is [seq_len, batch_size, embedding_dim] here
    output=self.decoder(output) # final fully connected layer
    return output

In [0]:
class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal position signals to a sequence-first input, then apply dropout.

  Args:
    d_model: size of the last (feature) dimension of the inputs; odd values are supported.
    dropout: dropout probability applied after the position signal is added.
    max_len: number of positions precomputed in the ``pe`` buffer.
  """
  def __init__(self,d_model,dropout=0.1,max_len=5000):
    super(PositionalEncoding,self).__init__()
    self.dropout=nn.Dropout(p=dropout)
    pe=torch.zeros(max_len,d_model)
    position=torch.arange(0,max_len,dtype=torch.float).unsqueeze(1)
    div_term=torch.exp(torch.arange(0,d_model,2).float()*(-math.log(10000.0)/d_model))
    angles=position*div_term
    pe[:,0::2]=torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even column, so the
    # cosine block is trimmed to fit; for even d_model the slice is a no-op.
    pe[:,1::2]=torch.cos(angles[:,:d_model//2])
    # Shape (max_len, 1, d_model) so it broadcasts over the batch dimension.
    pe=pe.unsqueeze(1)
    self.register_buffer('pe',pe)

  def forward(self,x):
    """Return ``dropout(x + pe[:x.size(0)])``; the first dimension of ``x`` is the position."""
    x=x+self.pe[:x.size(0),:]
    return self.dropout(x)

In [5]:
# Causal mask for a length-3 sequence: 0 where attention is allowed (the current and
# earlier positions), -inf where it is blocked (later positions).
mask=torch.zeros(3,3).masked_fill(torch.tril(torch.ones(3,3))==0,float('-inf'))
mask

tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]])

In [10]:
# Position indices 0..4999 as a column vector, as built inside PositionalEncoding.
a=torch.arange(5000,dtype=torch.float).reshape(-1,1)
a.shape

torch.Size([5000, 1])

In [11]:
d_model=2048
# One inverse-frequency term per (sin, cos) column pair: 10000^(-2k/d_model), k = 0 .. d_model/2 - 1.
b=torch.exp(-math.log(10000.0)/d_model*torch.arange(0,d_model,2).float())
b.shape

torch.Size([1024])

In [0]:
import torchtext
from torchtext.data.utils import get_tokenizer

In [7]:
# Pin torchtext to 0.5 — presumably because later cells rely on torchtext.data.Field
# and torchtext.datasets.WikiText2.splits.
# NOTE(review): this install cell sits after the `import torchtext` cell, so on a fresh
# environment it likely has to run before that import — confirm and consider moving it up.
!pip install torchtext==0.5

Collecting torchtext==0.5
[?25l  Downloading https://files.pythonhosted.org/packages/79/ef/54b8da26f37787f5c670ae2199329e7dccf195c060b25628d99e587dac51/torchtext-0.5.0-py3-none-any.whl (73kB)
[K     |████▌                           | 10kB 29.2MB/s eta 0:00:01[K     |█████████                       | 20kB 6.3MB/s eta 0:00:01[K     |█████████████▍                  | 30kB 7.6MB/s eta 0:00:01[K     |██████████████████              | 40kB 5.9MB/s eta 0:00:01[K     |██████████████████████▍         | 51kB 6.3MB/s eta 0:00:01[K     |██████████████████████████▉     | 61kB 7.4MB/s eta 0:00:01[K     |███████████████████████████████▍| 71kB 8.4MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 5.1MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |▎                               | 10kB 36.

In [0]:
# Field configured with the basic English tokenizer, lower-casing, and <sos>/<eos> markers.
TEXT=torchtext.data.Field(
    tokenize=get_tokenizer('basic_english'),
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)
# WikiText-2 splits; the vocabulary is built from the training split only.
train_txt,val_txt,test_txt=torchtext.datasets.WikiText2.splits(TEXT)
TEXT.build_vocab(train_txt)
# Use the GPU when PyTorch can see one, otherwise the CPU.
device=torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')

def batchify(data,bsz):
  """Turn a dataset split into a (tokens_per_column, bsz) tensor of token ids on ``device``.

  The token list of the split's first example is numericalized with ``TEXT``,
  trailing tokens that do not fill a whole row of ``bsz`` are dropped, and the
  stream is laid out as ``bsz`` columns of consecutive tokens.
  """
  ids=TEXT.numericalize([data.examples[0].text])
  usable=(ids.size(0)//bsz)*bsz  # largest multiple of bsz that fits
  ids=ids.narrow(0,0,usable)
  columns=ids.view(bsz,-1).t().contiguous()
  return columns.to(device)



In [13]:
# Numericalize the first training example; the displayed result is a single-column tensor of token ids.
TEXT.numericalize([train_txt.examples[0].text])

tensor([[   3],
        [  12],
        [3852],
        ...,
        [   6],
        [   3],
        [   3]])

In [0]:
# Training uses a larger batch size than validation and test.
batch_size,eval_batch_size=20,10
train_data=batchify(train_txt,batch_size)
val_data,test_data=[batchify(split,eval_batch_size) for split in (val_txt,test_txt)]

In [16]:
# Each batchified split is (tokens_per_column, batch_size): 20 columns for training, 10 for validation.
train_data.shape,val_data.shape

(torch.Size([104335, 20]), torch.Size([21817, 10]))

In [0]:
bptt=35  # maximum number of rows (time steps) per training window
def get_batch(source,i):
  """Slice one input window and its next-token targets out of a batchified tensor.

  Args:
    source: tensor whose first dimension is the position in the token stream.
    i: row index at which the window starts.

  Returns:
    ``(data, target)`` where ``data`` is ``source[i:i+seq_len]`` and ``target`` is
    the same window shifted one row later, flattened to 1-D. ``seq_len`` is
    ``bptt``, shortened near the end so the shifted window stays inside ``source``.
  """
  seq_len=min(bptt,len(source)-1-i)
  data=source[i:i+seq_len]
  # reshape (not view) so a non-contiguous source, e.g. a transposed tensor, can
  # still be flattened; for contiguous input the result is the same.
  target=source[i+1:i+1+seq_len].reshape(-1)
  return data,target

In [0]:
# Model hyper-parameters.
ntokens=len(TEXT.vocab.stoi)  # vocabulary size
emsize=nhid=200               # embedding width and feed-forward width
nlayers=nhead=2               # encoder layers and attention heads
dropout=0.2
model=TransformerModel(ntokens,emsize,nhead,nhid,nlayers,dropout).to(device)

In [0]:
# Pull the first training window (starting at row 0) to inspect its shapes.
data,targets=get_batch(train_data,0)

In [30]:
# data is (bptt, batch_size) = (35, 20); targets is flattened to 35 * 20 = 700 entries.
data.shape,targets.shape

(torch.Size([35, 20]), torch.Size([700]))

In [0]:
# Objective, optimiser and learning-rate schedule (lr is multiplied by 0.95 on every scheduler step).
lr=5.0
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.SGD(params=model.parameters(),lr=lr)
scheduler=torch.optim.lr_scheduler.StepLR(optimizer,step_size=1.0,gamma=0.95)

import time
def train():
  """Run one pass over ``train_data``, updating ``model`` in place.

  Reads ``model``, ``train_data``, ``optimizer``, ``criterion``, ``scheduler``,
  ``bptt``, ``TEXT`` and ``epoch`` from the notebook's global scope. ``epoch`` is
  only used in the progress line, so the calling loop must define it.
  Prints a progress line every ``log_interval`` batches and returns nothing.
  """
  model.train()
  total_loss=0
  start_time=time.time()
  ntokens=len(TEXT.vocab.stoi)
  log_interval=200  # batches between progress lines
  for batch,i in enumerate(range(0,train_data.size(0)-1,bptt)):
    data,targets=get_batch(train_data,i)
    optimizer.zero_grad()
    output=model(data)
    loss=criterion(output.view(-1,ntokens),targets)
    loss.backward()
    # Clip the total gradient norm to 0.5 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(),0.5)
    optimizer.step()

    total_loss+=loss.item()
    if batch % log_interval == 0 and batch > 0:
      cur_loss = total_loss / log_interval
      elapsed = time.time() - start_time
      # get_last_lr() reports the rate the scheduler last set; get_lr() is not
      # meant to be called outside scheduler.step() and can report a wrong value.
      print('| epoch {:3d} | {:5d}/{:5d} batches | '
            'lr {:02.2f} | ms/batch {:5.2f} | '
            'loss {:5.2f} | ppl {:8.2f}'.format(
              epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0],
              elapsed * 1000 / log_interval,
              cur_loss, math.exp(cur_loss)))
      total_loss = 0
      start_time = time.time()


In [0]:
def evaluate(eval_model, data_source):
    """Return the average loss of ``eval_model`` over ``data_source``.

    The model is switched to evaluation mode and run without gradient tracking.
    Each window's mean loss is weighted by its number of rows, and the sum is
    divided by ``len(data_source) - 1``.
    """
    eval_model.eval()
    vocab_size = len(TEXT.vocab.stoi)
    weighted_loss = 0.
    last_row = data_source.size(0) - 1
    with torch.no_grad():
        offset = 0
        while offset < last_row:
            inputs, expected = get_batch(data_source, offset)
            scores = eval_model(inputs).view(-1, vocab_size)
            window_loss = criterion(scores, expected).item()
            weighted_loss += len(inputs) * window_loss
            offset += bptt
    return weighted_loss / (len(data_source) - 1)

In [0]:
# Train for a fixed number of epochs, keeping a snapshot of the model with the
# lowest validation loss seen so far.
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Snapshot the weights: a plain reference would keep tracking `model`
        # as later epochs continue to update it in place.
        best_model = copy.deepcopy(model)

    # Decay the learning rate once per epoch.
    scheduler.step()

| epoch   1 |   200/ 2981 batches | lr 5.00 | ms/batch 18.84 | loss  5.15 | ppl   172.14
| epoch   1 |   400/ 2981 batches | lr 5.00 | ms/batch 18.56 | loss  5.15 | ppl   172.61
| epoch   1 |   600/ 2981 batches | lr 5.00 | ms/batch 18.92 | loss  4.95 | ppl   141.86
| epoch   1 |   800/ 2981 batches | lr 5.00 | ms/batch 18.63 | loss  5.13 | ppl   168.30
| epoch   1 |  1000/ 2981 batches | lr 5.00 | ms/batch 19.11 | loss  5.18 | ppl   177.67
| epoch   1 |  1200/ 2981 batches | lr 5.00 | ms/batch 19.24 | loss  5.34 | ppl   209.16
| epoch   1 |  1400/ 2981 batches | lr 5.00 | ms/batch 19.32 | loss  5.38 | ppl   218.02
| epoch   1 |  1600/ 2981 batches | lr 5.00 | ms/batch 19.30 | loss  5.43 | ppl   227.32
| epoch   1 |  1800/ 2981 batches | lr 5.00 | ms/batch 19.16 | loss  5.36 | ppl   213.04
| epoch   1 |  2000/ 2981 batches | lr 5.00 | ms/batch 18.80 | loss  5.39 | ppl   219.57
| epoch   1 |  2200/ 2981 batches | lr 5.00 | ms/batch 19.02 | loss  5.27 | ppl   194.07
| epoch   1 |  2400/ 

In [28]:
# NOTE(review): this displays the bound method itself (it is not called), so the output is
# the method's repr wrapping the module's repr. Use `model` to show the architecture or
# `list(model.parameters())` to list the parameter tensors — confirm which was intended.
model.parameters

<bound method Module.parameters of TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=200, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=200, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Line