<a href="https://colab.research.google.com/github/chongzicbo/nlp-ml-dl-notes/blob/master/pytorch_tutorials/pytorch_01%EF%BC%9A%E4%BD%BF%E7%94%A8torchtext%E8%BF%9B%E8%A1%8C%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

导入所需的库

In [0]:
# !pip install torchtext==0.5

In [0]:
import torch
import torchtext
from torchtext.datasets import text_classification
import os

In [0]:
torchtext.__version__,torch.__version__

('0.5.0', '1.4.0')

基本参数配置

In [0]:
NGRAMS=2
if not os.path.isdir('./data'):
  os.mkdir('./data')

train_dataset,test_dataset=text_classification.DATASETS['AG_NEWS'](root='./data',ngrams=NGRAMS,vocab=None) #ngrams参数设置将文本进行分割

BATCH_SIZE=16
device=torch.device('cuda' if torch.cuda.is_available() else "cpu")

120000lines [00:08, 13373.81lines/s]
120000lines [00:18, 6661.00lines/s]
7600lines [00:01, 6724.91lines/s]


定义模型网络:Embedding层+全连接层


In [0]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
  def __init__(self,vocab_size,embed_dim,num_class):
    super().__init__()
    self.embedding=nn.EmbeddingBag(vocab_size,embed_dim,sparse=True)
    self.fc=nn.Linear(embed_dim,num_class)
    self.init_weights()

  def init_weights(self):
    #初始化权重参数
    initrange=0.5
    self.embedding.weight.data.uniform_(-initrange,initrange)
    self.fc.weight.data.uniform_(-initrange,initrange)
    self.fc.bias.data.zero_()

  def forward(self,text,offsets):
    #前向传播
    embedded=self.embedding(text,offsets)
    return self.fc(embedded)

In [0]:
VOCAB_SIZE=len(train_dataset.get_vocab())
EMBED_DIM=32 #词向量维度
NUM_CLASS=len(train_dataset.get_labels())
model=TextSentiment(VOCAB_SIZE,EMBED_DIM,NUM_CLASS).to(device)

构建一个生成批量数据的函数

In [0]:
def generate_batch(batch):
  label=torch.tensor([entry[0] for entry in batch])
  text=[entry[1] for entry in batch]
  offsets=[0]+[len(entry) for entry in text]
  offsets=torch.tensor(offsets[:-1]).cumsum(dim=0) #偏移量计算
  text=torch.cat(text)
  return text,offsets,label

In [0]:
torch.Tensor([0.0,1.0,2.0,3.0,4.0,5.0][:-1]).cumsum(dim=0)

tensor([ 0.,  1.,  3.,  6., 10.])

定义模型训练和评估函数

In [0]:
from torch.utils.data import DataLoader

In [0]:
def train_func(sub_train_,optimizer,criterion):
  train_loss=0 #训练损失
  train_acc=0 #训练精度

  data=DataLoader(sub_train_,batch_size=BATCH_SIZE,shuffle=True,collate_fn=generate_batch)
  for i,(text,offsets,cls) in enumerate(data):
    optimizer.zero_grad()
    text,offsets,cls=text.to(device),offsets.to(device),cls.to(device)
    output=model(text,offsets)
    loss=criterion(output,cls) #计算损失
    train_loss+=loss.item() #训练损失叠加
    loss.backward() #反向传播
    optimizer.step() #梯度更新
    train_acc+=(output.argmax(1)==cls).sum().item()
  scheduler.step() #更新学习率

  return train_loss/len(sub_train_),train_acc/len(sub_train_)


def test(data_,criterion):
  loss=0
  acc=0
  data=DataLoader(data_,batch_size=BATCH_SIZE,collate_fn=generate_batch)
  for text,offsets,cls in data:
    text,offsets,cls=text.to(device),offsets.to(device),cls.to(device)
    with torch.no_grad():
      output=model(text,offsets)
      loss=criterion(output,cls)
      loss+=loss.item()
      acc+=(output.argmax(1)==cls).sum().item()

  return loss/len(data_),acc/len(data_)    



In [0]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS=5
min_valid_loss=float('inf')
criterion=torch.nn.CrossEntropyLoss().to(device)
optimizer=torch.optim.SGD(model.parameters(),lr=4.0)
scheduler=torch.optim.lr_scheduler.StepLR(optimizer,1,gamma=0.9)

train_len=int(len(train_dataset)*0.95)
sub_train_,sub_valid_=random_split(train_dataset,[train_len,len(train_dataset)-train_len])
for epoch in range(N_EPOCHS):
  start_time=time.time()
  train_loss,train_acc=train_func(sub_train_,optimizer=optimizer,criterion=criterion)
  valid_loss,valid_acc=test(sub_valid_,criterion)
  secs=int(time.time()-start_time)
  mins=secs/60
  secs=secs%60
  print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
  print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
  print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 27 seconds
	Loss: 0.0023(train)	|	Acc: 98.9%(train)
	Loss: 0.0000(valid)	|	Acc: 98.2%(valid)
Epoch: 2  | time in 0 minutes, 27 seconds
	Loss: 0.0017(train)	|	Acc: 99.2%(train)
	Loss: 0.0000(valid)	|	Acc: 98.7%(valid)
Epoch: 3  | time in 0 minutes, 27 seconds
	Loss: 0.0011(train)	|	Acc: 99.6%(train)
	Loss: 0.0000(valid)	|	Acc: 98.5%(valid)
Epoch: 4  | time in 0 minutes, 27 seconds
	Loss: 0.0008(train)	|	Acc: 99.7%(train)
	Loss: 0.0000(valid)	|	Acc: 98.7%(valid)
Epoch: 5  | time in 0 minutes, 27 seconds
	Loss: 0.0005(train)	|	Acc: 99.8%(train)
	Loss: 0.0000(valid)	|	Acc: 98.9%(valid)


使用测试数据集评估模型

In [0]:
print('使用测试数据集进行测试-----')
test_loss,test_acc=test(test_dataset,criterion)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

使用测试数据集进行测试-----
	Loss: 0.0004(test)	|	Acc: 89.1%(test)


In [0]:
data_loader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True,collate_fn=generate_batch)

In [0]:
for x,y,z in data_loader:
  print(x)
  print(y)
  print(z)
  break

tensor([4088, 7956, 1290,  ...,  152,  243, 6634])
tensor([   0,   55,  132,  217,  268,  337,  442,  507,  610,  727,  810,  919,
        1020, 1079, 1176, 1259])
tensor([2, 2, 1, 3, 3, 2, 1, 3, 1, 3, 3, 2, 1, 1, 2, 3])


In [0]:
import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

ag_news_label = {1 : "World",
                 2 : "Sports",
                 3 : "Business",
                 4 : "Sci/Tec"}

def predict(text,model,vocab,ngram):
  tokenizer=get_tokenizer('basic_english')
  with torch.no_grad():
    text=torch.tensor([vocab[token] for token in ngrams_iterator(tokenizer(text),ngram)])
    output=model(text,torch.tensor([0]))
    return output.argmax(1).item()+1

ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

vocab = train_dataset.get_vocab()
model = model.to("cpu")

print("This is a %s news" %ag_news_label[predict(ex_text_str, model, vocab, 2)])    



This is a Sports news
