In [85]:
import torch.nn as nn
import torch.optim as optim
# TODO ★★★★torchtext已停止更新维护,请替换
from torchtext.vocab import build_vocab_from_iterator
# TODO ★★★★torchtext已停止更新维护,请替换
from torchtext.datasets import IMDB
# TODO ★★★★英文分词请使用nltk
import spacy
import torch
# TODO ★★★★torchtext已停止更新维护,请替换
import torchtext
import torch.utils.data as Data
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score

In [86]:
# 使用torchtext加载斯坦福⼤学的⼤型电影评论数据集
train_iter = IMDB(split='train')

In [87]:
spacy_en = spacy.load('en_core_web_sm')


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield [tok.text for tok in spacy_en.tokenizer(text)]  # 分词


vocab = build_vocab_from_iterator(yield_tokens(train_iter))  # Build a Vocab from an iterator.

In [88]:
vocab.insert_token("<unk>", 0)
vocab.insert_token("<pad>", 1)
vocab.insert_token("<SOS>", 2)
vocab.insert_token("<EOS>", 3)
vocab.set_default_index(0)

In [89]:
# 预训练词向量
vec1 = torchtext.vocab.Vectors(name="glove.6B.200d.txt",
                               max_vectors=25000,
                               cache=r'C:\Users\dcdmm\Music\GitHubProjects\MLNote\E_PyTorch\高阶操作及深度学习相关理论\torchtext自然语言处理\.vector_cache')

print(vec1.vectors.shape)
print(vec1.vectors)

torch.Size([25000, 200])
tensor([[-0.0715,  0.0935,  0.0237,  ...,  0.3362,  0.0306,  0.2558],
        [ 0.1765,  0.2921, -0.0021,  ..., -0.2077, -0.2319, -0.1081],
        [ 0.1229,  0.5804, -0.0696,  ..., -0.0392, -0.1624, -0.0967],
        ...,
        [-0.0020,  0.0202, -0.0244,  ...,  0.0142, -0.8224, -0.3703],
        [ 0.1291, -0.2605,  0.0139,  ...,  0.1384, -0.0146,  0.4337],
        [-0.7300,  0.5164, -0.5798,  ...,  0.3581,  1.1576,  0.2573]])


In [90]:
pretrained_embeddings = vec1.get_vecs_by_tokens(vocab.get_itos())

print(pretrained_embeddings.shape)
print(pretrained_embeddings)  # 模型词向量矩阵

torch.Size([121068, 200])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [91]:
def to_map_style_dataset(iter_data):
    r"""Convert iterable-style dataset to map-style dataset.
    """

    class _MapStyleDataset(Data.Dataset):

        def __init__(self, iter_data):
            self._data = list(iter_data)

        def __len__(self):
            return len(self._data)

        def __getitem__(self, idx):
            return self._data[idx]

    return _MapStyleDataset(iter_data)

In [92]:
train_iter, test_iter = IMDB(split=('train', 'test'))

train_data = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

In [93]:
num_train = int(len(train_data) * 0.7)
train_dataset, valid_dataset = random_split(train_data,
                                            [num_train, len(train_data) - num_train])  # 划分数据集

In [94]:
text_transform = lambda x: [vocab['<SOS>']] + [vocab[token] for token in
                                               [tok.text for tok in spacy_en.tokenizer(x)]] + [vocab['<EOS>']]
label_transform = lambda x: 1 if x == 'pos' else 0

In [95]:
def collate_batch(batch):
    """
    对文本标签和文本内容进行处理使之可以用于pack_padded_sequence操作
    Parameters
    ---------
    batch : 每个batch数据

    Returns
    -------
    label_tensor : 每个batch数据文本标签的处理输出
    text_pad : 每个batch数据文本内容的处理输出
    lengths : 每个batch数据文本内容的真实长度
    """
    label_list, text_list, lengths = [], [], []
    for (_label, _text) in batch:
        label_list.append(label_transform(_label))
        processed_text = torch.tensor(text_transform(_text))
        lengths.append(len(processed_text))
        text_list.append(processed_text)
    label_tensor = torch.tensor(label_list)
    text_pad = pad_sequence(text_list, batch_first=False, padding_value=0)
    lengths = torch.tensor(lengths)  # 真实长度
    return text_pad, lengths, label_tensor


train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                              collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)

for text, length, label in train_dataloader:
    print(text)
    print(length)
    print(label)
    break

tensor([[    2,     2,     2,  ...,     2,     2,     2],
        [  642,    13,   984,  ...,    13, 30057,  1966],
        [13731,   440,  5844,  ...,   162,    14,    11],
        ...,
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0]])
tensor([130, 241, 305, 156, 262, 212, 155, 154, 123, 160, 562, 288, 299, 981,
        204, 117, 243, 216,  56, 251, 278, 270, 164, 240, 127, 151, 211, 231,
         63, 213, 169,  79])
tensor([0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 1, 0, 0, 0, 1])


In [96]:
vocal_size, embedding_size = pretrained_embeddings.shape
hidden_size = 256
dropout = 0.5
bidirectional = True
out_size = 2
num_layers = 2
lr = 0.001  # 学习率
weight_decay = 1e-5

In [97]:
from textrnn_model_torch import TextRNN
%run textrnn_model_torch.py

net = TextRNN(vocab_size=vocal_size,
              embedding_size=embedding_size,
              hidden_size=hidden_size,
              num_layers=num_layers,
              dropout_ratio=dropout,
              bidirectional=True,
              out_size=out_size)

In [98]:
# 使用模型预训练词向量矩阵
net.embed.weight.data.copy_(pretrained_embeddings)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [99]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()

In [100]:
from train_evaluate_c import Trainer
%run train_evaluate_c.py

t_and_v = Trainer(net, optimizer, criterion, 5, device=device)

In [101]:
def compute_metrics_acc(predict_all, y_true):
    predict = predict_all.argmax(-1)
    label = y_true
    acc = accuracy_score(label, predict)
    return {"acc": acc}

In [102]:
history = t_and_v.train(train_dataloader, valid_dataloader, compute_metrics=compute_metrics_acc, verbose=50)
history

----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


{'Training loss': [0.6237832903862,
  0.24200162291526794,
  0.12398403882980347,
  0.039155032485723495,
  0.024145226925611496],
 'Training acc': [0.6600571428571429,
  0.9157142857142857,
  0.9568571428571429,
  0.9895428571428572,
  0.9941714285714286],
 'Validation loss': [0.6412099599838257,
  0.33504730463027954,
  0.3482036888599396,
  0.36343148350715637,
  0.4002194106578827],
 'Validation acc': [0.6350666666666667,
  0.864,
  0.872,
  0.8876,
  0.8797333333333334]}

In [105]:
def predict_sentiment(model, sentence):
    """预测句子的评价"""
    model.eval()
    processed_text = torch.tensor(text_transform(sentence)).to(device)
    processed_text = processed_text.unsqueeze(1)
    length = [len(processed_text)]
    prediction = torch.sigmoid(model(processed_text, length))
    return prediction

In [106]:
predict_sentiment(net, "fuck, garbage")

tensor([[0.9321, 0.0544]], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [107]:
predict_sentiment(net, "This film is terrible")  # 倾向于负面评价

tensor([[0.9282, 0.0610]], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [108]:
predict_sentiment(net, "This film is great")  # 倾向于正面评价


tensor([[0.0341, 0.9604]], device='cuda:0', grad_fn=<SigmoidBackward0>)