In [19]:
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import IMDB
import spacy
import torch
import torchtext
import torch.utils.data as Data
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [20]:
train_iter = IMDB(split='train')

In [21]:
spacy_en = spacy.load('en_core_web_sm')


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield [tok.text for tok in spacy_en.tokenizer(text)]  # 分词


vocab = build_vocab_from_iterator(yield_tokens(train_iter))  # Build a Vocab from an iterator.

In [22]:
vocab.insert_token("<unk>", 0)
vocab.insert_token("<pad>", 1)
vocab.insert_token("<SOS>", 2)
vocab.insert_token("<EOS>", 3)
vocab.set_default_index(0)

In [23]:
# 预训练词向量
vec1 = torchtext.vocab.Vectors(name="glove.6B.200d.txt",
                               max_vectors=25000,
                               cache=r'C:\Users\duanm\Music\GitHubProjects\MLNote\E_PyTorch\高阶操作及深度学习相关理论\torchtext自然语言处理\.vector_cache')

print(vec1.vectors.shape)
print(vec1.vectors)

torch.Size([25000, 200])
tensor([[-0.0715,  0.0935,  0.0237,  ...,  0.3362,  0.0306,  0.2558],
        [ 0.1765,  0.2921, -0.0021,  ..., -0.2077, -0.2319, -0.1081],
        [ 0.1229,  0.5804, -0.0696,  ..., -0.0392, -0.1624, -0.0967],
        ...,
        [-0.0020,  0.0202, -0.0244,  ...,  0.0142, -0.8224, -0.3703],
        [ 0.1291, -0.2605,  0.0139,  ...,  0.1384, -0.0146,  0.4337],
        [-0.7300,  0.5164, -0.5798,  ...,  0.3581,  1.1576,  0.2573]])


In [24]:
pretrained_embeddings = vec1.get_vecs_by_tokens(vocab.get_itos())

print(pretrained_embeddings.shape)
print(pretrained_embeddings)

torch.Size([121068, 200])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [25]:
def to_map_style_dataset(iter_data):
    r"""Convert iterable-style dataset to map-style dataset.
    """

    class _MapStyleDataset(Data.Dataset):

        def __init__(self, iter_data):
            # TODO Avoid list issue #1296
            self._data = list(iter_data)

        def __len__(self):
            return len(self._data)

        def __getitem__(self, idx):
            return self._data[idx]

    return _MapStyleDataset(iter_data)

In [None]:
train_iter, test_iter = IMDB(split=('train', 'test'))

train_data = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

In [27]:
num_train = int(len(train_data) * 0.7)
train_dataset, valid_dataset = random_split(train_data,
                                            [num_train, len(train_data) - num_train])  # 划分数据集

In [28]:
text_transform = lambda x: [vocab['<SOS>']] + [vocab[token] for token in
                                               [tok.text for tok in spacy_en.tokenizer(x)]] + [vocab['<EOS>']]
label_transform = lambda x: 1.0 if x == 'pos' else 0.0

In [29]:
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for (_label, _text) in batch:
        label_list.append(label_transform(_label))
        processed_text = torch.tensor(text_transform(_text))
        lengths.append(len(processed_text))
        text_list.append(processed_text)
    return torch.tensor(label_list), pad_sequence(text_list, batch_first=False, padding_value=0), torch.tensor(lengths)


train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True,
                              collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch)

In [30]:
vocal_size, embedding_size = pretrained_embeddings.shape
hidden_size = 256
dropout = 0.5
bidirectional = True
out_size = 1
num_layers = 2
lr = 0.001  # 学习率
weight_decay = 1e-5

In [31]:
from BRNNModel import BRNN
% run BRNNModel.py

net = BRNN(vocal_size=vocal_size,
           embedding_size=embedding_size,
           hidden_size=hidden_size,
           num_layers=num_layers,
           dropout=dropout,
           bidirectional=True,
           out_size=out_size)

In [32]:
# 使用预训练词向量
net.embed.weight.data.copy_(pretrained_embeddings)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
criterion = nn.BCEWithLogitsLoss()  # 二分类的损失函数

In [34]:
from train_evaluate_change import Train_Evaluate
% run train_evaluate_change.py

t_and_v = Train_Evaluate(net, optimizer, criterion, 5, device=device)

In [35]:
def accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).to(torch.float32)
    acc = correct.sum() / len(correct)
    return acc

In [37]:
history = t_and_v.train_eval(train_dataloader, valid_dataloader, verbose=50, metric=accuracy)
history

----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


{'train_loss': [0.41986435651779175,
  0.2845507860183716,
  0.25453224778175354,
  0.11173132807016373,
  0.0711987167596817],
 'val_loss': [0.45011159777641296,
  0.36576783657073975,
  0.390453964471817,
  0.3022767901420593,
  0.4003587067127228],
 'train_accuracy': [0.8184571266174316,
  0.8875428438186646,
  0.904285728931427,
  0.968742847442627,
  0.9800000190734863],
 'val_accuracy': [0.8044000267982483,
  0.8442666530609131,
  0.8447999954223633,
  0.8902666568756104,
  0.8713333606719971]}

In [84]:
def predict_sentiment(model, sentence):
    """预测句子的评价"""
    model.eval()
    processed_text = torch.tensor(text_transform(sentence)).to(device)
    processed_text = processed_text.unsqueeze(1)
    length = [len(processed_text)]
    prediction = torch.sigmoid(model(processed_text, length))
    return prediction.item()

In [85]:
predict_sentiment(net, "fuck, garbage")

0.048988379538059235

In [86]:
predict_sentiment(net, "This film is terrible")  # 越接近与0,越能代表为负面评价

0.14703738689422607

In [90]:
predict_sentiment(net, "This film is great")  # 越接近与1,越能代表为正面评价


0.9958744645118713