In [32]:
import torch.nn as nn
import torch.optim as optim
import spacy
import torch
import torch.utils.data as Data
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score
from datasets import load_dataset
import collections
from tqdm.notebook import tqdm

In [33]:
data = load_dataset("stanfordnlp/imdb")
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [34]:
class Vocab:
    """
    Vocabulary for text
    """

    def __init__(self, tokens=None, min_freq=2, reserved_tokens=None):
        # tokens: 单词tokens
        # min_freq: The minimum frequency needed to include a token in the vocabulary.
        # reserved_tokens: 自定义tokens
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = collections.Counter(tokens)
        # Sort according to frequencies
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # The index for the unknown token is 0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)  # 未在字典中则返回'<unk>'
        return [self.__getitem__(token) for token in tokens]  # 递归

    def to_tokens(self, indices):
        """第indices位置处的token"""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        """Index for the unknown token"""
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs


spacy_en = spacy.load('en_core_web_sm')

split_list = []
for i in data['train']:
    split_list.extend([tok.text for tok in spacy_en.tokenizer(i['text'])])

vocab = Vocab(split_list, min_freq=1, reserved_tokens=['<pad>', '<SOS>', '<EOS>'])

In [35]:
class Vectors:
    def __init__(self, name, max_vectors=None) -> None:
        self.vectors = None
        self.name = name
        self.max_vectors = max_vectors
        self.itos = None
        self.stoi = None
        self.cache()

    def cache(self):
        with open(self.name, "r", encoding='utf-8') as f:
            read_value = f.readlines()

        all_value, itos = [], []
        for i in tqdm(range(len(read_value))):
            l_split = read_value[i].split(' ')
            itos.append(l_split[0])
            all_value.append([float(i.strip()) for i in l_split[1: ]])
        all_value = torch.tensor(all_value)
        self.vectors = all_value
        self.itos = itos
        num_lines = len(self.vectors)
        if not self.max_vectors or self.max_vectors > num_lines:
            self.max_vectors = num_lines
        self.vectors = self.vectors[:self.max_vectors, :]
        self.itos = self.itos[:self.max_vectors]
        self.stoi = {word: i for i, word in enumerate(self.itos)}

    def __len__(self):
        return len(self.vectors)
    
    def __getitem__(self, token):
        if token in self.stoi:
            return self.vectors[self.stoi[token]]
        else:
            dim = self.vectors.shape[1]
            return torch.Tensor.zero_(torch.Tensor(dim))
        
    def get_vecs_by_tokens(self, tokens):
        indices = [self[token] for token in tokens]
        vecs = torch.stack(indices)
        return vecs

# 预训练词向量
vec1 = Vectors(name="glove.6B.200d.txt", max_vectors=25000)
print(vec1.vectors.shape)
print(vec1.vectors)

  0%|          | 0/400000 [00:00<?, ?it/s]

torch.Size([25000, 200])
tensor([[-0.0715,  0.0935,  0.0237,  ...,  0.3362,  0.0306,  0.2558],
        [ 0.1765,  0.2921, -0.0021,  ..., -0.2077, -0.2319, -0.1081],
        [ 0.1229,  0.5804, -0.0696,  ..., -0.0392, -0.1624, -0.0967],
        ...,
        [-0.0020,  0.0202, -0.0244,  ...,  0.0142, -0.8224, -0.3703],
        [ 0.1291, -0.2605,  0.0139,  ...,  0.1384, -0.0146,  0.4337],
        [-0.7300,  0.5164, -0.5798,  ...,  0.3581,  1.1576,  0.2573]])


In [36]:
pretrained_embeddings = vec1.get_vecs_by_tokens(vocab.idx_to_token)

print(pretrained_embeddings.shape)
print(pretrained_embeddings)  # 模型词向量矩阵

torch.Size([121069, 200])
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.4552,  0.2761, -0.3108,  ...,  0.6674, -0.2191,  0.3745],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])


In [37]:
def to_map_style_dataset(iter_data):
    r"""Convert iterable-style dataset to map-style dataset.
    """

    class _MapStyleDataset(Data.Dataset):

        def __init__(self, iter_data):
            self._data = list(iter_data)

        def __len__(self):
            return len(self._data)

        def __getitem__(self, idx):
            return self._data[idx]

    return _MapStyleDataset(iter_data)

In [38]:
train_iter, test_iter = data['train'], data['test'] 

train_data = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

In [39]:
num_train = int(len(train_data) * 0.7)
train_dataset, valid_dataset = random_split(train_data,
                                            [num_train, len(train_data) - num_train])  # 划分数据集

In [40]:
text_transform = lambda x: [vocab['<SOS>']] + [vocab[token] for token in
                                               [tok.text for tok in spacy_en.tokenizer(x)]] + [vocab['<EOS>']]

In [41]:
def collate_batch(batch):
    """
    对文本标签和文本内容进行处理使之可以用于pack_padded_sequence操作
    Parameters
    ---------
    batch : 每个batch数据

    Returns
    -------
    label_tensor : 每个batch数据文本标签的处理输出
    text_pad : 每个batch数据文本内容的处理输出
    lengths : 每个batch数据文本内容的真实长度
    """
    label_list, text_list, lengths = [], [], []
    for i in batch:
        _text, _label = i['text'], i['label']
        label_list.append(_label)
        processed_text = torch.tensor(text_transform(_text))
        lengths.append(len(processed_text))
        text_list.append(processed_text)
    label_tensor = torch.tensor(label_list)
    text_pad = pad_sequence(text_list, batch_first=False, padding_value=0)
    lengths = torch.tensor(lengths)  # 真实长度
    return text_pad, lengths, label_tensor


train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True,
                              collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch)

for text, length, label in train_dataloader:
    print(text)
    print(length)
    print(label)
    break

tensor([[    2,     2,     2,  ...,     2,     2,     2],
        [   27,  3707,    13,  ...,   597, 53216,   360],
        [29471,   102,   251,  ...,   102,     5,  1117],
        ...,
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0]])
tensor([130, 128, 714, 234, 148, 336, 464, 541, 325, 118,  78, 143, 298, 145,
        176, 259, 251, 184, 166, 171, 237, 151, 139, 393, 201, 308, 141, 370,
        235, 370, 387, 230, 155, 135, 162, 363, 430, 192, 161, 116, 131, 835,
        215, 480, 157, 241, 148, 231, 149, 124, 262,  77, 710, 301, 345, 474,
        248, 167, 533, 224, 180, 120,  88, 531])
tensor([0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0])


In [42]:
vocal_size, embedding_size = pretrained_embeddings.shape
hidden_size = 256
dropout = 0.5
bidirectional = True
out_size = 2
num_layers = 2
lr = 0.001  # 学习率
weight_decay = 1e-5

In [43]:
from textrnn_model_torch import TextRNN
%run textrnn_model_torch.py

net = TextRNN(vocab_size=vocal_size,
              embedding_size=embedding_size,
              hidden_size=hidden_size,
              num_layers=num_layers,
              dropout_ratio=dropout,
              bidirectional=True,
              out_size=out_size)

In [44]:
# 使用模型预训练词向量矩阵
net.embed.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.4552,  0.2761, -0.3108,  ...,  0.6674, -0.2191,  0.3745],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [45]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()

In [46]:
from train_evaluate_c import Trainer
%run train_evaluate_c.py

t_and_v = Trainer(net, optimizer, criterion, 5, device=device)

In [47]:
def compute_metrics_acc(predict_all, y_true):
    predict = predict_all.argmax(-1)
    label = y_true
    acc = accuracy_score(label, predict)
    return {"acc": acc}

In [48]:
history = t_and_v.train(train_dataloader, valid_dataloader, compute_metrics=compute_metrics_acc, verbose=50)
history

----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


{'Training loss': [0.5827103853225708,
  0.33858340978622437,
  0.28198134899139404,
  0.08237747848033905,
  0.03502501919865608],
 'Training acc': [0.646,
  0.8652571428571428,
  0.9044571428571428,
  0.9729142857142857,
  0.9910857142857142],
 'Validation loss': [0.6017845869064331,
  0.4423207640647888,
  0.4647025465965271,
  0.3755393922328949,
  0.40965747833251953],
 'Validation acc': [0.6261333333333333,
  0.8298666666666666,
  0.8545333333333334,
  0.8714666666666666,
  0.8692]}

In [49]:
def predict_sentiment(model, sentence):
    """预测句子的评价"""
    model.eval()
    processed_text = torch.tensor(text_transform(sentence)).to(device)
    processed_text = processed_text.unsqueeze(1)
    length = [len(processed_text)]
    prediction = torch.sigmoid(model(processed_text, length))
    return prediction

In [50]:
predict_sentiment(net, "fuck, garbage")

tensor([[0.6829, 0.3112]], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [51]:
predict_sentiment(net, "This film is terrible")  # 倾向于负面评价

tensor([[0.8031, 0.1820]], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [52]:
predict_sentiment(net, "This film is great")  # 倾向于正面评价

tensor([[0.0947, 0.9147]], device='cuda:0', grad_fn=<SigmoidBackward0>)