In [1]:
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data

import sys
# sys.path.append("..") 
import dyngq_utils as dy

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import torch
torch.cuda.set_device(0)

In [2]:
DATA_ROOT = "D:/workingspace/Github/datasets"   

In [3]:
fname = os.path.join(DATA_ROOT, "aclImdb_v1.tar.gz")
if not os.path.exists(os.path.join(DATA_ROOT, "aclImdb")):
    print("从压缩包解压...")
    with tarfile.open(fname, 'r') as f:
        f.extractall(DATA_ROOT)

In [4]:
from tqdm import tqdm
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def read_imdb(folder='train', data_root=DATA_ROOT+"/aclImdb"): 
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    random.shuffle(data)
    return data

train_data, test_data = read_imdb('train'), read_imdb('test')

100%|█████████████████████████████| 12500/12500 [00:00<00:00, 13171.75it/s]
100%|█████████████████████████████| 12500/12500 [00:01<00:00, 11291.79it/s]
100%|█████████████████████████████| 12500/12500 [00:01<00:00, 11150.77it/s]
100%|█████████████████████████████| 12500/12500 [00:01<00:00, 11353.32it/s]


In [5]:
# print(train_data[:10])

In [6]:
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def get_tokenized_imdb(data):
    """
    data: list of [string, label]
    """
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]
    return [tokenizer(review) for review, _ in data]

In [7]:
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)

vocab = get_vocab_imdb(train_data)
'# words in vocab:', len(vocab)

('# words in vocab:', 46152)

In [8]:
print(vocab)

<torchtext.vocab.Vocab object at 0x0000019D7D41E400>


In [9]:
# 本函数已保存在d2lzh_torch包中方便以后使用
def preprocess_imdb(data, vocab):
    max_l = 500  # 将每条评论通过截断或者补0，使得长度变成500

    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data]) # vocab.stoi 返回下标
    labels = torch.tensor([score for _, score in data])
    return features, labels

In [10]:
batch_size = 64
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [11]:
for i in train_iter:
    print(i)

[tensor([[    9,   200,    10,  ...,     0,     0,     0],
        [   94, 11869,  4048,  ...,     0,     0,     0],
        [   10,     7,    32,  ...,     0,     0,     0],
        ...,
        [27304,     0,   271,  ...,     0,     0,     0],
        [   18,     2,   300,  ...,     0,     0,     0],
        [18897,     7,     3,  ...,     0,     0,     0]]), tensor([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1])]
[tensor([[3632, 3839, 5236,  ...,    0,    0,    0],
        [  52,   22,  156,  ...,    0,    0,    0],
        [  57,  534, 1169,  ...,    0,    0,    0],
        ...,
        [ 643,    0,    0,  ...,    0,    0,    0],
        [   9,   26,  102,  ...,    0,    0,    0],
        [  69, 1169,   10,  ...,    0,    0,    0]]), tensor([0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
        1,

        0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0])]
[tensor([[  631,    32,   930,  ...,     0,     0,     0],
        [    9,   200,    10,  ...,     0,     0,     0],
        [    9,   226,  1205,  ...,     0,     0,     0],
        ...,
        [   15,  1665,     0,  ...,     0,     0,     0],
        [   52,     7,     3,  ...,    29,     2, 35127],
        [   82,  1297,    10,  ...,     0,     0,     0]]), tensor([1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
        1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1])]
[tensor([[    9,   223, 19511,  ...,     0,     0,     0],
        [   10,    24,  1003,  ...,     0,     0,     0],
        [   25,    26,     6,  ...,     0,     0,     0],
        ...,
        [  160,    19,  1339,  ...,     0,     0,     0],
        [  109,    13, 29791,  ...,     0,     0,     0],
        [   46,    25,   199,  ...,     0,     0,     0]

        0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1])]
[tensor([[ 100,    5,    2,  ...,    0,    0,    0],
        [7310,  128,   26,  ...,    0,    0,    0],
        [6529,   14,    8,  ..., 5647,    6,  347],
        ...,
        [   9,   85,   23,  ...,    0,    0,    0],
        [  10,   24,  175,  ...,    0,    0,    0],
        [   9,  403,   12,  ...,    0,    0,    0]]), tensor([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
        0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1])]
[tensor([[   10,     7,    23,  ...,     0,     0,     0],
        [   79,    64,    25,  ...,     0,     0,     0],
        [ 1950,     9,   118,  ...,     0,     0,     0],
        ...,
        [    2, 17859, 11706,  ...,     0,     0,     0],
        [16635,     4,    21,  ...,     0,     0,     0],
        [    0,    45,    89,  ...,     0,     0,     0]]), tensor([1, 0, 1, 1, 0, 1, 1, 1, 

        0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0])]
[tensor([[  18,   10,    7,  ...,    0,    0,    0],
        [   0,    0,  157,  ..., 1468,    2, 1838],
        [   8,  127, 6536,  ...,    0,    0,    0],
        ...,
        [ 181, 1159,    3,  ...,    0,    0,    0],
        [ 190,   71, 2154,  ...,    0,    0,    0],
        [  10,   41,   32,  ...,    0,    0,    0]]), tensor([1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
        0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])]
[tensor([[ 133,    9,   86,  ...,    0,    0,    0],
        [2054, 2059,    7,  ...,    0,    0,    0],
        [  10,   14,    3,  ...,    0,    0,    0],
        ...,
        [   9,   14,  945,  ...,    0,    0,    0],
        [   9,  120,  570,  ...,    0,    0,    0],
        [   9, 1710,  214,  ...,    0,    0,    0]]), tensor([1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 

单星号（*）：*agrs
将所以参数以元组(tuple)的形式导入：

此外，单星号的另一个用法是解压参数列表：

双星号（**）：**kwargs
将参数以字典的形式导入

In [12]:
def foo(a, b=10, *args, **kwargs):
    print(a)
    print(b)
    print(args)
    print(kwargs)

In [13]:
foo(1, 2, 3, 4, e=5, f=6, g=7)

1
2
(3, 4)
{'e': 5, 'f': 6, 'g': 7}


In [14]:
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
'#batches:', len(train_iter)

X torch.Size([64, 500]) y torch.Size([64])


('#batches:', 391)

In [50]:
class BiRNN(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # bidirectional设为True即得到双向循环神经网络
        self.encoder = nn.LSTM(input_size=embed_size, 
                                hidden_size=num_hiddens, 
                                num_layers=num_layers,
                                bidirectional=True)
        # 初始时间步和最终时间步的隐藏状态作为全连接层输入
        self.decoder = nn.Linear(4*num_hiddens, 2)

    def forward(self, inputs):
        # inputs的形状是(批量大小, 词数)，因为LSTM需要将序列长度(seq_len)作为第一维，所以将输入转置后
        # 再提取词特征，输出形状为(词数, 批量大小, 词向量维度)
        embeddings = self.embedding(inputs.permute(1, 0))
        # rnn.LSTM只传入输入embeddings，因此只返回最后一层的隐藏层在各时间步的隐藏状态。
        # outputs形状是(词数, 批量大小, 2 * 隐藏单元个数)
        outputs, _ = self.encoder(embeddings) # output, (h, c)
        # 连结初始时间步和最终时间步的隐藏状态作为全连接层输入。它的形状为
        # (批量大小, 4 * 隐藏单元个数)。
        encoding = torch.cat((outputs[0], outputs[-1]), -1)
        outs = self.decoder(encoding)
        return outs

In [51]:
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

In [55]:
# import torch_inspect as ti

In [56]:
# from torchsummary import summary

In [57]:
# net.to(device)

In [58]:
# summary(net,(64,embed_size))

In [59]:
# summary(net,(embed_size,len(vocab)))
# print(embed_size,len(vocab))
# t_p = torch.rand([100,46152])
# print(t_p)
# t_p.permute(1)
def get_parameter_number(net):
    total_num = sum(p.numel() for p in net.parameters())
    trainable_num = sum(p.numel() for p in net.parameters() if p.requires_grad)
    return {'Total': total_num, 'Trainable': trainable_num}
print(get_parameter_number(net))

{'Total': 5019202, 'Trainable': 5019202}


In [29]:
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))

In [30]:
# 本函数已保存在d2lzh_torch包中方便以后使用
def load_pretrained_embedding(words, pretrained_vocab):
    """从预训练好的vocab中提取出words对应的词向量"""
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0]) # 初始化为0
    oov_count = 0 # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed

net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False # 直接加载预训练好的, 所以不需要更新它

There are 21202 oov words.


In [31]:
lr, num_epochs = 0.01, 10
# 要过滤掉不计算梯度的embedding参数
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
dy.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64

torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([40, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size

torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([40, 500])
epoch 1, loss 0.6047, train acc 0.669, test acc 0.736, time 67.0 sec
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.Size([64, 500])
torch.S

KeyboardInterrupt: 

In [20]:
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def predict_sentiment(net, vocab, sentence):
    """sentence是词语的列表"""
    device = list(net.parameters())[0].device
    sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    print(sentence)
    print(net(sentence.view((1, -1))))
    print(label)
    return 'positive' if label.item() == 1 else 'negative'

In [None]:
# for i in vocab:
#     print(i)
# print(len(vocab))

In [21]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great']) # positive

tensor([10, 20,  7, 38, 88], device='cuda:0')
tensor([[-1.7071,  1.5537]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([1], device='cuda:0')


'positive'

In [22]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad']) # negative

tensor([10, 20,  7, 38, 97], device='cuda:0')
tensor([[ 1.9459, -1.9360]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([0], device='cuda:0')


'negative'

In [23]:
print(net)

BiRNN(
  (embedding): Embedding(46152, 100)
  (encoder): LSTM(100, 100, num_layers=2, bidirectional=True)
  (decoder): Linear(in_features=400, out_features=2, bias=True)
)


In [24]:
print(preprocess_imdb(train_data, vocab))

(tensor([[ 4319,  4182,   287,  ...,     0,     0,     0],
        [    2,  1087,    22,  ...,     0,     0,     0],
        [   10,    24, 23011,  ...,     0,     0,     0],
        ...,
        [  347,    87,   142,  ...,     0,     0,     0],
        [    2,  5265, 25428,  ...,     0,     0,     0],
        [   10,     7,    30,  ...,   154,  6119,    34]]), tensor([1, 0, 1,  ..., 1, 0, 0]))


In [72]:
print(*preprocess_imdb(train_data, vocab))

tensor([[ 2596,  4127,     4,  ...,     0,     0,     0],
        [   10,    20,    14,  ...,     0,     0,     0],
        [   10,  2337,    20,  ...,   180,     3, 27992],
        ...,
        [    9,  1710,   140,  ...,  9112,     2,  5837],
        [    9,   200,    30,  ...,     0,     0,     0],
        [ 2188,    79,    25,  ...,     6,    73,    57]]) tensor([1, 1, 0,  ..., 0, 0, 0])
