In [2]:
import collections 
import d2lzh as d2l
from mxnet import gluon,init,nd
from mxnet.contrib import text
from mxnet.gluon import data as gdata,loss as gloss,nn,rnn,utils as gutils
import os 
import random
import tarfile



In [3]:
# Load the dataset

def download_imdb(data_dir = "../data/"):
    """Extract the IMDb archive ``aclImdb_v1.tar.gz`` into ``data_dir``.

    The archive is looked up inside ``data_dir`` (previously the path was
    hard-coded, so the argument was silently ignored). If it is not there,
    it is downloaded from the Stanford URL below and checked against its
    SHA-1 before extraction.

    Args:
        data_dir: Directory that holds the archive and receives the
            extracted files.

    Raises:
        ValueError: If an archive member's path resolves outside
            ``data_dir``.
    """
    url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    sha1 = "01ada507287d82875905620988597833ad4e0903"
    fname = os.path.join(data_dir, "aclImdb_v1.tar.gz")

    if not os.path.exists(fname):
        # gutils.download treats an existing directory as the download
        # folder, so make sure the directory exists first.
        os.makedirs(data_dir, exist_ok=True)
        fname = gutils.download(url, data_dir, sha1_hash=sha1)

    root = os.path.realpath(data_dir)
    with tarfile.open(fname, 'r') as f:
        # Refuse members such as "../x" or absolute paths that would be
        # written outside the target directory.
        for member in f.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if os.path.commonpath([root, target]) != root:
                raise ValueError(
                    "archive member escapes target directory: " + member.name)
        f.extractall(data_dir)
        
# Unpack the archive using the default data directory ("../data/").
download_imdb()

In [4]:
# Read the training and test sets; each sample is a review with its label: 1 = positive, 0 = negative
def read_imdb(folder = 'train', data_dir = "../data/aclImdb/"):
    """Load one split of the IMDb reviews as shuffled ``[review, label]`` pairs.

    Every file under ``<data_dir>/<folder>/pos`` and ``<data_dir>/<folder>/neg``
    is read as UTF-8, newlines are removed and the text is lower-cased.

    Args:
        folder: Split sub-directory to read, e.g. ``'train'`` or ``'test'``.
        data_dir: Root of the extracted dataset. Defaults to the location
            that was previously hard-coded.

    Returns:
        A list of ``[review_text, label]`` lists in random order, where the
        label is 1 for files under ``pos`` and 0 for files under ``neg``.
    """
    data = []
    for label in ['pos','neg']:
        folder_name = os.path.join(data_dir, folder, label)
        # The label depends only on the directory, so compute it once.
        score = 1 if label == 'pos' else 0
        for file_name in os.listdir(folder_name):
            with open(os.path.join(folder_name, file_name), 'rb') as f:
                review = f.read().decode("utf-8").replace("\n","").lower()
            data.append([review, score])
    random.shuffle(data)
    return data
        
# Load both splits: the training reviews first, then the test reviews.
train_data = read_imdb("train")
test_data = read_imdb("test")

In [5]:
# Preprocess the data

def get_tokenized_imdb(data):
    """Split every review into lower-cased tokens on single spaces.

    Args:
        data: Iterable of ``(review, label)`` pairs; the label is ignored.

    Returns:
        A list with one token list per review. Consecutive spaces produce
        empty-string tokens because the split is on ``" "`` exactly.
    """
    tokenized = []
    for review, _label in data:
        words = review.split(" ")
        tokenized.append([word.lower() for word in words])
    return tokenized

# Build the vocabulary
def get_vocab_imdb(data):
    """Create a vocabulary from the tokens of all reviews in ``data``.

    Tokens are counted across every review; the counts are handed to
    ``text.vocab.Vocabulary`` with ``min_freq = 5`` and ``'<pad>'`` as a
    reserved token.

    Args:
        data: Iterable of ``(review, label)`` pairs.

    Returns:
        The ``text.vocab.Vocabulary`` built from the token counts.
    """
    token_counts = collections.Counter()
    for tokens in get_tokenized_imdb(data):
        token_counts.update(tokens)
    return text.vocab.Vocabulary(
        token_counts,
        min_freq = 5,
        reserved_tokens = ['<pad>'],
    )

# Build the vocabulary from the training reviews only.
vocab = get_vocab_imdb(train_data)

# Bare tuple expression: the notebook displays the label and the vocabulary size.
'# words in vocab:', len(vocab)

('# words in vocab:', 46152)

In [6]:
# Reviews differ in length, so they cannot be stacked into a minibatch directly.
# Convert tokens to vocabulary indices, then truncate or pad with '<pad>' so that
# every review has the same fixed length (500 by default).

def preprocess_imdb(data,vocab, max_l = 500):
    """Turn reviews into fixed-length index sequences plus a label array.

    Args:
        data: Sequence of ``(review, label)`` pairs.
        vocab: Vocabulary providing ``to_indices`` and a ``token_to_idx``
            entry for ``'<pad>'``.
        max_l: Length every review is truncated or padded to. Defaults to
            500, the value that was previously hard-coded.

    Returns:
        ``(features, labels)`` as ``nd.array`` objects: one row of
        ``max_l`` token indices per review, and one label per review.

    Raises:
        KeyError: If ``vocab`` has no ``'<pad>'`` token.
    """
    # Look the padding index up once instead of once per short review.
    pad_idx = vocab.token_to_idx['<pad>']

    def pad(x):
        # A non-positive repeat count yields an empty list, so reviews of
        # max_l tokens or more are only truncated.
        return x[:max_l] + [pad_idx] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    features = nd.array([pad(vocab.to_indices(x)) for x in tokenized_data])
    labels = nd.array([score for _,score in data])

    return features,labels

In [7]:
# Create the data iterators; each iteration yields one minibatch of data
batch_size = 64
# preprocess_imdb returns (features, labels); unpack them into the dataset.
train_set = gdata.ArrayDataset(*preprocess_imdb(train_data,vocab))
test_set = gdata.ArrayDataset(*preprocess_imdb(test_data, vocab))
# Only the training loader is created with shuffle = True.
train_iter = gdata.DataLoader(train_set, batch_size, shuffle = True)
test_iter = gdata.DataLoader(test_set, batch_size)


In [8]:
# Print the shapes of the first minibatch only, then stop iterating.
for X,y in train_iter:
    print("X",X.shape, "y",y.shape)
    break
    
# Bare tuple expression: the notebook displays the label and the loader length.
"# batches:",len(train_iter)

X (64, 500) y (64,)


('# batches:', 391)

In [9]:
# Recurrent neural network model.
# Each word is first mapped to a feature vector by an embedding layer.
# A bidirectional recurrent network then encodes the feature sequence.
# Finally, a fully connected layer turns the encoded sequence into the output:
# the hidden states of the bidirectional LSTM at the first and the last time
# step are concatenated and passed to the output layer for classification.

class BiRNN(nn.Block):
    """Bidirectional LSTM classifier with two output classes.

    Args:
        vocab: Vocabulary; only its length is used, to size the embedding.
        embed_size: Dimension of each word vector.
        num_hiddens: Number of hidden units of the LSTM.
        num_layers: Number of stacked LSTM layers.
        **kwargs: Forwarded to ``nn.Block``.
    """

    def __init__(self,vocab,embed_size, num_hiddens, num_layers, **kwargs):
        super().__init__(**kwargs)
        vocab_size = len(vocab)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # bidirectional = True makes the LSTM bidirectional.
        self.encoder = rnn.LSTM(
            num_hiddens,
            num_layers = num_layers,
            bidirectional = True,
            input_size = embed_size,
        )
        self.decoder = nn.Dense(2)

    def forward(self, inputs):
        """Compute class scores for a batch of index sequences.

        ``inputs`` has shape (batch size, number of words).
        """
        # The LSTM expects the sequence on the first axis, so transpose before
        # the lookup; the result is (words, batch size, embedding dimension).
        token_vectors = self.embedding(inputs.T)
        # Only the inputs are passed to the LSTM, so it returns just the last
        # layer's hidden states at every time step:
        # (words, batch size, 2 * hidden units).
        hidden_states = self.encoder(token_vectors)
        # Join the first and last time steps as input to the dense layer:
        # (batch size, 4 * hidden units).
        first_step, last_step = hidden_states[0], hidden_states[-1]
        return self.decoder(nd.concat(first_step, last_step))
    
    

In [10]:
# Create a bidirectional recurrent network with two hidden layers.
embed_size = 100
num_hiddens = 100
num_layers = 2
ctx = d2l.try_all_gpus()

net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
net.initialize(init.Xavier(), ctx = ctx)

In [None]:
# Load pretrained word vectors (100-dimensional GloVe file) for the tokens in vocab
glove_embedding = text.embedding.create('glove', pretrained_file_name = 'glove.6B.100d.txt',vocabulary = vocab)

Downloading C:\Users\86150\AppData\Roaming\mxnet\embeddings\glove\glove.6B.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/glove/glove.6B.zip...


In [None]:
# Use these pretrained vectors as the feature vector of every word in the reviews.
net.embedding.weight.set_data(glove_embedding.idx_to_vec)
# Set grad_req to "null" on the embedding parameters; the intent appears to be
# keeping the pretrained vectors fixed during training.
net.embedding.collect_params().setattr("grad_req", "null")

In [None]:
# Train the model
lr,num_epochs = 0.01,5
# The Gluon optimizer wrapper is `gluon.Trainer`; `gluon.Train` does not exist
# and raised AttributeError.
trainer = gluon.Trainer(net.collect_params(),"adam",{"learning_rate":lr})
# The loss class is `SoftmaxCrossEntropyLoss`; `SoftmaxEntropyLoss` does not exist.
loss = gloss.SoftmaxCrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)