### **NLP应用——情感分析**

In [1]:
import torch
from torch import nn
from d2l import torch as d2l
from net_frame import *

**一、数据预处理**

In [2]:
import os
from tqdm import tqdm # 使用会减慢循环效率
import time

# 方法1
def get_raw_data(data_path = 'aclImdb',istrain = True):
    """Read raw review text and sentiment labels from a pos/neg directory tree.

    Files are read from ``<data_path>/train/<label>`` when ``istrain`` is true
    and from ``<data_path>/test/<label>`` otherwise, for ``label`` in
    ``'pos'`` then ``'neg'``. Every line of every file becomes one sample
    (stripped of surrounding whitespace).

    Args:
        data_path: Root directory of the dataset.
        istrain: Selects the ``train`` (True) or ``test`` (False) sub-directory.

    Returns:
        A ``(data, labels)`` tuple of two equal-length lists: the stripped
        text lines and their labels (1 for 'pos', 0 for 'neg'). All 'pos'
        samples come before all 'neg' samples.
    """
    start = time.perf_counter()
    path_prefix = os.path.join(data_path,'train' if istrain else 'test')
    data = []
    labels = []
    for label in ['pos','neg']:
        folder = os.path.join(path_prefix,label)
        label_id = 1 if label == 'pos' else 0

        # Scan the directory once and reuse the listing for both the
        # iteration and the progress-bar total.
        filenames = os.listdir(folder)

        # The description is passed up front; per the original author's note
        # this is far faster than calling loop.set_description() inside the loop.
        loop = tqdm(filenames,total = len(filenames),desc = 'read ' + label)
        for filename in loop:
            filepath = os.path.join(folder,filename)
            with open(filepath,'r',encoding = 'utf-8') as file:
                for line in file:
                    data.append(line.strip())
                    labels.append(label_id)
    elapsed = time.perf_counter() - start
    print(f"Load data use time:{elapsed}s")
    return data,labels

# 方法2
def get_raw_data2(data_path = 'aclImdb',istrain = True):
    """Read raw review text and labels, enumerating ``*.txt`` files with glob.

    Files matching ``<data_path>/<train|test>/<label>/*.txt`` are read for
    ``label`` in ``'pos'`` then ``'neg'``. Every line of every matched file
    becomes one sample (stripped of surrounding whitespace).

    Args:
        data_path: Root directory of the dataset.
        istrain: Selects the ``train`` (True) or ``test`` (False) sub-directory.

    Returns:
        A ``(data, labels)`` tuple of two equal-length lists: the stripped
        text lines and their labels (1 for 'pos', 0 for 'neg').
    """
    # Imported after the docstring so the string above really is the docstring.
    import glob
    start = time.perf_counter()
    path_prefix = os.path.join(data_path,'train' if istrain else 'test')
    data = []
    labels = []
    for label in ['pos','neg']:
        pattern = os.path.join(path_prefix,label,"*.txt")
        label_id = 1 if label == 'pos' else 0
        # Expand the pattern once; reuse the result for iteration and total.
        filenames = glob.glob(pattern)
        loop = tqdm(filenames,total = len(filenames),desc = 'read ' + label)
        for filename in loop:
            with open(filename,'r',encoding = 'utf-8') as file:
                for line in file:
                    data.append(line.strip())
                    labels.append(label_id)
            # Kept deliberately: this variant updates the description on every
            # iteration (without forcing a redraw).
            loop.set_description(f"read {label}",refresh = False)
    elapsed = time.perf_counter() - start
    print(f"Load data use time:{elapsed}s")
    return data,labels

# Load the training split with the defaults (data_path = 'aclImdb', istrain = True);
# labels are 1 for 'pos' and 0 for 'neg'. The test split is left disabled below.
train_data,train_labels = get_raw_data()
# test_data,test_labels = get_raw_data(istrain = False)

read pos:   0%|          | 0/12500 [00:00<?, ?it/s]

read pos: 100%|██████████| 12500/12500 [00:00<00:00, 58309.81it/s]
read neg: 100%|██████████| 12500/12500 [00:00<00:00, 72940.97it/s]

Load data use time:0.4356949641369283s





In [3]:
# Check that the data was read completely: the number of samples and the
# number of labels are printed and should match.
print(len(train_data))
print(len(train_labels))
# print(len(test_data))
# print(len(test_labels))

25000
25000


In [4]:
# Tokenize the raw text.
# NOTE(review): `tokenize` is not defined in this notebook — presumably it comes
# from the `net_frame` star-import and splits into words for token = 'word'; confirm.
source = tokenize(train_data,token = 'word')

In [5]:
# Build the vocabulary (text classification only needs <pad> as a reserved token).
# NOTE(review): `Vocab` is not defined in this notebook — presumably provided by
# the `net_frame` star-import; confirm.
source_vocab = Vocab(source,reserved_tokens = ['<pad>'])
# target_vocab = Vocab(target,reserved_tokens = ['<pad>'])

In [6]:
def build_array(text,vocab,num_steps):
    """Map tokenized text to a fixed-width int32 index tensor.

    Args:
        text: Iterable of token sequences; each one is looked up via ``vocab[...]``.
        vocab: Object indexable by a token sequence and by the string ``'<pad>'``.
        num_steps: Target length handed to ``truncate_pad`` for every sequence.

    Returns:
        ``torch.int32`` tensor built from the truncated/padded index rows.
    """
    padded_rows = []
    for tokens in text:
        indices = vocab[tokens]
        # Each row is cut or padded (with the <pad> index) to num_steps entries.
        row = truncate_pad(indices,num_steps,padding_token = vocab['<pad>'])
        padded_rows.append(row)
    return torch.tensor(padded_rows,dtype = torch.int32)

In [7]:
# Convert the tokenized text and the labels to tensors.
num_steps = 500  # fixed sequence length every review is truncated/padded to
# Pass the variable rather than a second hard-coded 500, so changing
# num_steps above actually takes effect.
source_arrays = build_array(source,source_vocab,num_steps = num_steps)
# Labels become a column vector: one row per sample.
target_arrays = torch.tensor(train_labels).reshape(-1,1)
print(source_arrays.shape)
print(target_arrays.shape)

torch.Size([25000, 500])
torch.Size([25000, 1])


In [8]:
# Build the training data iterator.
# NOTE(review): `data` is not defined in this notebook — presumably
# torch.utils.data via the `net_frame` star-import; confirm.
batch_size = 64
# get_raw_data() appends every 'pos' sample before any 'neg' sample, so the
# loader must shuffle; otherwise each mini-batch contains a single class.
train_iter = data.DataLoader(
    data.TensorDataset(source_arrays,target_arrays),batch_size,shuffle = True)

In [None]:
# Check that the data loads correctly: inspect the first batch only.
for batch in train_iter:
    X,Y = batch
    print(X.shape,Y.shape)
    print(X)
    # Decode the first sample back into tokens (each followed by a space).
    # A dedicated name is used so the builtin `str` is not shadowed for the
    # rest of the kernel session.
    decoded = "".join(source_vocab.to_tokens(idx.item()) + " " for idx in X[0])
    print(decoded)
    break

torch.Size([1, 500]) torch.Size([1, 1])
tensor([[     9,    388,      6,     67, 109697,   6092,     26,   1354,     51,
              6,    543,      4,     14,     81,   3598,   6696,     19,    139,
            342,     30,   4299,   2688,     14,   1697,      4,      2,     77,
            457,     14,   2785,    503,    436,   4082,   2867,    125,     23,
            747,    514,    342,      5,    139,     15,    116,     15,   6172,
             56,    154,     28,   1769,     10,     91,     16,   1710,    889,
          10178,     27,   1632,      4,     11,     21,      7,      3,    468,
            709,      5,    475,    678,   1165,     11,    469,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,     

**二、使用RNN架构去搭建模型**

In [9]:
# Sanity check: for a 2-D tensor, `.T` and `permute(1, 0)` agree elementwise.
x = torch.rand(100,50)
print(x.T == x.permute(1,0))

tensor([[True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        ...,
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True]])


In [10]:
class BiRNN(nn.Module):
    """Bidirectional LSTM sequence classifier with two output scores.

    Token indices are embedded, fed through a bidirectional LSTM, and the LSTM
    outputs at the first and last time steps are concatenated and projected
    by a linear layer.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens,
                 num_layers, **kwargs):
        """Create the embedding, the bidirectional LSTM and the output layer.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_size: Embedding dimension (LSTM input size).
            num_hiddens: LSTM hidden size per direction.
            num_layers: Number of stacked LSTM layers.
            **kwargs: Forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # bidirectional=True doubles the feature size of the LSTM outputs.
        self.encoder = nn.LSTM(
            embed_size,
            num_hiddens,
            num_layers=num_layers,
            bidirectional=True,
        )
        # Two time steps x two directions -> 4 * num_hiddens input features.
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        """Return the output scores for a batch of index sequences.

        Args:
            inputs: Index tensor of shape (batch size, number of time steps).

        Returns:
            Tensor of shape (batch size, 2).
        """
        # The LSTM expects time as the first axis, so transpose before the
        # lookup: embedded is (time steps, batch size, embed_size).
        time_major = inputs.T
        embedded = self.embedding(time_major)
        self.encoder.flatten_parameters()
        # Top-layer outputs at every time step:
        # (time steps, batch size, 2 * num_hiddens). The final states are unused.
        hidden_seq = self.encoder(embedded)[0]
        first_step, last_step = hidden_seq[0], hidden_seq[-1]
        # Concatenate along the feature axis: (batch size, 4 * num_hiddens).
        features = torch.cat((first_step, last_step), dim=1)
        return self.decoder(features)


In [11]:
# 权重初始化函数
def init_weights(m):
    """Xavier-uniform initialise the weights of ``nn.Linear`` / ``nn.LSTM`` modules.

    Only exact ``nn.Linear`` and ``nn.LSTM`` instances are touched (the check
    is on ``type(m)``, so subclasses and other modules are left as they are).
    Bias parameters are never modified.

    Args:
        m: A module, e.g. as passed by ``nn.Module.apply``.
    """
    module_kind = type(m)
    if module_kind == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
    if module_kind == nn.LSTM:
        # Flat parameter names whose name contains "weight"; bias names are skipped.
        weight_names = [name for name in m._flat_weights_names if "weight" in name]
        for name in weight_names:
            nn.init.xavier_uniform_(m._parameters[name])