# 中文情感分析

In [1]:
!pip install --upgrade paddlenlp

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


## 1. 导入数据

In [2]:
from paddlenlp.datasets import load_dataset

# Load the train / dev / test splits of the ChnSentiCorp sentiment dataset.
train_data, dev_data, test_data = load_dataset('chnsenticorp', splits=['train', 'dev', 'test'])

# Peek at the first five training examples (dicts with 'text', 'label' and 'qid' keys).
for data in train_data[:5]:
    print(data)

{'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般', 'label': 1, 'qid': ''}
{'text': '15.4寸笔记本的键盘确实爽，基本跟台式机差不多了，蛮喜欢数字小键盘，输数字特方便，样子也很美观，做工也相当不错', 'label': 1, 'qid': ''}
{'text': '房间太小。其他的都一般。。。。。。。。。', 'label': 0, 'qid': ''}
{'text': '1.接电源没有几分钟,电源适配器热的不行. 2.摄像头用不起来. 3.机盖的钢琴漆，手不能摸，一摸一个印. 4.硬盘分区不好办.', 'label': 0, 'qid': ''}
{'text': '今天才知道这书还有第6卷,真有点郁闷:为什么同一套书有两种版本呢?当当网是不是该跟出版社商量商量,单独出个第6卷,让我们的孩子不会有所遗憾。', 'label': 1, 'qid': ''}


## 2. 构建词表

In [3]:
from collections import defaultdict
import re

from paddlenlp import Taskflow
import numpy as np
import paddle

# Use PaddleNLP's Taskflow word-segmentation task as the tokenizer for vocabulary building.
word_segmenter = Taskflow('word_segmentation')
# Smoke test: segment one mixed Chinese/English sentence and display the resulting word list.
word_segmenter('PaddleNLP的Taskflow是一个全能的NLP工具')

['PaddleNLP', '的', 'Taskflow', '是', '一个', '全能', '的', 'NLP', '工具']

In [4]:
# Gather the raw text of every training and dev example into one corpus,
# training examples first, then dev examples.
texts = [
    example['text']
    for split in (train_data, dev_data)
    for example in split
]

texts[:5]

['选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
 '15.4寸笔记本的键盘确实爽，基本跟台式机差不多了，蛮喜欢数字小键盘，输数字特方便，样子也很美观，做工也相当不错',
 '房间太小。其他的都一般。。。。。。。。。',
 '1.接电源没有几分钟,电源适配器热的不行. 2.摄像头用不起来. 3.机盖的钢琴漆，手不能摸，一摸一个印. 4.硬盘分区不好办.',
 '今天才知道这书还有第6卷,真有点郁闷:为什么同一套书有两种版本呢?当当网是不是该跟出版社商量商量,单独出个第6卷,让我们的孩子不会有所遗憾。']

In [5]:
from typing import List, Dict

# Stop words: particles and punctuation that are dropped while counting word frequencies.
stopwords = {
    '的', '吗', '吧', '呀', '呜', '呢', '呗',
    ',', '，', '。', '？', '.', ';', ':', '!', ' ',
}

def build_vocab(texts: List[str],
                stopwords: List[str]=None,
                num_words: int=None,
                min_freq: int=10,
                unk_token: str='[UNK]',
                pad_token: str='[PAD]',
                segmenter=None) -> Dict:
    """Build a word -> index vocabulary from a raw text corpus.

    Every non-empty text is split into words, stop words are discarded, and
    the remaining words are counted. Words seen at least ``min_freq`` times
    are kept, ordered by descending frequency (ties keep first-seen order),
    and numbered after the two special tokens.

    Args:
        texts: Raw corpus, one string per document. Empty strings are skipped.
        stopwords: Words to ignore while counting. Any iterable of strings
            (list, set, ...) is accepted; ``None`` means no stop words.
        num_words: Maximum vocabulary size *including* the pad and unknown
            tokens. ``None`` means unlimited. Values below 2 still yield the
            two special tokens.
        min_freq: Minimum number of occurrences a word needs to be kept.
        unk_token: Token assigned index 1.
        pad_token: Token assigned index 0.
        segmenter: Callable mapping a string to an iterable of words. When
            ``None``, the notebook-level ``word_segmenter`` is used.
    Returns:
        word_index: Dict mapping each token to its integer index, with
        ``pad_token`` at 0 and ``unk_token`` at 1.
    """
    if segmenter is None:
        # Resolved at call time so the notebook's Taskflow segmenter stays the default.
        segmenter = word_segmenter

    # A private set gives O(1) lookups and avoids sharing a mutable default.
    excluded = set(stopwords) if stopwords else set()
    # The special tokens own indices 0 and 1; a corpus word with the same
    # spelling must not be counted or it would overwrite the reserved index.
    excluded.update((pad_token, unk_token))

    word_counts = defaultdict(int)
    for text in texts:
        if not text:
            continue
        for word in segmenter(text):
            if word in excluded:
                continue
            word_counts[word] += 1

    wcounts = [(word, count) for word, count in word_counts.items()
               if count >= min_freq]
    # list.sort is stable, so equally frequent words keep first-seen order.
    wcounts.sort(key=lambda x: x[1], reverse=True)

    if num_words is not None:
        # Two slots are reserved for pad_token and unk_token. Clamp at zero so
        # a small num_words cannot turn into a negative slice bound.
        wcounts = wcounts[:max(num_words - 2, 0)]

    sorted_voc = [pad_token, unk_token]
    sorted_voc.extend(word for word, _ in wcounts)
    return {word: index for index, word in enumerate(sorted_voc)}

# Build the vocabulary from the combined corpus, keeping words that occur at least 5 times.
word2idx = build_vocab(texts=texts, stopwords=stopwords, min_freq=5)

# Print the first five (word, index) entries as a sanity check.
for shown, (word, idx) in enumerate(word2idx.items(), start=1):
    print(word, idx)
    if shown == 5:
        break

[PAD] 0
[UNK] 1
了 2
是 3
我 4


In [6]:
from paddlenlp.data import Vocab

# Wrap the word -> index mapping in a PaddleNLP Vocab, declaring which
# entries act as the unknown-word token and the padding token.
vocab = Vocab.from_dict(
    token_to_idx=word2idx,
    unk_token='[UNK]',
    pad_token='[PAD]'
)

# Serialize the vocabulary to ./vocab.json; the call's return value is kept
# in vocab_json_str (presumably the JSON text — confirm against the Vocab API).
vocab_json_str = vocab.to_json('./vocab.json')

## 3. 数据处理

In [7]:
from paddlenlp.data import Stack, Pad, Tuple

# Stack demo: three equal-length lists are combined into one 2-D array,
# one row per input list (see the printed result).
a = [1, 2, 3, 4]
b = [3, 4, 5, 6]
c = [5, 6, 7, 8]
res = Stack()([a, b, c])
print('Stacked Result: \n', res)

Stacked Result: 
 [[1 2 3 4]
 [3 4 5 6]
 [5 6 7 8]]


In [8]:
# Pad demo: lists of different lengths are right-padded with pad_val (0 here)
# up to the longest list, then returned as one 2-D array (see the printed result).
a = [1, 2, 3, 4]
b = [5, 6, 7]
c = [8, 9]
res = Pad(pad_val=0)([a, b, c])
print('Padded Result: \n', res)

Padded Result: 
 [[1 2 3 4]
 [5 6 7 0]
 [8 9 0 0]]


In [9]:
# Tuple demo: every sample is an (ids, label) pair.
data = [
    [[1, 2, 3, 4], [1]],
    [[5, 6, 7], [0]],
    [[8, 9], [1]]
]

# Tuple applies one collate function per field: Pad to the ids, Stack to the labels.
# NOTE: this binding of batchify_fn is only for the demo; a later cell rebinds the name.
batchify_fn = Tuple(Pad(pad_val=0), Stack())
ids, labels = batchify_fn(data)
print('ids: \n', ids)
print('labels: \n', labels)

ids: 
 [[1 2 3 4]
 [5 6 7 0]
 [8 9 0 0]]
labels: 
 [[1]
 [0]
 [1]]


In [10]:
from functools import partial
from paddlenlp.data import JiebaTokenizer

def convert_example(example, tokenizer, is_test=False):
    """Convert one raw example into the numpy features fed to the model.

    Args:
        example: Mapping with a 'text' entry and, unless ``is_test`` is set,
            a 'label' entry.
        tokenizer: Object whose ``encode(text)`` returns the token ids of the text.
        is_test: When True the example carries no label and none is returned.
    Returns:
        ``(input_ids, valid_length)`` for test data, otherwise
        ``(input_ids, valid_length, label)``. All three are int64 numpy
        arrays; ``valid_length`` is the number of token ids before padding.
    """
    token_ids = tokenizer.encode(example['text'])
    features = (
        np.array(token_ids, dtype='int64'),
        # Unpadded sequence length.
        np.array(len(token_ids), dtype='int64'),
    )
    if is_test:
        return features
    return features + (np.array(example['label'], dtype='int64'),)

# Encode text with PaddleNLP's JiebaTokenizer against the vocabulary.
# NOTE(review): the vocabulary was built from the Taskflow word segmenter with
# stop words removed, while encoding uses Jieba and keeps every token; the two
# segmentations may disagree, which would push more words to [UNK] — confirm.
tokenizer = JiebaTokenizer(vocab=vocab)
trans_fn = partial(convert_example, tokenizer=tokenizer, is_test=False)


def batchify_fn(samples, fn=Tuple(
        # input_ids
        Pad(pad_val=vocab.token_to_idx.get('[PAD]', 0)),
        # valid_length (seq len)
        Stack(dtype='int64'),
        # label
        Stack(dtype='int64'))):
    """Collate (input_ids, valid_length, label) samples into a list of batched fields.

    The per-field collators are bound once, as the default value of ``fn``.
    """
    return list(fn(samples))

def create_dataloader(dataset,
                    trans_fn=None,
                    mode='train',
                    batch_size=1,
                    batchify_fn=None) -> paddle.io.DataLoader:
    """Build a ``paddle.io.DataLoader`` over ``dataset``.

    Args:
        dataset: Dataset to iterate; must offer ``map`` when ``trans_fn`` is given.
        trans_fn: Optional per-example transform, applied through ``dataset.map``.
        mode: ``'train'`` shuffles and uses ``DistributedBatchSampler``; any
            other value uses a non-shuffling ``BatchSampler``.
        batch_size: Number of examples per mini-batch.
        batchify_fn: Collate function that merges a list of examples into a batch.
    Returns:
        dataloader: The DataLoader that yields the batches.
    """
    if trans_fn:
        # NOTE(review): with PaddleNLP datasets, map() may modify the dataset
        # in place, so calling this twice on the same object would apply the
        # transform twice — confirm before re-running the cell.
        dataset = dataset.map(trans_fn)

    is_training = mode == 'train'
    sampler_cls = (paddle.io.DistributedBatchSampler if is_training
                   else paddle.io.BatchSampler)
    batch_sampler = sampler_cls(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=is_training
    )
    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn
    )

# Mini-batch size shared by all loaders in this notebook.
batch_size = 64

# Training loader: mode='train' selects the shuffling sampler in create_dataloader.
train_loader = create_dataloader(
    dataset=train_data,
    trans_fn=trans_fn,
    mode='train',
    batch_size=batch_size,
    batchify_fn=batchify_fn
)
# Dev loader: any mode other than 'train' is iterated in order, without shuffling.
dev_loader = create_dataloader(
    dataset=dev_data,
    trans_fn=trans_fn,
    mode='validation',
    batch_size=batch_size,
    batchify_fn=batchify_fn
)

## 4. 导入模型

In [11]:
from models.TextRNN import LSTMModel
from models.TextCNN import TextCNNModel
from models.TextBiLSTM_Att import BiLSTMAttentionModel, SelfAttention, SelfInteractiveAttention

In [12]:
# Bidirectional LSTM classifier, sized to the vocabulary and to the dataset's label list.
model = LSTMModel(
    vocab_size=len(vocab),
    num_classes=len(train_data.label_list),
    direction='bidirectional',
    padding_idx=vocab.to_indices('[PAD]')
)

# Wrap the network in Paddle's high-level Model API; `model` now refers to
# the wrapper (prepare / fit are called on it below), not the raw network.
model = paddle.Model(model)

## 5. 构造优化器，损失函数和评价指标

In [13]:
# Adam over the parameters of the model defined in the previous cell.
# NOTE(review): the epoch-1 log of the following fit() call stays near
# loss 0.69 / accuracy 0.5; if later epochs do not improve, check whether
# learning_rate=5e-5 is too small for a model trained from scratch.
optimizer = paddle.optimizer.Adam(
    parameters=model.parameters(),
    learning_rate=5e-5
)

# Cross-entropy loss for the classification head.
loss = paddle.nn.CrossEntropyLoss()

# Classification accuracy, reported in the fit() log as `acc`.
metric = paddle.metric.Accuracy()

# Register optimizer, loss and metric with the high-level Model before fit().
model.prepare(
    optimizer=optimizer,
    loss=loss,
    metrics=metric
)

## 6. 模型训练与评估

In [14]:
# Train for 10 epochs, evaluating on the dev loader during training;
# checkpoints are written under ./checkpoints, with save_freq=5.
model.fit(
    train_data=train_loader,
    eval_data=dev_loader,
    epochs=10,
    save_dir='./checkpoints',
    save_freq=5
)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/10
step  10/150 - loss: 0.7011 - acc: 0.5344 - 89ms/step
step  20/150 - loss: 0.6996 - acc: 0.5328 - 84ms/step
step  30/150 - loss: 0.6945 - acc: 0.5193 - 80ms/step
step  40/150 - loss: 0.6898 - acc: 0.5160 - 77ms/step
step  50/150 - loss: 0.6939 - acc: 0.5112 - 75ms/step
step  60/150 - loss: 0.6921 - acc: 0.5112 - 75ms/step
step  70/150 - loss: 0.6946 - acc: 0.5038 - 75ms/step
step  80/150 - loss: 0.6931 - acc: 0.4992 - 74ms/step
step  90/150 - loss: 0.6940 - acc: 0.5016 - 73ms/step
step 100/150 - loss: 0.6939 - acc: 0.4995 - 72ms/step
step 110/150 - loss: 0.6924 - acc: 0.5109 - 72ms/step
step 120/150 - loss: 0.6922 - acc: 0.5207 - 72ms/step
step 130/150 - loss: 0.6920 - acc: 0.5236 - 71ms/step
step 140/150 - loss: 0.6923 - acc: 0.5320 - 72ms/step
step 150/150 - loss: 0.6917 - acc: 0.5375 - 70ms/step
save checkpoint at /home/aistudio/checkpoints/0
Eval begin...
step 10

In [15]:
# Same training recipe as the LSTM run, with a TextCNN network.
# NOTE(review): this cell repeats the optimizer / loss / metric / fit setup of
# the LSTM cells; a shared helper function would remove the duplication.
model = TextCNNModel(
    vocab_size=len(vocab),
    num_classes=len(train_data.label_list),
    padding_idx=vocab.to_indices('[PAD]')
)

# Wrap in the high-level Model API; rebinds the global `model`.
model = paddle.Model(model)

# A fresh optimizer is required because it is bound to this model's parameters.
optimizer = paddle.optimizer.Adam(
    parameters=model.parameters(),
    learning_rate=5e-5
)

loss = paddle.nn.CrossEntropyLoss()

metric = paddle.metric.Accuracy()

model.prepare(
    optimizer=optimizer,
    loss=loss,
    metrics=metric
)

# Checkpoints for this run go to ./checkpoints1 so they do not overwrite the LSTM run.
model.fit(
    train_data=train_loader,
    eval_data=dev_loader,
    epochs=10,
    save_dir='./checkpoints1',
    save_freq=5
)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/10
step  10/150 - loss: 0.6928 - acc: 0.4766 - 76ms/step
step  20/150 - loss: 0.6929 - acc: 0.4961 - 64ms/step
step  30/150 - loss: 0.6925 - acc: 0.4995 - 61ms/step
step  40/150 - loss: 0.6925 - acc: 0.5023 - 59ms/step
step  50/150 - loss: 0.6932 - acc: 0.5006 - 60ms/step
step  60/150 - loss: 0.6933 - acc: 0.4924 - 60ms/step
step  70/150 - loss: 0.6922 - acc: 0.5018 - 60ms/step
step  80/150 - loss: 0.6926 - acc: 0.5107 - 59ms/step
step  90/150 - loss: 0.6912 - acc: 0.5149 - 60ms/step
step 100/150 - loss: 0.6880 - acc: 0.5212 - 60ms/step
step 110/150 - loss: 0.6926 - acc: 0.5209 - 60ms/step
step 120/150 - loss: 0.6907 - acc: 0.5186 - 59ms/step
step 130/150 - loss: 0.6913 - acc: 0.5169 - 59ms/step
step 140/150 - loss: 0.6926 - acc: 0.5228 - 58ms/step
step 150/150 - loss: 0.6886 - acc: 0.5288 - 57ms/step
save checkpoint at /home/aistudio/checkpoints1/0
Eval begin...
step 1

In [16]:
# Same training recipe, with a BiLSTM network that uses the SelfAttention layer.
model = BiLSTMAttentionModel(
    vocab_size=len(vocab),
    num_classes=len(train_data.label_list),
    padding_idx=vocab.to_indices('[PAD]'),
    attention_layer=SelfAttention()
)

# Wrap in the high-level Model API; rebinds the global `model`.
model = paddle.Model(model)

# A fresh optimizer is required because it is bound to this model's parameters.
optimizer = paddle.optimizer.Adam(
    parameters=model.parameters(),
    learning_rate=5e-5
)

loss = paddle.nn.CrossEntropyLoss()

metric = paddle.metric.Accuracy()

model.prepare(
    optimizer=optimizer,
    loss=loss,
    metrics=metric
)

# Checkpoints for this run go to ./checkpoints2; the prediction cell later
# loads ./checkpoints2/final.pdparams.
model.fit(
    train_data=train_loader,
    eval_data=dev_loader,
    epochs=10,
    save_dir='./checkpoints2',
    save_freq=5
)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/10
step  10/150 - loss: 0.6932 - acc: 0.4938 - 97ms/step
step  20/150 - loss: 0.6919 - acc: 0.4984 - 85ms/step
step  30/150 - loss: 0.6930 - acc: 0.4938 - 78ms/step
step  40/150 - loss: 0.6943 - acc: 0.5012 - 79ms/step
step  50/150 - loss: 0.6918 - acc: 0.5072 - 80ms/step
step  60/150 - loss: 0.6919 - acc: 0.5005 - 80ms/step
step  70/150 - loss: 0.6923 - acc: 0.5004 - 79ms/step
step  80/150 - loss: 0.6922 - acc: 0.5004 - 79ms/step
step  90/150 - loss: 0.6908 - acc: 0.5007 - 78ms/step
step 100/150 - loss: 0.6926 - acc: 0.5020 - 79ms/step
step 110/150 - loss: 0.6927 - acc: 0.5014 - 78ms/step
step 120/150 - loss: 0.6897 - acc: 0.5022 - 78ms/step
step 130/150 - loss: 0.6907 - acc: 0.5028 - 78ms/step
step 140/150 - loss: 0.6891 - acc: 0.5031 - 78ms/step
step 150/150 - loss: 0.6906 - acc: 0.5061 - 77ms/step
save checkpoint at /home/aistudio/checkpoints2/0
Eval begin...
step 1

In [17]:
# Same training recipe, with a BiLSTM network that uses the SelfInteractiveAttention layer.
model = BiLSTMAttentionModel(
    vocab_size=len(vocab),
    num_classes=len(train_data.label_list),
    padding_idx=vocab.to_indices('[PAD]'),
    attention_layer=SelfInteractiveAttention()
)

# Wrap in the high-level Model API; rebinds the global `model`.
model = paddle.Model(model)

# A fresh optimizer is required because it is bound to this model's parameters.
optimizer = paddle.optimizer.Adam(
    parameters=model.parameters(),
    learning_rate=5e-5
)

loss = paddle.nn.CrossEntropyLoss()

metric = paddle.metric.Accuracy()

model.prepare(
    optimizer=optimizer,
    loss=loss,
    metrics=metric
)

# Checkpoints for this run go to ./checkpoints3.
model.fit(
    train_data=train_loader,
    eval_data=dev_loader,
    epochs=10,
    save_dir='./checkpoints3',
    save_freq=5
)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/10
step  10/150 - loss: 0.6931 - acc: 0.4828 - 99ms/step
step  20/150 - loss: 0.6928 - acc: 0.4906 - 102ms/step
step  30/150 - loss: 0.6925 - acc: 0.4964 - 107ms/step
step  40/150 - loss: 0.6925 - acc: 0.4953 - 102ms/step
step  50/150 - loss: 0.6931 - acc: 0.4978 - 98ms/step
step  60/150 - loss: 0.6952 - acc: 0.5003 - 95ms/step
step  70/150 - loss: 0.6938 - acc: 0.5029 - 93ms/step
step  80/150 - loss: 0.6929 - acc: 0.4990 - 92ms/step
step  90/150 - loss: 0.6924 - acc: 0.5014 - 91ms/step
step 100/150 - loss: 0.6928 - acc: 0.5106 - 92ms/step
step 110/150 - loss: 0.6927 - acc: 0.5119 - 91ms/step
step 120/150 - loss: 0.6891 - acc: 0.5115 - 91ms/step
step 130/150 - loss: 0.6918 - acc: 0.5113 - 91ms/step
step 140/150 - loss: 0.6904 - acc: 0.5100 - 90ms/step
step 150/150 - loss: 0.6898 - acc: 0.5216 - 90ms/step
save checkpoint at /home/aistudio/checkpoints3/0
Eval begin...
ste

## 7. 模型预测

In [18]:
# Test-time transform: is_test=True makes convert_example return only
# (input_ids, valid_length), with no label.
# NOTE: this rebinds the global trans_fn / batchify_fn used for the train and dev loaders.
trans_fn = partial(convert_example, tokenizer=tokenizer, is_test=True)

# Collate function without a label field: pad the ids, stack the lengths.
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=vocab['[PAD]']),  # input_ids
    Stack(dtype="int64"),  # seq len
): [data for data in fn(samples)]

# mode='test' takes the non-shuffling BatchSampler branch of create_dataloader.
test_loader = create_dataloader(
    test_data,
    trans_fn=trans_fn,
    batch_size=batch_size,
    mode='test',
    batchify_fn=batchify_fn)

In [20]:
import numpy as np
label_map = {0: 'negative', 1: 'positive'}

# Rebuild the BiLSTM + SelfAttention network and restore the weights saved at
# the end of its training run (./checkpoints2).
# NOTE(review): this is the model selected for prediction; confirm against the
# dev-set results that it really is the best of the four runs.
model = BiLSTMAttentionModel(
    vocab_size=len(vocab),
    num_classes=len(train_data.label_list),
    padding_idx=vocab.to_indices('[PAD]'),
    attention_layer=SelfAttention()
)
state_dict = paddle.load('./checkpoints2/final.pdparams')
model.set_dict(state_dict)
model = paddle.Model(model)
# Inference needs no optimizer, loss or metric. The previous version passed the
# global `optimizer`, which is bound to the parameters of the last *trained*
# model, so this cell depended on leftover state from the training cells.
model.prepare()

# test_loader already yields batches in which every sequence is padded to the
# longest sequence of that batch (nothing is truncated).
results = model.predict(test_loader, batch_size=64)[0]
predictions = []
for batch_scores in results:
    # Map the highest-scoring class index of every example to its label name.
    class_ids = np.argmax(batch_scores, axis=-1).tolist()
    predictions.extend(label_map[class_id] for class_id in class_ids)

# Show the predicted label for the first five test examples.
# test_data.data is read to get the raw examples with their 'text' field.
for i, example in enumerate(test_data.data[:5]):
    print('Data: {} \t Label: {}'.format(example['text'], predictions[i]))

Predict begin...
Predict samples: 1200
Data: 这个宾馆比较陈旧了，特价的房间也很一般。总体来说一般 	 Label: negative
Data: 怀着十分激动的心情放映，可是看着看着发现，在放映完毕后，出现一集米老鼠的动画片！开始还怀疑是不是赠送的个别现象，可是后来发现每张DVD后面都有！真不知道生产商怎么想的，我想看的是猫和老鼠，不是米老鼠！如果厂家是想赠送的话，那就全套米老鼠和唐老鸭都赠送，只在每张DVD后面添加一集算什么？？简直是画蛇添足！！ 	 Label: negative
Data: 还稍微重了点，可能是硬盘大的原故，还要再轻半斤就好了。其他要进一步验证。贴的几种膜气泡较多，用不了多久就要更换了，屏幕膜稍好点，但比没有要强多了。建议配赠几张膜让用用户自己贴。 	 Label: negative
Data: 交通方便；环境很好；服务态度很好 房间较小 	 Label: positive
Data: 不错，作者的观点很颠覆目前中国父母的教育方式，其实古人们对于教育已经有了很系统的体系了，可是现在的父母以及祖父母们更多的娇惯纵容孩子，放眼看去自私的孩子是大多数，父母觉得自己的孩子在外面只要不吃亏就是好事，完全把古人几千年总结的教育古训抛在的九霄云外。所以推荐准妈妈们可以在等待宝宝降临的时候，好好学习一下，怎么把孩子教育成一个有爱心、有责任心、宽容、大度的人。 	 Label: positive
