# 基于卷积神经网络的文本分类

1. 论文 https://arxiv.org/pdf/1408.5882.pdf
2. 参考实现 https://zh.d2l.ai/chapter_natural-language-processing-applications/sentiment-analysis-cnn.html

In [None]:
!pip install paddlenlp --upgrade

In [15]:
from functools import partial
import numpy as np
import paddle
import pandas as pd
from paddle.utils import run_check
from paddle import nn
import paddlenlp
import paddle.nn.functional as F
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.datasets import load_dataset
from paddlenlp import datasets, transformers
from visualdl import LogWriter
import jieba
from collections import Counter
from paddlenlp.data import Vocab
from sklearn.metrics import f1_score

run_check()
print('自然语言相关数据集：', paddle.text.__all__)

Running verify PaddlePaddle program ... 
PaddlePaddle works well on 1 GPU.
PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now.
自然语言相关数据集： ['Conll05st', 'Imdb', 'Imikolov', 'Movielens', 'UCIHousing', 'WMT14', 'WMT16', 'ViterbiDecoder', 'viterbi_decode']


In [2]:
#加载数据集
train_ds, dev_ds, test_ds = paddlenlp.datasets.load_dataset('chnsenticorp', splits=['train', 'dev', 'test'])
num_labels = len(train_ds.label_list)

In [3]:
#构建词典
words = (word for item in train_ds for word in jieba.lcut(item['text'], use_paddle=True))
words_counter = Counter(words)
vocab = Vocab(words_counter, min_freq=5, unk_token='[UNK]', pad_token='[PAD]')
json_str = vocab.to_json("./vocab.json")
#嵌入字典的大小
vocab_size = len(vocab)

print("vocab_size: ", vocab_size)

Building prefix dict from the default dictionary ...
[2023-07-30 15:19:58,429] [   DEBUG] __init__.py:113 - Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
[2023-07-30 15:19:58,430] [   DEBUG] __init__.py:132 - Loading model from cache /tmp/jieba.cache
Loading model cost 0.730 seconds.
[2023-07-30 15:19:59,160] [   DEBUG] __init__.py:164 - Loading model cost 0.730 seconds.
Prefix dict has been built successfully.
[2023-07-30 15:19:59,162] [   DEBUG] __init__.py:166 - Prefix dict has been built successfully.


vocab_size:  8253


In [4]:
_tokens = jieba.lcut(train_ds[0]['text'], use_paddle=True)
_indices = vocab.to_indices(_tokens)
print("_tokens:", _tokens)
print("_indices:", _indices)

_tokens: ['选择', '珠江', '花园', '的', '原因', '就是', '方便', '，', '有', '电动', '扶梯', '直接', '到达', '海边', '，', '周围', '餐馆', '、', '食廊', '、', '商场', '、', '超市', '、', '摊位', '一应俱全', '。', '酒店', '装修', '一般', '，', '但', '还', '算', '整洁', '。', ' ', '泳池', '在', '大堂', '的', '屋顶', '，', '因此', '很小', '，', '不过', '女儿', '倒', '是', '喜欢', '。', ' ', '包', '的', '早餐', '是', '西式', '的', '，', '还', '算', '丰富', '。', ' ', '服务', '吗', '，', '一般']
_indices: [202, 1, 1635, 3, 404, 40, 107, 2, 17, 1, 1, 364, 982, 1915, 2, 675, 2979, 27, 1, 27, 1525, 27, 1420, 27, 1, 4366, 4, 14, 288, 94, 2, 45, 21, 244, 1062, 4, 6, 3362, 13, 310, 3, 7572, 2, 1047, 482, 2, 95, 374, 466, 7, 58, 4, 6, 543, 3, 106, 7, 3218, 3, 2, 21, 244, 534, 4, 6, 44, 343, 2, 94]


In [5]:
max_len = 700
pad_token_id = vocab.to_indices("[PAD]")
def convert_example(example, vocab):
    _tokens = jieba.lcut(example['text'], use_paddle=True)
    input_ids = vocab.to_indices(_tokens)
    input_ids = input_ids[:max_len]
    input_ids = input_ids + [pad_token_id] * (max_len - len(input_ids))
    return input_ids, example['label']

trans_func = partial(convert_example, vocab=vocab)
train_ds.map(trans_func)

<paddlenlp.datasets.dataset.MapDataset at 0x7f78287a6cb0>

In [6]:
batch_size=128
batchify_fn = lambda samples, fn=Tuple([
    Stack(dtype="int64"),
    Stack(dtype="int64")
]): fn(samples)
batch_sampler = paddle.io.DistributedBatchSampler(train_ds, batch_size=batch_size, shuffle=True)
train_dataloader = paddle.io.DataLoader(dataset=train_ds, batch_sampler=batch_sampler, collate_fn=batchify_fn, return_list=True)
items = next(iter(train_dataloader))

In [None]:
# lens = []
# for input_ids, label in train_ds:
#     lens.append(len(input_ids))

# df = pd.DataFrame(lens, columns=['len'])
# df.describe(percentiles=[0.5, 0.999]) #700

In [7]:
import paddle
import paddle.nn as nn
import paddlenlp as nlp

class TextCNN(nn.Layer):
    def __init__(self,
                vocab_size,
                num_classes,
                emb_dim=128,
                padding_idx=0,
                num_filter=128,
                ngram_filter_sizes=(3, 4, 5),
                fc_hidden_size=96):
        super().__init__()
        # 卷积层参数分别为词表长度、词嵌入维度
        self.embedder = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
        self.encoder = nlp.seq2vec.CNNEncoder(emb_dim=emb_dim, num_filter=num_filter, ngram_filter_sizes=ngram_filter_sizes)
        self.fc = nn.Linear(self.encoder.get_output_dim(), fc_hidden_size)
        self.output_layer = nn.Linear(fc_hidden_size, num_classes)

    def forward(self, text):
        # Shape: (batch_size, num_tokens, embedding_dim)
        embedded_text = self.embedder(text)
        # Shape: (batch_size, len(ngram_filter_sizes)*num_filter)
        encoder_out = self.encoder(embedded_text)
        encoder_out = paddle.tanh(encoder_out)
        # Shape: (batch_size, fc_hidden_size)
        fc_out = self.fc(encoder_out)
        # Shape: (batch_size, num_classes)
        logits = self.output_layer(fc_out)
        return logits

model = TextCNN(vocab_size=vocab_size, num_classes=num_labels)

In [8]:
# 定义 optimizer 优化器
optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters())
# 定义 loss
criterion = paddle.nn.loss.CrossEntropyLoss()

metric = paddle.metric.Accuracy()
# 训练
epochs = 20
global_step = 0
with LogWriter(logdir="./logs") as writer:
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(train_dataloader, start=1):
            input_ids, labels = batch
            logits = model(input_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            # 预测分类概率
            correct = metric.compute(logits, labels)
            metric.update(correct)
            acc = metric.accumulate()

            writer.add_scalar(tag="acc", step=global_step, value=acc)
            # 向记录器添加一个tag为`loss`的数据
            writer.add_scalar(tag="loss", step=global_step, value=loss)
            global_step += 1
            if global_step % 10 == 0:
                print("epoch %d, step %d: loss:%.5f, acc:%.5f" % (epoch, step, loss, acc))
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

epoch 1, step 10: loss:0.67402, acc:0.53672
epoch 1, step 20: loss:0.61313, acc:0.63516
epoch 1, step 30: loss:0.52036, acc:0.67370
epoch 1, step 40: loss:0.46896, acc:0.69727
epoch 1, step 50: loss:0.35472, acc:0.72828
epoch 1, step 60: loss:0.37869, acc:0.75404
epoch 1, step 70: loss:0.37543, acc:0.77009
epoch 2, step 5: loss:0.16675, acc:0.78857
epoch 2, step 15: loss:0.22088, acc:0.80451
epoch 2, step 25: loss:0.07682, acc:0.81828
epoch 2, step 35: loss:0.16649, acc:0.83040
epoch 2, step 45: loss:0.16672, acc:0.84017
epoch 2, step 55: loss:0.12934, acc:0.84820
epoch 2, step 65: loss:0.12369, acc:0.85541
epoch 2, step 75: loss:0.12018, acc:0.86182
epoch 3, step 10: loss:0.07339, acc:0.86987
epoch 3, step 20: loss:0.01983, acc:0.87684
epoch 3, step 30: loss:0.03650, acc:0.88303
epoch 3, step 40: loss:0.03005, acc:0.88869
epoch 3, step 50: loss:0.01464, acc:0.89371
epoch 3, step 60: loss:0.06316, acc:0.89795
epoch 3, step 70: loss:0.11069, acc:0.90206
epoch 4, st

In [18]:
input_ids, labels = next(iter(train_dataloader))
logits = model(input_ids)
pred = paddle.argmax(logits, axis=-1)
print("f1_score:", f1_score(labels, pred))

f1_score: 1.0
