# 基于卷积神经网络的文本分类

1. 论文 https://arxiv.org/pdf/1408.5882.pdf
2. 参考实现 https://zh.d2l.ai/chapter_natural-language-processing-applications/sentiment-analysis-cnn.html

In [None]:
!pip install paddlenlp --upgrade

In [1]:
from functools import partial
import numpy as np
import paddle
import pandas as pd
from paddle.utils import run_check
from paddle import nn
import paddlenlp
import paddle.nn.functional as F
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.datasets import load_dataset
from paddlenlp import datasets, transformers
from visualdl import LogWriter
import jieba
from collections import Counter
from paddlenlp.data import Vocab
from sklearn.metrics import f1_score

run_check()
print('自然语言相关数据集：', paddle.text.__all__)

  from .autonotebook import tqdm as notebook_tqdm


Running verify PaddlePaddle program ... 


I0730 16:51:47.590281   622 interpretercore.cc:237] New Executor is Running.
W0730 16:51:47.591131   622 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.6
W0730 16:51:47.597573   622 gpu_resources.cc:149] device: 0, cuDNN Version: 8.4.


PaddlePaddle works well on 1 GPU.
PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now.
自然语言相关数据集： ['Conll05st', 'Imdb', 'Imikolov', 'Movielens', 'UCIHousing', 'WMT14', 'WMT16', 'ViterbiDecoder', 'viterbi_decode']


I0730 16:51:50.169422   622 interpreter_util.cc:518] Standalone Executor is Used.


In [2]:
#加载数据集
train_ds, dev_ds, test_ds = paddlenlp.datasets.load_dataset('chnsenticorp', splits=['train', 'dev', 'test'])
num_labels = len(train_ds.label_list)

In [3]:
#构建词典
words = (word for item in train_ds for word in jieba.lcut(item['text'], use_paddle=True))
words_counter = Counter(words)
vocab = Vocab(words_counter, min_freq=5, unk_token='[UNK]', pad_token='[PAD]')
json_str = vocab.to_json("./vocab.json")
#嵌入字典的大小
vocab_size = len(vocab)

print("vocab_size: ", vocab_size)

Building prefix dict from the default dictionary ...
[2023-07-30 16:51:50,571] [   DEBUG] __init__.py:113 - Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
[2023-07-30 16:51:50,572] [   DEBUG] __init__.py:132 - Loading model from cache /tmp/jieba.cache
Loading model cost 0.724 seconds.
[2023-07-30 16:51:51,296] [   DEBUG] __init__.py:164 - Loading model cost 0.724 seconds.
Prefix dict has been built successfully.
[2023-07-30 16:51:51,297] [   DEBUG] __init__.py:166 - Prefix dict has been built successfully.


vocab_size:  8253


In [4]:
_tokens = jieba.lcut(train_ds[0]['text'], use_paddle=True)
_indices = vocab.to_indices(_tokens)
print("_tokens:", _tokens)
print("_indices:", _indices)

_tokens: ['选择', '珠江', '花园', '的', '原因', '就是', '方便', '，', '有', '电动', '扶梯', '直接', '到达', '海边', '，', '周围', '餐馆', '、', '食廊', '、', '商场', '、', '超市', '、', '摊位', '一应俱全', '。', '酒店', '装修', '一般', '，', '但', '还', '算', '整洁', '。', ' ', '泳池', '在', '大堂', '的', '屋顶', '，', '因此', '很小', '，', '不过', '女儿', '倒', '是', '喜欢', '。', ' ', '包', '的', '早餐', '是', '西式', '的', '，', '还', '算', '丰富', '。', ' ', '服务', '吗', '，', '一般']
_indices: [202, 1, 1635, 3, 404, 40, 107, 2, 17, 1, 1, 364, 982, 1915, 2, 675, 2979, 27, 1, 27, 1525, 27, 1420, 27, 1, 4366, 4, 14, 288, 94, 2, 45, 21, 244, 1062, 4, 6, 3362, 13, 310, 3, 7572, 2, 1047, 482, 2, 95, 374, 466, 7, 58, 4, 6, 543, 3, 106, 7, 3218, 3, 2, 21, 244, 534, 4, 6, 44, 343, 2, 94]


In [5]:
max_len = 700
pad_token_id = vocab.to_indices("[PAD]")
def convert_example(example, vocab):
    _tokens = jieba.lcut(example['text'], use_paddle=True)
    input_ids = vocab.to_indices(_tokens)
    input_ids = input_ids[:max_len]
    input_ids = input_ids + [pad_token_id] * (max_len - len(input_ids))
    return input_ids, example['label']

trans_func = partial(convert_example, vocab=vocab)
train_ds.map(trans_func)

<paddlenlp.datasets.dataset.MapDataset at 0x7ff1387f60e0>

In [6]:
batch_size=128
batchify_fn = lambda samples, fn=Tuple([
    Stack(dtype="int64"),
    Stack(dtype="int64")
]): fn(samples)
batch_sampler = paddle.io.DistributedBatchSampler(train_ds, batch_size=batch_size, shuffle=True)
train_dataloader = paddle.io.DataLoader(dataset=train_ds, batch_sampler=batch_sampler, collate_fn=batchify_fn, return_list=True)
items = next(iter(train_dataloader))

In [7]:
# lens = []
# for input_ids, label in train_ds:
#     lens.append(len(input_ids))

# df = pd.DataFrame(lens, columns=['len'])
# df.describe(percentiles=[0.5, 0.999]) #700

In [8]:
import paddle
import paddle.nn as nn
import paddlenlp as nlp

class TextCNN(nn.Layer):
    def __init__(self,
                vocab_size,
                num_classes,
                emb_dim=128,
                padding_idx=0,
                num_filter=128,
                ngram_filter_sizes=(3, 4, 5),
                fc_hidden_size=96):
        super().__init__()
        self.embedder = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
        self.convs = paddle.nn.LayerList(
            [nn.Conv2D(in_channels=1, out_channels=num_filter, kernel_size=(kernel_size, emb_dim)) for kernel_size in ngram_filter_sizes]
        )
        self.activation = nn.Tanh()
        maxpool_output_dim = num_filter * len(ngram_filter_sizes)
        self.output_layer = nn.Linear(maxpool_output_dim, num_classes)
        

    def encoder(self, embeddings):
        embeddings = embeddings.unsqueeze(1)
        convs_out = [self.activation(conv(embeddings)).squeeze(3) for conv in self.convs]
        maxpool_out = [F.adaptive_max_pool1d(t, output_size=1).squeeze(2) for t in convs_out]
        return paddle.concat(maxpool_out, axis=1)

    def forward(self, input_ids):
        # Shape: (batch_size, num_tokens, embedding_dim)
        embeddings = self.embedder(input_ids)
        encoding = self.encoder(embeddings)
        # Shape: (batch_size, num_classes)
        logits = self.output_layer(encoding)
        return logits

model = TextCNN(vocab_size=vocab_size, num_classes=num_labels)

In [9]:
# 定义 optimizer 优化器
optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters())
# 定义 loss
criterion = paddle.nn.loss.CrossEntropyLoss()

metric = paddle.metric.Accuracy()
# 训练
epochs = 10
global_step = 0
with LogWriter(logdir="./logs") as writer:
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(train_dataloader, start=1):
            input_ids, labels = batch
            logits = model(input_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            # 预测分类概率
            correct = metric.compute(logits, labels)
            metric.update(correct)
            acc = metric.accumulate()

            writer.add_scalar(tag="acc", step=global_step, value=acc)
            # 向记录器添加一个tag为`loss`的数据
            writer.add_scalar(tag="loss", step=global_step, value=loss)
            global_step += 1
            if global_step % 10 == 0:
                print("epoch %d, step %d: loss:%.5f, acc:%.5f" % (epoch, step, loss, acc))
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

epoch 1, step 10: loss:0.68145, acc:0.51953
epoch 1, step 20: loss:0.64644, acc:0.63281
epoch 1, step 30: loss:0.57183, acc:0.67526
epoch 1, step 40: loss:0.51998, acc:0.69824
epoch 1, step 50: loss:0.43974, acc:0.71625
epoch 1, step 60: loss:0.43186, acc:0.73281
epoch 1, step 70: loss:0.41584, acc:0.74844
epoch 2, step 5: loss:0.26225, acc:0.76572
epoch 2, step 15: loss:0.31434, acc:0.78047
epoch 2, step 25: loss:0.17671, acc:0.79273
epoch 2, step 35: loss:0.24877, acc:0.80355
epoch 2, step 45: loss:0.25345, acc:0.81289
epoch 2, step 55: loss:0.22099, acc:0.82073
epoch 2, step 65: loss:0.20602, acc:0.82785
epoch 2, step 75: loss:0.22985, acc:0.83427
epoch 3, step 10: loss:0.11934, acc:0.84272
epoch 3, step 20: loss:0.08393, acc:0.85014
epoch 3, step 30: loss:0.11284, acc:0.85660
epoch 3, step 40: loss:0.11860, acc:0.86262
epoch 3, step 50: loss:0.05977, acc:0.86805
epoch 3, step 60: loss:0.13798, acc:0.87206
epoch 3, step 70: loss:0.11340, acc:0.87610
epoch 4, st

In [10]:
input_ids, labels = next(iter(train_dataloader))
logits = model(input_ids)
pred = paddle.argmax(logits, axis=-1)
print("f1_score:", f1_score(labels, pred))

f1_score: 1.0
