In [1]:
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddlenlp
import paddlenlp as ppnlp
from paddlenlp.data import Vocab, JiebaTokenizer
from paddlenlp.datasets import load_dataset
from paddlenlp.embeddings import TokenEmbedding
from paddlenlp.seq2vec import LSTMEncoder
from collections import OrderedDict
from functools import partial

from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Stack, Pad, Dict, Tuple
from paddlenlp.datasets import MapDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the TokenEmbedding; if the pretrained embedding has not been downloaded
# yet, it is downloaded automatically and then loaded.
# Built-in pretrained embeddings: paddlenlp.embeddings.list_embedding_name()
token_embedder = TokenEmbedding()

# Sizes and ids reused by later cells, all read from the embedding itself.
vocab_size = len(token_embedder.vocab)
emb_dim = token_embedder.embedding_dim
pad_token_id = token_embedder.vocab['[PAD]']

# Tokenizer built on the same vocabulary, so the ids it produces index
# directly into token_embedder.
tokenizer = JiebaTokenizer(token_embedder.vocab)

[32m[2024-05-30 18:32:01,747] [    INFO][0m - Loading token embedding...[0m
[32m[2024-05-30 18:32:10,930] [    INFO][0m - Finish loading embedding vector.[0m
[32m[2024-05-30 18:32:10,930] [    INFO][0m - Token Embedding info:             
Unknown index: 635963             
Unknown token: [UNK]             
Padding index: 635964             
Padding token: [PAD]             
Shape :[635965, 300][0m


In [3]:
# Load the three splits of the 'chnsenticorp' dataset in one call, then unpack
# them in the same order they were requested.
chnsenticorp_splits = load_dataset('chnsenticorp', splits=['train', 'dev', 'test'])
train_ds, dev_ds, test_ds = chnsenticorp_splits

In [4]:
# Every example is truncated / padded to exactly this many token ids.
max_len = 256


def convert_example(example, tokenizer):
    """Turn one raw example into fixed-length model inputs.

    Args:
        example: Mapping with a 'text' entry (the raw sentence) and a 'label' entry.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.

    Returns:
        A tuple ``(input_ids, label, seq_len)`` where ``input_ids`` has exactly
        ``max_len`` entries (truncated, then right-padded with ``pad_token_id``)
        and ``seq_len`` is the number of real (non-padding) tokens kept.
    """
    # Truncate first so that seq_len can never exceed the padded length;
    # measuring before truncation would report lengths > max_len for long texts.
    token_ids = tokenizer.encode(example['text'])[:max_len]
    seq_len = len(token_ids)
    input_ids = token_ids + [pad_token_id] * (max_len - seq_len)
    return input_ids, example['label'], seq_len

# Bind the tokenizer so the dataset can call the converter with just an example.
trans_func = partial(convert_example, tokenizer=tokenizer)
# Only train_ds is converted in this cell; dev_ds and test_ds are not touched here.
# NOTE(review): the call returns a MapDataset (see the cell output). If map()
# rewrites train_ds in place, re-running this cell would pass already-converted
# tuples to convert_example — confirm before re-executing.
train_ds.map(trans_func)

<paddlenlp.datasets.dataset.MapDataset at 0x1c1f10b9450>

In [5]:
batch_size = 16


def batchify_fn(samples, fn=Tuple([Stack(dtype="int64"),
                                   Stack(dtype="int64"),
                                   Stack(dtype="int64")])):
    """Collate a list of samples into a batch.

    Each sample is the 3-tuple produced by convert_example; one int64 Stack is
    applied to each of its three fields. ``fn`` is created once, at definition
    time, and reused for every batch.
    """
    return fn(samples)


# Shuffled mini-batches over the training set.
batch_sampler = BatchSampler(train_ds, batch_size=batch_size, shuffle=True)
train_dataloader = DataLoader(
    dataset=train_ds,
    batch_sampler=batch_sampler,
    collate_fn=batchify_fn,
    return_list=True,
)

# Pull a single batch to check that the pipeline yields data.
items = next(iter(train_dataloader))

In [6]:
# Smoke test: push one batch through the embedding and a standalone LSTMEncoder
# and print the shape after each stage.
lstm_encoder = LSTMEncoder(emb_dim, 128)

# A single batch is enough here, so take it directly instead of looping and breaking.
batch = next(iter(train_dataloader))
input_ids, labels, seq_lens = batch

# Input shape  = [batch_size, max_seq_len]
# Output shape = [batch_size, max_seq_len, emb_dim]
embedded_text = token_embedder(input_ids)
print(embedded_text.shape)

# The encoder returns one vector per sequence: [batch_size, hidden_size]
text_repr = lstm_encoder(embedded_text, sequence_length=seq_lens)
print(text_repr.shape)

[16, 256, 300]
[16, 128]


In [10]:
class LSTMModel(nn.Layer):
    """Text classifier: embedding -> LSTMEncoder -> tanh(Linear) -> Linear -> softmax.

    Args:
        vocab_size: Vocabulary size. Accepted for interface compatibility; it is
            not used because the embedding table comes from ``embedder``.
        num_classes: Number of output classes.
        emb_dim: Input size given to the LSTM encoder; it must match the width
            of the vectors produced by ``embedder``.
        padding_idx: Accepted for interface compatibility; not used here.
        lstm_hidden_size: Hidden size passed to the LSTM encoder.
        direction: Passed to LSTMEncoder as ``direction``
            (this notebook uses 'forward' and 'bidirectional').
        lstm_layers: Passed to LSTMEncoder as ``num_layers``.
        dropout_rate: Passed to LSTMEncoder as ``dropout``.
        pooling_type: Passed to LSTMEncoder as ``pooling_type``.
        fc_hidden_size: Output size of the intermediate fully connected layer.
        embedder: Layer that maps token ids to embedding vectors. When omitted,
            the notebook-level ``token_embedder`` is used, which keeps existing
            calls working exactly as before.
    """

    def __init__(self,
                 vocab_size,
                 num_classes,
                 emb_dim=128,
                 padding_idx=0,
                 lstm_hidden_size=198,
                 direction='forward',
                 lstm_layers=1,
                 dropout_rate=0.0,
                 pooling_type=None,
                 fc_hidden_size=96,
                 embedder=None):
        super().__init__()
        # Look up word ids and map them to word embeddings. The layer can be
        # injected explicitly; otherwise fall back to the notebook global so the
        # dependency is at least visible in the signature.
        self.embedder = token_embedder if embedder is None else embedder

        # Transform the word embeddings into a text-level semantic representation.
        self.lstm_encoder = LSTMEncoder(
            emb_dim,
            lstm_hidden_size,
            num_layers=lstm_layers,
            direction=direction,
            dropout=dropout_rate,
            pooling_type=pooling_type)

        # LSTMEncoder.get_output_dim() gives the size of the encoded text representation.
        self.fc = nn.Linear(self.lstm_encoder.get_output_dim(), fc_hidden_size)

        # Final classifier.
        self.output_layer = nn.Linear(fc_hidden_size, num_classes)

    def forward(self, text, seq_len):
        """Compute class probabilities for a batch of padded token-id sequences.

        Args:
            text: Token ids, fed to the embedder (batch_size, num_tokens).
            seq_len: Per-example lengths, passed to the encoder as ``sequence_length``.

        Returns:
            Softmax probabilities over the classes, shape (batch_size, num_classes).
        """
        # Shape: (batch_size, num_tokens, embedding_dim)
        embedded_text = self.embedder(text)

        # Shape: (batch_size, num_directions * lstm_hidden_size)
        # num_directions = 2 if direction is 'bidirectional' else 1
        text_repr = self.lstm_encoder(embedded_text, sequence_length=seq_len)

        # Shape: (batch_size, fc_hidden_size)
        fc_out = paddle.tanh(self.fc(text_repr))

        # Shape: (batch_size, num_classes)
        logits = self.output_layer(fc_out)

        # Class probabilities.
        # NOTE(review): if the training loss applies softmax itself, softmax would
        # be applied twice — confirm against the loss configured for this model.
        probs = F.softmax(logits, axis=-1)
        return probs

# Build the bidirectional LSTM classifier; every size is read from the
# pretrained embedding and the training set.
lstm_classifier = LSTMModel(
    vocab_size=len(token_embedder.vocab),
    num_classes=len(train_ds.label_list),
    emb_dim=token_embedder.embedding_dim,
    padding_idx=token_embedder.vocab['[PAD]'],
    direction='bidirectional',
)

# Wrap the network in paddle.Model; `model` refers to the wrapper from here on.
model = paddle.Model(lstm_classifier)