import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import DataSet
from fastNLP import Vocabulary

tr_data = DataSet({
    'chars': [['train', 'only_in_train']],
    'target': ['positive']
})
dev_data = DataSet({
    'chars': [['test', 'only_in_test']],
    'target': ['negative']
})

vocab = Vocabulary()
# TODO: the recommended way is to pass the dev data via no_create_entry_dataset
# vocab.from_dataset(tr_data, dev_data, field_name='chars')
vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])

# TODO: there are 4 + 2 words here
print(len(vocab))
print(vocab)

embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')

# TODO: the two sizes below differ; weight is the smaller one
print(embed.size)            # first dimension is 6
print(embed.weight.size())   # first dimension is 5
print(embed)
print(embed.weight)

# Problem 1: if embed is used directly inside the model, as below, training works fine:
# self.lattice_embed = lattice_embed
# self.bigram_embed = bigram_embed
# But because embed is used directly in the model, TorchScript conversion fails with:
# Module 'StaticEmbedding' has no attribute '_word_vocab' (This attribute exists on the Python module, but we failed to convert Python type: 'fastNLP.core.vocabulary.Vocabulary' to a TorchScript type. Only tensors and (possibly nested) tuples of tensors, lists, or dicts are supported as inputs or outputs of traced functions, but instead got value of type Vocabulary.. Its type was inferred; try adding a type annotation for the attribute.):
#   File "/Users/wangming/venv/lib/python3.6/site-packages/fastNLP/embeddings/embedding.py", line 196
#         :return:
#         """
#         return len(self._word_vocab)
#                    ~~~~~~~~~~~~~~~~ <--- HERE

# Problem 2: since the usage above breaks TorchScript, and the final model is meant to be
# served from an online C++ service, in the model definition I copy the vectors out of
# StaticEmbedding via embed.weight into torch's nn.Embedding, in the following form:
# self.lattice_embed = nn.Embedding(lattice_num, lattice_dim)
# self.lattice_embed.weight.data.copy_(lattice_weight)
# self.bigram_embed = nn.Embedding(bigram_num, bigram_dim)
# self.bigram_embed.weight.data.copy_(bigram_weight)
# However, because of the documented behaviour that len(vocab) may be larger than the actual
# size of the embedding matrix, indices produced by the vocabulary can exceed the range of
# nn.Embedding in practice and trigger an index-out-of-bounds error.

vocab.index_dataset(tr_data, dev_data, field_name='chars', new_field_name='chars_index')
print(tr_data)
print(dev_data)
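
# A minimal sketch of Problem 1 (a hypothetical toy model, not the actual lattice model
# from this report): keeping the StaticEmbedding as a submodule and calling
# torch.jit.script on the model reproduces the Vocabulary conversion error quoted above.
import torch.nn as nn

class ToyModel(nn.Module):
    def __init__(self, token_embed):
        super().__init__()
        self.embed = token_embed          # StaticEmbedding held directly as a submodule

    def forward(self, words):
        return self.embed(words)

model = ToyModel(embed)
try:
    torch.jit.script(model)               # fails: Vocabulary is not a TorchScript type
except Exception as err:
    print(err)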
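
# A possible workaround sketch for Problem 2 (my own suggestion, not something fastNLP
# documents): instead of copying embed.weight row-for-row, materialise one vector per
# vocabulary index by pushing all indices through the StaticEmbedding forward pass. The
# resulting nn.Embedding then has exactly len(vocab) rows, so the indices written by
# vocab.index_dataset can never go out of bounds, and the module is plain torch and
# TorchScript-friendly.
with torch.no_grad():
    all_indices = torch.arange(len(vocab)).unsqueeze(0)   # shape (1, len(vocab))
    full_weight = embed(all_indices).squeeze(0)           # shape (len(vocab), embed_dim)

plain_embed = nn.Embedding(num_embeddings=full_weight.size(0),
                           embedding_dim=full_weight.size(1))
plain_embed.weight.data.copy_(full_weight)
print(plain_embed.weight.size())   # one row per word in vocab, e.g. (6, 50) here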