In [1]:
import jieba
import torch
from transformers import BertTokenizer, BertForTokenClassification, BertForSequenceClassification
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_entity_vector(entity_description):
    # 使用BERT模型进行编码
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = BertForTokenClassification.from_pretrained('bert-base-chinese')
    inputs = tokenizer(entity_description, return_tensors="pt")
    outputs = model(**inputs)
    entity_vector = torch.mean(outputs.logits, dim=1)
    return entity_vector


In [3]:
def get_candidate_entities(text):
    # 分词
    words = jieba.lcut(text)

    # 从知识库中搜索候选实体
    candidate_entities = []
    for word in words:
        # 这里应该使用实际的知识库进行搜索
        # 在这个原型中，我们假设知识库已经为我们提供了候选实体
        candidate_entities.append({"name": word, "description": f"实体描述: {word}"})
    
    return candidate_entities

In [4]:
def entity_recognition_and_disamb(text, candidate_entities):
    # 将实体描述转换为向量表示
    entity_vectors = []
    for entity in candidate_entities:
        entity_vector = get_entity_vector(entity["description"])
        entity_vectors.append(entity_vector)

    # 使用BERT-ENE模型对候选实体进行筛选
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
    inputs = tokenizer(text, return_tensors="pt")

    entity_scores = []
    for entity_vector in entity_vectors:
        # 将文本和实体向量拼接
        entity_vector = torch.squeeze(entity_vector).long()
        input_ids = torch.cat((inputs['input_ids'], entity_vector.unsqueeze(0)), dim=1)
        attention_mask = torch.cat((inputs['attention_mask'], torch.ones(1, entity_vector.shape[0]).long()), dim=1)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        scores = outputs.logits
        entity_scores.append(scores)

    # 预测概率进行排序
    sorted_indices = torch.argsort(torch.tensor([torch.max(scores).item() for scores in entity_scores]), descending=True)

    # 根据排序结果返回实体识别结果
    recognized_entities = [candidate_entities[i.item()] for i in sorted_indices]

    return recognized_entities

In [6]:
text = "中华人民共和国是一个经济上蒸蒸日上的国家。"
candidate_entities = get_candidate_entities(text)
recognized_entities = entity_recognition_and_disamb(text, candidate_entities)
print(recognized_entities)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-c

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-c

[{'name': '中华人民共和国', 'description': '实体描述: 中华人民共和国'}, {'name': '是', 'description': '实体描述: 是'}, {'name': '一个', 'description': '实体描述: 一个'}, {'name': '经济', 'description': '实体描述: 经济'}, {'name': '上', 'description': '实体描述: 上'}, {'name': '蒸蒸日上', 'description': '实体描述: 蒸蒸日上'}, {'name': '的', 'description': '实体描述: 的'}, {'name': '国家', 'description': '实体描述: 国家'}, {'name': '。', 'description': '实体描述: 。'}]


In [7]:
text = "体育精神具体指在体育实践活动中形成的，以健康快乐、挑战征服、公平竞争、团结协作为主要价值标准的意识、思维活动和一般心理状态 。"
candidate_entities = get_candidate_entities(text)
recognized_entities = entity_recognition_and_disamb(text, candidate_entities)
print(recognized_entities)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-c

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-c

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-c

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-c

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-c

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-c

[{'name': '体育精神', 'description': '实体描述: 体育精神'}, {'name': '具体', 'description': '实体描述: 具体'}, {'name': '指在', 'description': '实体描述: 指在'}, {'name': '体育', 'description': '实体描述: 体育'}, {'name': '实践', 'description': '实体描述: 实践'}, {'name': '活动', 'description': '实体描述: 活动'}, {'name': '中', 'description': '实体描述: 中'}, {'name': '形成', 'description': '实体描述: 形成'}, {'name': '的', 'description': '实体描述: 的'}, {'name': '，', 'description': '实体描述: ，'}, {'name': '以', 'description': '实体描述: 以'}, {'name': '健康', 'description': '实体描述: 健康'}, {'name': '快乐', 'description': '实体描述: 快乐'}, {'name': '、', 'description': '实体描述: 、'}, {'name': '挑战', 'description': '实体描述: 挑战'}, {'name': '征服', 'description': '实体描述: 征服'}, {'name': '、', 'description': '实体描述: 、'}, {'name': '公平竞争', 'description': '实体描述: 公平竞争'}, {'name': '、', 'description': '实体描述: 、'}, {'name': '团结', 'description': '实体描述: 团结'}, {'name': '协作', 'description': '实体描述: 协作'}, {'name': '为', 'description': '实体描述: 为'}, {'name': '主要', 'description': '实体描述: 主要'}, {'name': '价值', 'de