1. 使用BioBert医学专用 `biobert-large-cased` 预训练模型

2. 取BioBert隐藏层最后一层的均值代表该名词的词向量

3. 为每个 `addn_bioentity` 和每个 `bioentity_name` 里的名词计算余弦相似度

4. 每个 `addn_bioentity` 的名词得到 `bioentity_name` 中最相似的编号

#### 1 导入库和配置

In [1]:
import os
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import multiprocessing
import scipy.spatial.distance as dist
from transformers import AutoTokenizer, AutoModelForMaskedLM
from torch.utils.data import DataLoader
import concurrent.futures
# Config

## 开启梯子后 使用全局代理链接 HuggingFace
# os.environ['http_proxy'] = "127.0.0.1:7890"
# os.environ['https_proxy'] = "127.0.0.1:7890"

bioentity_name = pd.read_csv("./bioentity_name.csv")
addn_bioentity = pd.read_csv("./addn_bioentity.csv")
model_name='dmis-lab/biobert-large-cased-v1.1'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 500

#### 2 导入模型和分词器

In [2]:
biobert_tokenizer = AutoTokenizer.from_pretrained(model_name)
biobert_model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)

In [3]:
def get_bert_embeddings(sentences, batch_size=32):
    # 存储embedding列表
    embeddings_list = []

    # 使用DataLoader批处理
    dataloader = DataLoader(sentences, batch_size=batch_size, shuffle=False)

    for batch_sentences in tqdm(dataloader):
        
        # 释放多余显存
        torch.cuda.empty_cache()
        
        # BioBERT
        bert_encodings = biobert_tokenizer(batch_sentences, return_tensors='pt', padding=True)
        attention_mask = bert_encodings['attention_mask'].to(device)
        bert_input_ids = bert_encodings['input_ids'].to(device)

        # 推理
        with torch.no_grad():
            bert_outputs = biobert_model(bert_input_ids, attention_mask=attention_mask, output_hidden_states=True)

        # 抽取隐藏层
        bert_cls_embeddings = bert_outputs.hidden_states[-1].mean(axis=1).cpu().numpy()
        embeddings_list.append(bert_cls_embeddings)
    
    # 合并
    result_embeddings = np.concatenate(embeddings_list, axis=0)

    return result_embeddings

#### 3 编码bioentity词向量

In [4]:
bioentit_embeddings = get_bert_embeddings(bioentity_name.name.values, batch_size=batch_size)
np.save("bioentit_embeddings.npy", bioentit_embeddings)
bioentit_embeddings.shape

100%|██████████| 95/95 [01:25<00:00,  1.11it/s]


(47222, 1024)

#### 4 编码addn_bioentity词向量

In [5]:
addn_bioentity_embeddings = get_bert_embeddings(addn_bioentity.name.values, batch_size=batch_size)
np.save("addn_bioentity_embeddings.npy", addn_bioentity_embeddings)
addn_bioentity_embeddings.shape

100%|██████████| 256/256 [05:46<00:00,  1.36s/it]


(127604, 1024)

#### 5 计算相似度 

In [2]:
bioentit_embeddings = np.load("bioentit_embeddings.npy")
addn_bioentity_embeddings = np.load("addn_bioentity_embeddings.npy")

In [3]:
# 余弦相似度的函数
def compute_similarity(i):
    similarities = [1 - dist.cosine(addn_bioentity_embeddings[i], bioentit_embeddings[j]) for j in range(len(bioentit_embeddings))]
    most_similar_index = np.argmax(similarities)
    return most_similar_index

# 进程数
num_processes = 180

# 创建进程池
with multiprocessing.Pool(processes=num_processes) as pool:
    results = list(tqdm(pool.imap(compute_similarity, range(len(addn_bioentity_embeddings))), total=len(addn_bioentity_embeddings)))


100%|██████████| 127604/127604 [24:45<00:00, 85.92it/s] 


#### 6 匹配ID并且Submit

In [23]:
### 创建匹配字典
match_dict = dict(zip(bioentity_name.index.tolist(),bioentity_name.id.tolist()))

addn_bioentity["id"] = results
addn_bioentity

Unnamed: 0,name,id
0,NAD(P)-glutamate dehydrogenase complex,14365
1,4-carboxy-4'-sulphoazobenzene reductase activity,11369
2,Ada Two-A containing complex,4328
3,up regulation of nitrogen utilization,27363
4,RetSat activity,3147
...,...,...
127599,p-benzoquinone reductase activity,11366
127600,activation of glia cell migration,43072
127601,GALT development,17451
127602,upregulation by symbiont of host phagocytosis,28279


In [26]:
### 根据字典匹配ID
addn_bioentity["id"] = addn_bioentity["id"].replace(match_dict)
addn_bioentity

Unnamed: 0,name,id
0,NAD(P)-glutamate dehydrogenase complex,0031027
1,4-carboxy-4'-sulphoazobenzene reductase activity,0018544
2,Ada Two-A containing complex,0005745
3,up regulation of nitrogen utilization,0051175
4,RetSat activity,0004276
...,...,...
127599,p-benzoquinone reductase activity,0018541
127600,activation of glia cell migration,1903975
127601,GALT development,0034270
127602,upregulation by symbiont of host phagocytosis,0052191


In [2]:
bioentity_name

Unnamed: 0,id,name
0,0000001,mitochondrion inheritance
1,0000002,mitochondrial genome maintenance
2,0000003,reproduction
3,0000005,obsolete ribosomal chaperone activity
4,0000006,high-affinity zinc transmembrane transporter a...
...,...,...
47217,part_of,part of
47218,positively_regulates,positively regulates
47219,regulates,regulates
47220,starts_during,starts_during


In [33]:
submit = pd.merge(addn_bioentity,bioentity_name,on="id")
submit.columns = ["name","id","match_name"]
submit = submit[["id","name","match_name"]]
submit

Unnamed: 0,id,name,match_name
0,0031027,NAD(P)-glutamate dehydrogenase complex,glutamate synthase complex (NADH)
1,0031027,NADH-glutamate synthase activity,glutamate synthase complex (NADH)
2,0031027,NADPH-linked glutamate synthase,glutamate synthase complex (NADH)
3,0031027,L-glutamate synthase (NADH),glutamate synthase complex (NADH)
4,0031027,NADH-dependent glutamate synthase activity,glutamate synthase complex (NADH)
...,...,...,...
127599,0001086,TFIIA-class binding transcription factor activity,"obsolete transcription factor activity, TFIIA-..."
127600,0034974,Swi1-Swi3 complex,Swi5-Swi2 complex
127601,0085000,modification by symbiont of host morphology or...,obsolete modification by symbiont of host morp...
127602,0015628,protein secretion by the type II protein secre...,protein secretion by the type II secretion system


In [34]:
submit.to_csv("submission.csv", index=False, encoding="utf_8_sig")