In [13]:
import json
import csv
from tqdm import tqdm
from collections import defaultdict
import re

# Semantic type identifiers (TUI) to keep, mapped to their display names.
# read_mrsty() keeps only CUIs that carry at least one of these TUIs, and
# process_mrconso() uses the values as the "types" of each emitted entry.
SEMANTIC_TYPES = {
    'T047': 'Disease or Syndrome',
    'T048': 'Mental or Behavioral Dysfunction',
    'T184': 'Sign or Symptom',
    'T046': 'Pathologic Function',
    'T023': 'Body Part, Organ, or Organ Component',
    'T029': 'Body Location or Region',
    'T121': 'Pharmacologic Substance',
    'T116': 'Amino Acid, Peptide, or Protein',
    'T131': 'Hazardous or Poisonous Substance',
    'T195': 'Antibiotic',
    'T060': 'Diagnostic Procedure',
    'T059': 'Laboratory Procedure',
    'T034': 'Laboratory or Test Result',
    'T201': 'Clinical Attribute',
    'T033': 'Finding',
    'T041': 'Mental Process',
    'T170': 'Intellectual Product'
}

# Term types, ranked for choosing a CUI's standard name: process_mrconso()
# tries them in ascending order of these numbers and stops at the first term
# type that yields an acceptable name.
# NOTE(review): 'PF' is compared against the value read from MRCONSO column 12;
# confirm that 'PF' actually occurs there as a term type.
TTY_PRIORITY = {
    'PF': 1,   # Preferred form
    'PT': 2,   # Designated preferred name
    'MH': 3,   # Main heading
    'SY': 4,   # Designated synonym
    'SYLL': 5, # Preferred spelling
    'RSY': 6,  # Related synonym
    'PN': 7    # Product name
}

# Placeholder words that must never become a standard name: score_name()
# returns -inf for a name that equals one of these after normalisation.
COMMON_WORDS = {
    'various', 'general', 'other', 'others', 'unspecified', 'unspecifieds',
    'miscellaneous', 'unknown', 'nonspecific', 'non-specific', 'multiple',
    'different', 'several', 'diverse', 'misc', 'varied', 'assorted'
}

def read_mrsty(file_path):
    """Read a pipe-delimited MRSTY.RRF file and collect the relevant TUIs per CUI.

    Args:
        file_path: Path to the MRSTY.RRF file (UTF-8).

    Returns:
        A ``defaultdict(list)`` mapping each CUI (column 0) to the list of its
        TUIs (column 1) that are keys of ``SEMANTIC_TYPES``. CUIs without any
        relevant TUI are absent from the mapping.
    """
    cui_to_tui = defaultdict(list)
    print(f"Reading MRSTY from {file_path}...")
    # newline='' is what the csv module expects for files it reads itself.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        # The fields are split on '|' only; QUOTE_NONE stops the csv module
        # from treating a '"' inside a field as the start of a quoted section
        # (which would swallow delimiters and even following lines).
        reader = csv.reader(f, delimiter='|', quoting=csv.QUOTE_NONE)
        for row in tqdm(reader, desc="Reading MRSTY"):
            # Skip short/malformed rows.
            if len(row) < 5:
                continue
            cui, tui = row[0], row[1]
            if tui in SEMANTIC_TYPES:
                cui_to_tui[cui].append(tui)
    return cui_to_tui

def score_name(name):
    """Heuristically score how suitable ``name`` is as a standard entity name.

    Args:
        name: Candidate name string.

    Returns:
        ``float('-inf')`` when the name is empty or is one of ``COMMON_WORDS``
        after normalisation (such names must never be selected); otherwise an
        integer score where higher is better.
    """
    # An empty candidate can never be a usable name; returning early also
    # avoids indexing name[0] below.
    if not name:
        return float('-inf')

    # Collapse whitespace first, then trim spaces and punctuation together so
    # that e.g. ' Other. ' is recognised as the placeholder word 'other'.
    normalized_name = re.sub(r'\s+', ' ', name.lower()).strip(' .,;:!?"\'')
    if normalized_name in COMMON_WORDS:
        return float('-inf')

    score = 0
    length = len(name)

    # Length: favour medium-length names, penalise very long ones.
    if 10 <= length <= 50:
        score += 30
    elif 5 <= length < 10:
        score += 20
    elif length > 50:
        score -= 10

    # Reward names that contain at least one letter.
    if any(c.isalpha() for c in name):
        score += 20

    # Each bracket or slash costs 5 points.
    special_chars = sum(1 for c in name if c in '()[]{}/\\')
    score -= special_chars * 5

    # Casing: all-caps is penalised, "Capitalised rest-lowercase" is rewarded.
    if name.isupper():
        score -= 10
    elif name[0].isupper() and name[1:].islower():
        score += 15

    # Extra penalty for very short all-caps names (abbreviation-like).
    if length <= 3 and name.isupper():
        score -= 15
    return score

def select_best_name(names, tty_scores=None):
    """Pick the highest-scoring candidate from ``names``.

    Args:
        names: Sequence of candidate name strings.
        tty_scores: Optional sequence parallel to ``names`` holding a priority
            number per candidate; a candidate with priority ``p`` receives a
            bonus of ``(8 - p) * 20`` on top of its ``score_name`` score.

    Returns:
        The best candidate, or ``None`` when ``names`` is empty or every
        candidate is rejected by ``score_name`` (score of ``-inf``). Ties on
        score go to the longer name, then to the earlier candidate.
    """
    if not names:
        return None

    rejected = float('-inf')
    ranked = []
    for position, candidate in enumerate(names):
        total = score_name(candidate)
        if total == rejected:
            continue
        if tty_scores and position < len(tty_scores):
            total += (8 - tty_scores[position]) * 20
        ranked.append((candidate, total))

    if not ranked:
        return None

    # min() returns the first minimal element, which matches taking the head
    # of a stable sort on the same key.
    best_name, _ = min(ranked, key=lambda pair: (-pair[1], -len(pair[0])))
    return best_name

def collect_cui_info(mrconso_path, cui_to_tui):
    """Collect the English names of every relevant CUI from MRCONSO.RRF.

    Args:
        mrconso_path: Path to the pipe-delimited MRCONSO.RRF file (UTF-8).
        cui_to_tui: Mapping whose keys are the CUIs to keep (as produced by
            ``read_mrsty``).

    Returns:
        A ``defaultdict`` mapping CUI -> dict with three entries:
        ``'names_by_tty'`` (term type -> list of names), ``'all_mentions'``
        (set of distinct names) and ``'tty_for_name'`` (name -> term type).
        When the same name occurs under several term types, ``tty_for_name``
        keeps the one ranked best in ``TTY_PRIORITY`` (term types missing
        from that table rank last; on a tie the first one seen is kept).
    """
    cui_info = defaultdict(lambda: {
        'names_by_tty': defaultdict(list),  # TTY -> [names]
        'all_mentions': set(),              # every distinct surface form
        'tty_for_name': defaultdict(str)    # name -> best-ranked TTY
    })
    unranked = float('inf')
    print(f"Reading MRCONSO from {mrconso_path}...")
    # newline='' is what the csv module expects for files it reads itself.
    with open(mrconso_path, 'r', encoding='utf-8', newline='') as f:
        # Names may contain '"'; QUOTE_NONE keeps the csv module from treating
        # it as a quote character and merging fields or whole lines.
        reader = csv.reader(f, delimiter='|', quoting=csv.QUOTE_NONE)
        for row in tqdm(reader, desc="Collecting CUI information"):
            if len(row) < 15:
                continue
            cui, lang, tty, name = row[0], row[1], row[12], row[14]
            if lang != 'ENG' or cui not in cui_to_tui:
                continue
            # Collapse whitespace first, then trim spaces and punctuation in
            # one pass so nothing like 'foo ' or 'foo"' is left behind.
            name = re.sub(r'\s+', ' ', name).strip(' .,;:!?"\'')
            if not name or name.isdigit():
                continue
            info = cui_info[cui]
            info['names_by_tty'][tty].append(name)
            info['all_mentions'].add(name)
            # Keep the best-ranked term type instead of whichever row happens
            # to come last, so the recorded TTY is independent of file order.
            known_tty = info['tty_for_name'].get(name)
            if (known_tty is None
                    or TTY_PRIORITY.get(tty, unranked) < TTY_PRIORITY.get(known_tty, unranked)):
                info['tty_for_name'][name] = tty
    return cui_info

def process_mrconso(mrconso_path, cui_to_tui, output_file):
    """Build the mention dictionary and write it as JSON lines.

    For every CUI returned by ``collect_cui_info`` a standard name is chosen
    (term types are tried in ``TTY_PRIORITY`` order, falling back to all
    mentions), then one entry per distinct mention is emitted.

    Args:
        mrconso_path: Path to MRCONSO.RRF.
        cui_to_tui: Mapping CUI -> list of TUIs (keys of ``SEMANTIC_TYPES``).
        output_file: Path of the JSON-lines file to write (UTF-8).

    Returns:
        The number of entries written.
    """
    cui_info = collect_cui_info(mrconso_path, cui_to_tui)
    print("Generating dictionary entries...")

    # The priority order is the same for every CUI: compute it once.
    tty_order = sorted(TTY_PRIORITY, key=TTY_PRIORITY.get)

    entities = []
    for cui, info in tqdm(cui_info.items(), desc="Processing CUIs"):
        types = [SEMANTIC_TYPES[tui] for tui in cui_to_tui[cui]]
        names_by_tty = info['names_by_tty']
        # Sorted so that tie-breaking and output order do not depend on set
        # iteration order (which varies between interpreter runs).
        mentions = sorted(info['all_mentions'])

        standard_name = None
        for tty in tty_order:
            names = names_by_tty.get(tty)
            if not names:
                continue
            tty_scores = [TTY_PRIORITY[tty]] * len(names)
            selected = select_best_name(names, tty_scores)
            if selected:
                standard_name = selected
                break

        # No prioritised term type produced a usable name: try every mention.
        if not standard_name and mentions:
            standard_name = select_best_name(mentions)

        # Without a standard name the CUI is dropped entirely.
        if not standard_name:
            continue

        for mention in mentions:
            tty = info['tty_for_name'][mention]
            entities.append({
                "mention": mention,
                "cui": cui,
                "types": types,
                "from_tty": tty,
                "entity_name": standard_name,
                "is_preferred": tty == 'PF' and mention == standard_name
            })

    print(f"\nWriting {len(entities)} entries to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        for entity in entities:
            json.dump(entity, f, ensure_ascii=False)
            f.write('\n')

    return len(entities)

def main():
    """Run the full pipeline: read MRSTY, then build the mention dictionary."""
    mrsty_path, mrconso_path = 'MRSTY.RRF', 'MRCONSO.RRF'
    output_file = 'umls_mentions.jsonl'

    cui_to_tui = read_mrsty(mrsty_path)
    relevant_cuis = len(cui_to_tui)
    print(f"Found {relevant_cuis} CUIs with relevant semantic types")

    # Process MRCONSO and generate the entity dictionary.
    total_entries = process_mrconso(mrconso_path, cui_to_tui, output_file)

    print("\nProcess completed!")
    print(f"Generated {total_entries} dictionary entries in {output_file}")


if __name__ == "__main__":
    main()


Reading MRSTY from /root/autodl-tmp/entity/umls_data/MRSTY.RRF...


Reading MRSTY: 3549497it [00:04, 801361.35it/s]


Found 1234394 CUIs with relevant semantic types
Reading MRCONSO from /root/autodl-tmp/entity/umls_data/MRCONSO.RRF...


Collecting CUI information: 14036386it [00:54, 258386.14it/s]


Generating dictionary entries...


Processing CUIs: 100%|██████████| 1233983/1233983 [00:26<00:00, 46069.59it/s]



Writing 3335457 entries to /root/autodl-tmp/entity/umls_data/umls_mentions_V5.jsonl...

Process completed!
Generated 3335457 dictionary entries in /root/autodl-tmp/entity/umls_data/umls_mentions_V5.jsonl


In [None]:
# The entity dictionary contains duplicates: simplify it. The source-TTY and
# preferred-name fields are not needed either, so they are dropped.
import json

# Input and output file paths
input_file = "/root/autodl-tmp/entity/umls_data/umls_mentions_V5.jsonl"  # replace with your input file path
output_file = "/root/autodl-tmp/entity/umls_data/simplified_entity_dictionary.json"  # replace with your output file path

# De-duplicated entries, in first-seen order
unique_entities = []

# Keys already emitted, for O(1) duplicate detection
seen_entities = set()

# Read the original dictionary line by line
with open(input_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        entity = json.loads(line.strip())

        # Drop the fields that are no longer needed
        entity.pop("from_tty", None)
        entity.pop("is_preferred", None)

        # Identity of an entry: (mention, cui, types, entity_name)
        entity_key = (
            entity["mention"],
            entity["cui"],
            tuple(entity["types"]),
            entity["entity_name"],
        )

        # Skip anything that has been seen before
        if entity_key in seen_entities:
            continue
        seen_entities.add(entity_key)
        unique_entities.append(entity)

# Save the simplified dictionary, one JSON object per line
with open(output_file, 'w', encoding='utf-8') as outfile:
    for entity in unique_entities:
        outfile.write(json.dumps(entity, ensure_ascii=False) + '\n')

print(f"Simplified entity dictionary saved to {output_file}. Total unique entities: {len(unique_entities)}")


Total mentions with multiple CUIs: 23400
Examples of mentions with multiple CUIs:
Mention: Dipalmitoylphosphatidylcholine, CUIs: ['C0000039', 'C0216971']
Mention: 17-hydroxysteroid dehydrogenase, CUIs: ['C0000165', 'C0525733']


Original dictionary: 3335457 lines, 1233983 unique CUIs
Simplified dictionary: 3335457 lines, 1233983 unique CUIs


Merged entity dictionary saved to /root/autodl-tmp/entity/umls_data/umls_mentions_merged.jsonl
Total number of unique mentions: 3309343



First five lines of the merged file:
{"mention": "(131)I-MAA", "cuis": ["C0000005"], "types": ["Pharmacologic Substance", "Amino Acid, Peptide, or Protein"], "entity_names": ["(131)I-Macroaggregated Albumin"]}
{"mention": "(131)I-Macroaggregated Albumin", "cuis": ["C0000005"], "types": ["Pharmacologic Substance", "Amino Acid, Peptide, or Protein"], "entity_names": ["(131)I-Macroaggregated Albumin"]}
{"mention": "1,2 Dipalmitoyl Glycerophosphocholine", "cuis": ["C0000039"], "types": ["Pharmacologic Substance"], "entity_names": ["Dipalmitoylphosphatidylcholine"]}
{"mention": "Dipalmitoylphosphatidylcholine", "cuis": ["C0000039", "C0216971"], "types": ["Pharmacologic Substance"], "entity_names": ["Colfosceril palmitate", "Dipalmitoylphosphatidylcholine"]}
{"mention": "1,2 Dipalmitoylphosphatidylcholine", "cuis": ["C0000039"], "types": ["Pharmacologic Substance"], "entity_names": ["Dipalmitoylphosphatidylcholine"]}


In [8]:
import json
import re
from collections import defaultdict
import html
from ahocorasick import Automaton
from tqdm import tqdm
import logging

# Entity dictionary (JSON lines) loaded by load_entity_dict().
# NOTE(review): load_entity_dict() reads the plural keys 'cuis' and
# 'entity_names' from every line — confirm this path points at a dictionary
# written in that layout rather than one with singular 'cui'/'entity_name' keys.
ENTITY_FILE = './data/umls_mentions.jsonl'
# Input texts (JSON lines); process_texts_in_batches() reads the 'text' field.
INPUT_FILE = './data/filtered_segments_results.jsonl'
# Destination for the annotated texts (JSON lines).
OUTPUT_FILE = './data/entity_linking.jsonl'
# File that receives the per-line processing errors.
ERROR_LOG_FILE = './data/entity_linking_errors.log'

# Send log records of level ERROR and above to ERROR_LOG_FILE.
logging.basicConfig(filename=ERROR_LOG_FILE, level=logging.ERROR)

def is_all_uppercase_word(word):
    """Return True when ``word`` is at least two characters, all letters, all upper-case."""
    if len(word) < 2:
        return False
    return word.isalpha() and word.isupper()

def is_valid_mention(mention):
    """Decide whether ``mention`` is worth keeping as a dictionary key.

    Rejected: strings of at most one character, digit-only strings, strings
    made only of ``- . , ; : ! ?`` and "number + letters" strings such as
    ``5mg`` or ``10 mg``.
    """
    if len(mention) <= 1:
        return False
    rejected_shapes = (
        r'^\d+$',               # digits only
        r'^[-.,;:!?]+$',        # punctuation only
        r'^\d+\s*[A-Za-z]+$',   # number followed by letters
    )
    return not any(re.match(shape, mention) for shape in rejected_shapes)

def preprocess_text(text, preserve_case=False):
    """Normalise ``text`` for dictionary lookup and matching.

    Steps: decode HTML entities, replace ``|``, ``&`` and ``#`` with spaces,
    collapse every whitespace run to one space, trim spaces and the
    punctuation ``. , ; : ! ? " '`` from both ends, and (unless
    ``preserve_case`` is true) lower-case the result.

    Args:
        text: Raw input string.
        preserve_case: Keep the original casing when true.

    Returns:
        The normalised string (possibly empty).
    """
    text = html.unescape(text)
    text = re.sub(r'[|&#]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Trim spaces and punctuation together: trimming whitespace first and
    # punctuation second leaves leftovers such as 'heart attack" ' for
    # '"Heart attack" .'.
    text = text.strip(' .,;:!?"\'')
    if not preserve_case:
        text = text.lower()
    return text

def load_entity_dict(entity_file):
    """Load the JSON-lines entity dictionary, merged by normalised mention.

    Each line must be a JSON object with the keys ``mention``, ``cuis``,
    ``types`` and ``entity_names``. Mentions that are all-upper-case words
    keep their case; every other mention is lower-cased. Mentions rejected
    by ``is_valid_mention`` are skipped.

    Args:
        entity_file: Path to the dictionary file (UTF-8).

    Returns:
        A ``defaultdict`` mapping normalised mention -> dict of three sets:
        ``'cuis'``, ``'types'`` and ``'entity_names'``.
    """
    print(f"Loading entity dictionary from {entity_file}...")
    merged_fields = ('cuis', 'types', 'entity_names')
    mention_info = defaultdict(lambda: {'cuis': set(), 'types': set(), 'entity_names': set()})
    with open(entity_file, 'r', encoding='utf-8') as f:
        for record in map(json.loads, f):
            surface = record['mention'].strip()
            # Upper-case words keep their case; everything else is
            # lower-cased by preprocess_text itself.
            keep_case = is_all_uppercase_word(surface)
            key = preprocess_text(surface, preserve_case=keep_case)
            if not is_valid_mention(key):
                continue
            slot = mention_info[key]
            for field in merged_fields:
                slot[field].update(record[field])
    print(f"Entity dictionary loaded with {len(mention_info)} unique mentions.")
    return mention_info

def build_automaton(mention_info):
    """Build an Aho-Corasick automaton whose keys and values are the mentions.

    Args:
        mention_info: Mapping whose keys are the normalised mention strings.

    Returns:
        The finalised ``Automaton``; every mention is stored as its own value.
    """
    automaton = Automaton()
    # Longest mentions are inserted first.
    for key in sorted(mention_info, key=len, reverse=True):
        automaton.add_word(key, key)
    automaton.make_automaton()
    return automaton

# Function words that must never be annotated on their own. Built once at
# import time: is_common_word() is called for every candidate match.
_COMMON_FUNCTION_WORDS = frozenset({
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by',
    'for', 'if', 'in', 'into', 'is', 'it', 'no', 'not', 'of', 'on',
    'or', 'such', 'that', 'the', 'their', 'then', 'there', 'these',
    'they', 'this', 'to', 'was', 'will', 'with', 'from', 'has', 'had',
    'have', 'its', 'may', 'were', 'when', 'where', 'who', 'which',
})


def is_common_word(word):
    """Return True when ``word`` is a common English function word.

    The comparison ignores case and any ``. , ; : ! ? " '`` at either end.
    """
    return word.lower().strip('.,;:!?"\'') in _COMMON_FUNCTION_WORDS

def is_valid_boundary(text, start, end):
    """Check that ``text[start:end]`` is a stand-alone, meaningful match.

    The span is rejected when it touches an alphanumeric character on either
    side, when it is at most one character long after trimming whitespace,
    or when it is a common function word.
    """
    touches_left = start > 0 and text[start - 1].isalnum()
    if touches_left or (end < len(text) and text[end].isalnum()):
        return False

    span = text[start:end]
    return len(span.strip()) > 1 and not is_common_word(span)

def annotate_text(text, automaton, mention_info):
    """Find dictionary mentions in ``text`` and return their annotations.

    Lower-case dictionary keys are matched case-insensitively; keys that are
    all-upper-case words (acronyms) are matched case-sensitively, so an
    acronym key only matches text written in upper case. Overlaps are
    resolved in favour of the longest mention (then the earliest start), and
    each distinct dictionary mention is annotated at most once.

    NOTE: ``start``/``end`` and the ``mention`` surface text refer to the
    *preprocessed* text (see ``preprocess_text``), which can differ in length
    from the raw ``text`` (HTML entities, collapsed whitespace, trimmed ends).

    Args:
        text: Raw input text.
        automaton: Automaton built by ``build_automaton``.
        mention_info: Mapping mention -> dict with the sets ``'cuis'``,
            ``'types'`` and ``'entity_names'``.

    Returns:
        A list of dicts with the keys ``mention``, ``cuis``, ``start``,
        ``end``, ``types`` and ``entity_names`` (list values are sorted).
    """
    text_cased = preprocess_text(text, preserve_case=True)
    text_lower = text_cased.lower()
    if len(text_lower) != len(text_cased):
        # A few characters lower-case to more than one code point; leave those
        # untouched so both strings share the same offsets.
        text_lower = ''.join(
            ch.lower() if len(ch.lower()) == 1 else ch for ch in text_cased
        )

    matches = []

    # Pass 1: case-insensitive matching of the lower-cased dictionary keys.
    for end_index, mention in automaton.iter(text_lower):
        if is_all_uppercase_word(mention):
            continue
        start_index = end_index - len(mention) + 1
        if is_valid_boundary(text_lower, start_index, end_index + 1):
            matches.append((start_index, end_index + 1, mention))

    # Pass 2: acronym keys keep their case in the dictionary, so they can only
    # be found in the case-preserved text. Skipped when the text has no
    # upper-case characters at all.
    if text_cased != text_lower:
        for end_index, mention in automaton.iter(text_cased):
            if not is_all_uppercase_word(mention):
                continue
            start_index = end_index - len(mention) + 1
            if is_valid_boundary(text_cased, start_index, end_index + 1):
                matches.append((start_index, end_index + 1, mention))

    # Longest mention first, then leftmost; greedily keep non-overlapping spans.
    matches.sort(key=lambda m: (-len(m[2]), m[0]))
    kept = []
    for start, end, mention in matches:
        if not any(start < k_end and end > k_start for k_start, k_end, _ in kept):
            kept.append((start, end, mention))

    annotations = []
    seen_mentions = set()
    for start, end, mention in kept:
        # Only the first kept occurrence of each dictionary mention is reported.
        if mention in seen_mentions:
            continue
        seen_mentions.add(mention)
        entities = mention_info.get(mention, {})
        annotations.append({
            # Slice the same string the offsets were computed on.
            "mention": text_cased[start:end],
            "cuis": sorted(entities.get('cuis', ())),
            "start": start,
            "end": end,
            "types": sorted(entities.get('types', ())),
            "entity_names": sorted(entities.get('entity_names', ()))
        })
    return annotations

def process_texts_in_batches(input_file, output_file, batch_size=1000):
    """Annotate every text of a JSON-lines file and write the results.

    The entity dictionary is loaded from ``ENTITY_FILE``. Each input line must
    be a JSON object with a non-empty ``text`` field; lines that cannot be
    parsed or annotated are logged through ``logging.error`` and skipped.

    Args:
        input_file: Path to the input JSON-lines file (UTF-8).
        output_file: Path of the JSON-lines file to write; each line holds
            ``{"text": ..., "annotations": [...]}``.
        batch_size: Number of results buffered before they are written out.
    """
    print(f"Processing texts from {input_file}...")
    mention_info = load_entity_dict(ENTITY_FILE)
    automaton = build_automaton(mention_info)

    # Count the lines for the progress bar, closing the handle afterwards.
    with open(input_file, 'r', encoding='utf-8') as counting_handle:
        total_lines = sum(1 for _ in counting_handle)

    def flush(pending, handle):
        # Write the buffered results as JSON lines and empty the buffer.
        handle.writelines(
            json.dumps(item, ensure_ascii=False) + '\n' for item in pending
        )
        pending.clear()

    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        batch = []
        pbar = tqdm(total=total_lines, desc="Processing texts", unit="lines")
        try:
            for line_number, line in enumerate(infile, start=1):
                pbar.update(1)
                # Only parsing/annotation is best-effort; write errors are not
                # swallowed here, so a failed write cannot leave the batch
                # half-written and retried.
                try:
                    data = json.loads(line)
                    text = data.get('text', '')
                    if not text:
                        raise ValueError("No text found in data.")
                    annotations = annotate_text(text, automaton, mention_info)
                except Exception as e:
                    logging.error(f"Error processing line {line_number}: {str(e)}")
                    continue

                batch.append({
                    "text": text,
                    "annotations": annotations
                })
                if len(batch) >= batch_size:
                    flush(batch, outfile)

            # Write whatever is left after the last full batch.
            if batch:
                flush(batch, outfile)
        finally:
            # Close the progress bar even when processing aborts.
            pbar.close()

    print(f"Processing completed. Results saved to {output_file}.")
    print(f"Errors logged to {ERROR_LOG_FILE}.")


if __name__ == "__main__":
    process_texts_in_batches(INPUT_FILE, OUTPUT_FILE, batch_size=1000)


Processing texts from /root/autodl-tmp/entity/filtered_segments_results.jsonl...
Loading entity dictionary from /root/autodl-tmp/entity/umls_data/umls_mentions_merged.jsonl...
Entity dictionary loaded with 3147291 unique mentions.


Processing texts: 100%|██████████| 2681323/2681323 [1:45:52<00:00, 422.08lines/s]  


Processing completed. Results saved to /root/autodl-tmp/entity_linking_v2.jsonl.
Errors logged to /root/autodl-tmp/entity_linking_errors.log.


Total lines: 2681323
Random line 1: {'text': 'The two formulations were characterised with regard to size, PDI, zeta potential, and encapsulation efficiencies of OVACpG Table Physicochemical Characteristics of the PLGA NPs with OVACpG used for the AqueousNP Formulation and the dMNA. The Physicochemical Characteristics of the dMNANP Formulation are Measured while the Formulation is Liquid i. e. Before they are Added to the dMNA Average SD of three technical replicates. Size intensityweighted mean hydrodynamic particle diameters, PDI polydispersity index, ZP zeta potential, EE encapsulation efficiency dMNAs were fabricated with three different polymer formulation 5 wv PVA, 5 wv PVP, and 30 wv trehalose. All three formulations successfully formed nine sharp microneedle tips in each array Fig. dMNAs were fabricated with three different polymer formulation 5 wv PVA, 5 wv PVP, and 30 wv trehalose. Fabricated dMNAs with The Based on the results of PLGA NPs stability in the previous section, 5

Token indices sequence length is longer than the specified maximum sequence length for this model (555 > 512). Running this sequence through the model will result in indexing errors


Processing texts from /root/autodl-tmp/entity/filtered_segments_results.jsonl...
Loading entity dictionary from /root/autodl-tmp/entity/umls_data/umls_mentions_V5.jsonl...
Entity dictionary loaded with 3147291 unique mentions.
2091117 lines have already been processed. Resuming from line 2091118.


Processing texts: 100%|██████████| 590206/590206 [30:47<00:00, 319.42lines/s]  


Processing completed. Results saved to /root/autodl-tmp/entity_linking_results.jsonl.
Errors logged to /root/autodl-tmp/entity_linking_errors.log.
