# Dutch HPO concept table
This notebook describes how to create an HPO concept table containing Dutch names, to be used in a named entity recognition and linking tool such as MedCAT. Dutch names are added to HPO concepts in 4 steps:

1. Names from Dutch HPO translations
2. Names from Dutch UMLS (MeSH, MedDRA, ICPC and ICD-10) and SNOMED
3. Manual UMCU additions
4. Removal of UMCU blacklisted names

## Preprocessing translation file
In 2021, we received the Dutch HPO translations `hpo_notes.xliff_nl.zip` from Sebastian Köhler (lead of the HPO translation project on CrowdIn at the time) with permission from David Koolen (Radboud UMC). David's team did the translations, which mostly consist of primary names, plus a few synonyms and definitions (14% of all HPO names, see https://crowdin.com/project/hpo-translation/nl). With David's permission, Sebastian shared the translations with UMC Utrecht.

The translation file, `hpo_notes.xliff_nl.zip`, was manually unzipped and converted to XLSX-format with Excel (Right click file -> Open With -> All apps -> Select Excel -> Wait a few minutes). The resulting file was saved as `hpo_notes.xlsx`.

In [1]:
import os
from pathlib import Path
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

from utils import clean_name_status_column, convert_title_to_lowercase

# Directories that hold the inputs and the resulting concept table
source_hpo_files_path = Path('01_Download/hpo/')
concept_table_path = Path('04_ConceptDB')

# Step 1 input: Dutch HPO translations (XLSX conversion of the translation file)
hpo_translations_file = source_hpo_files_path.joinpath('hpo_notes.xlsx')

# Step 2 input: concept table with Dutch names from UMLS and SNOMED
umls_concept_table_path = Path('04_ConceptDB/umls-dutch_v1.11.csv')

# Step 3 and 4 inputs: manual additions and deletions
hpo_umcu_names_file = source_hpo_files_path.joinpath('hpo-dutch_umcu-names_20221116.csv')
blacklisted_names_file = source_hpo_files_path.joinpath('hpo-dutch_umcu-blacklist_20221601.csv')

# Output: the Dutch HPO concept table written at the end of this notebook
output_file = concept_table_path.joinpath('hpo-dutch_v1.2.csv')

## 1. Names from Dutch HPO translations

In [2]:
# Read HPO translations file; header=1 takes the column names from the second row of the sheet
hpo_translations = pd.read_excel(hpo_translations_file, dtype=str, header=1)

# Rename the XLIFF-path column headers to short names
hpo_translations = hpo_translations.rename(columns={"/file/body/trans-unit/@id": "id",
                            "/file/body/trans-unit/note": "full_name",
                            "/file/body/trans-unit/source": "english_name",
                            "/file/body/trans-unit/target": "name",
                            "/file/body/trans-unit/target/@state": "state"})

# Select columns to keep. The explicit copy makes this an independent frame, so
# adding the 'cui' column below does not trigger a SettingWithCopyWarning.
hpo_translations = hpo_translations[['id', 'full_name', 'english_name', 'name', 'state']].copy()

# Derive the concept ID by removing the type substrings from the translation-unit id
substrings = ['_label', '_definition', '_synonyms'] # Substrings to replace
hpo_translations['cui'] = hpo_translations['id'].str.replace('|'.join(substrings), '', regex=True).str.strip()

# Filter the terms that are translated
total_terms = hpo_translations['cui'].nunique() # Number of distinct concept IDs, counted before filtering
hpo_translations = hpo_translations[hpo_translations['state'] == "translated"]
hpo_translations = hpo_translations.drop(columns=['full_name', 'english_name', 'state'])
hpo_translations.head()

FileNotFoundError: [Errno 2] No such file or directory: '01_Download/hpo/hpo_notes.xlsx'

In [None]:
# Extract type
def extract_type(r):
    """Return the third underscore-separated field of the row's 'id' value."""
    id_parts = r['id'].split('_')
    return id_parts[2]
# Add the translation type taken from each id
hpo_translations['type'] = hpo_translations.apply(extract_type, axis=1)

# Count the names that contain '#'. Vectorised string matching replaces a
# row-by-row iterrows() loop; a missing name counts as "contains no '#'".
counter = int(hpo_translations['name'].str.contains('#', regex=False, na=False).sum())
print('Number of synonyms that have to be corrected because it includes #:', counter)

# Clean names. The synonyms contain '#' at the start and as separators.
def clean_synonyms(r):
    """Split a '#'-separated synonyms field into a list of individual names.

    Parameters
    ----------
    r : mapping with 'id' and 'name' keys (e.g. a DataFrame row)

    Returns
    -------
    list
        ``[name]`` unchanged when the id does not contain 'synonyms' or the
        name contains no '#'. Otherwise the whitespace-stripped fragments
        between the '#' separators, de-duplicated in order of first appearance.
    """
    concept_id = r['id']
    concept_name = r['name']

    # Non-synonyms, and synonyms without a separator, are already clean
    if 'synonyms' not in concept_id or '#' not in concept_name:
        return [concept_name]

    # Clean and split synonym
    cleaned_names = [n.strip() for n in concept_name.strip('#').split('#')]

    # Remove duplicates. dict.fromkeys keeps first-seen order, whereas the
    # iteration order of a set of strings can differ between interpreter runs,
    # which would make the row order of the output non-reproducible.
    return list(dict.fromkeys(cleaned_names))

# Store the list of cleaned names for every row
hpo_translations['cleaned_names'] = hpo_translations.apply(clean_synonyms, axis=1)

# Give every cleaned name its own row and let it take over the 'name' column
hpo_translations_cleaned_names = (
    hpo_translations
    .explode('cleaned_names')
    .drop(columns=['name'])
    .rename(columns={'cleaned_names': 'name'})
)
print(
    'Number of synonyms that are added because multiple synonyms were in same field:',
    len(hpo_translations_cleaned_names) - len(hpo_translations),
)

In [None]:
# Keep only rows of type 'label' or 'synonyms'. Definitions are longer descriptions
# of the concept and less useful for named entity recognition.
is_label_or_synonym = hpo_translations_cleaned_names['type'].isin(['label', 'synonyms'])
hpo_translations_definitions_removed = hpo_translations_cleaned_names.loc[is_label_or_synonym].copy()
print(
    'Number of names that are removed because they are definition: ',
    len(hpo_translations_cleaned_names) - len(hpo_translations_definitions_removed),
)

In [None]:
# Convert title-formatted names to lowercase: the helper is applied once with
# split_char=' ' and then once with split_char='-'
for split_char in (' ', '-'):
    hpo_translations_definitions_removed['name'] = hpo_translations_definitions_removed['name'].apply(
        convert_title_to_lowercase, split_char=split_char
    )

### Finalize structure

In [None]:
hpo_translations_fs = hpo_translations_definitions_removed.copy()

# Impute ontology
hpo_translations_fs['ontologies'] = "HPO_dutch_translation"

# Set name status, see https://github.com/CogStack/MedCAT/blob/master/examples/README.md
# Labels become 'P', synonyms become 'A'; any other type keeps "unknown".
# .loc with a boolean mask writes into the frame itself. Chained indexing
# (frame['name_status'][mask] = ...) would assign to an intermediate Series,
# which raises SettingWithCopyWarning and has no effect under copy-on-write.
hpo_translations_fs['name_status'] = "unknown"
hpo_translations_fs.loc[hpo_translations_fs['type'] == 'label', 'name_status'] = 'P'
hpo_translations_fs.loc[hpo_translations_fs['type'] == 'synonyms', 'name_status'] = 'A'

# Replace the underscores with colons in the concept ID
hpo_translations_fs['cui'] = hpo_translations_fs['cui'].str.replace('_', ':', regex=True)
hpo_translations_fs = hpo_translations_fs.drop(columns=['id', 'type'])

# Print statistics
print('Number of concepts:', len(hpo_translations_fs['cui'].unique()))
print('Number of names:', len(hpo_translations_fs))

hpo_translations_fs.head()

### Manual corrections

In [None]:
# Work on a copy so the uncorrected table stays available
hpo_translations_mc = hpo_translations_fs.copy()

# Correct Respiratory Failure
# Respiratory Insufficiency / C0035229 / 409623005 / HP:0002093 / https://uts.nlm.nih.gov/uts/umls/concept/C0035229
# Respiratory Failure / C1145670 / 409622000 / HP:0002878 / https://uts.nlm.nih.gov/uts/umls/concept/C1145670
is_hp_0002093 = hpo_translations_mc.cui == 'HP:0002093'
is_hp_0002878 = hpo_translations_mc.cui == 'HP:0002878'

# Show the names of both concepts before the correction
print(f"Current HP:0002093\n{hpo_translations_mc.loc[is_hp_0002093, 'name']}\n")
print(f"Current HP:0002878\n{hpo_translations_mc.loc[is_hp_0002878, 'name']}\n")

# Every name row of HP:0002878 is overwritten with the corrected name
hpo_translations_mc.loc[is_hp_0002878, 'name'] = 'respiratoir falen'
print(f"Corrected HP:0002878\n{hpo_translations_mc.loc[is_hp_0002878, 'name']}\n")

## 2. Names from Dutch UMLS and SNOMED

In [None]:
# Load the UMLS concept table as strings, reading only the three listed columns
umls_concepts = pd.read_csv(
    umls_concept_table_path,
    usecols=['cui', 'name', 'ontologies'],
    dtype=str,
)

In [None]:
# Credentials to connect to the UMLS MySQL database, read from the environment (.env file)
load_dotenv()
user = os.getenv('MYSQL_USER')
password = os.getenv('MYSQL_PASSWORD')
host = os.getenv('MYSQL_HOST')
port = os.getenv('MYSQL_PORT')
hpo_translationsbase = os.getenv('MYSQL_DATABASE')  # name of the MySQL database

# Fail early with a clear message instead of building a URL that contains 'None'
missing_settings = [
    env_name
    for env_name, env_value in [
        ('MYSQL_USER', user),
        ('MYSQL_PASSWORD', password),
        ('MYSQL_HOST', host),
        ('MYSQL_PORT', port),
        ('MYSQL_DATABASE', hpo_translationsbase),
    ]
    if env_value is None
]
if missing_settings:
    raise RuntimeError(f"Missing environment variable(s): {', '.join(missing_settings)}")

# Create the connection. User and password are percent-encoded so that special
# characters such as '@', ':' or '/' cannot break the structure of the URL.
connection_string = f'mysql://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{hpo_translationsbase}'
connection = create_engine(connection_string)

In [None]:
# Fetch every distinct (CUI, CODE) pair that MRCONSO lists for source 'HPO'
query = "SELECT DISTINCT CUI, CODE FROM MRCONSO WHERE SAB = 'HPO'"
umls_hpo_mapping = pd.read_sql_query(sql=query, con=connection)

# Report the number of mapping rows and preview them
print(f'HPO concepts with UMLS CUI: {len(umls_hpo_mapping)}')
umls_hpo_mapping.head()

### UMLS concepts that map to multiple HPO concepts
There are a few UMLS concepts that map to multiple HPO concepts. This pipeline does not add names from these UMLS concepts to HPO, because doing so would cause ambiguity.

In [None]:
# Select every row whose CUI occurs more than once (keep=False flags all of them)
multi_hpo_rows = umls_hpo_mapping[umls_hpo_mapping.CUI.duplicated(keep=False)]

# Count each such UMLS concept once. Counting the rows flagged by a plain
# duplicated() would, for example, count a CUI with three HPO codes twice.
print(f'UMLS concepts that map to multiple HPO concepts: {multi_hpo_rows.CUI.nunique()}')

# List a few examples
multi_hpo_rows.sort_values(['CUI']).head()

### Multiple UMLS concepts that map to a single HPO concept


In [None]:
# Rows whose HPO code already occurred in an earlier row (the first occurrence is not flagged)
repeated_code = umls_hpo_mapping.CODE.duplicated()
print(f'Multiple UMLS concepts that map to a single HPO concept: {len(umls_hpo_mapping[repeated_code])}')

# Preview all rows that share their HPO code with another row
shares_code = umls_hpo_mapping.CODE.duplicated(keep=False)
umls_hpo_mapping[shares_code].sort_values(['CODE']).head()

In [None]:
# Drop all rows of every UMLS concept (CUI) that maps to multiple HPO concepts.
# HPO concepts that are reached from multiple UMLS concepts are kept.
umls_hpo_mapping = umls_hpo_mapping.drop_duplicates(subset=['CUI'], keep=False)

In [None]:
# Primary names come from the translations, so every UMLS name gets status 'A'
umls_concepts['name_status'] = 'A'

# Attach the UMLS names to the HPO-UMLS mapping, drop both UMLS identifier
# columns and use the HPO code as the concept ID
umls_hpo_concepts = (
    umls_hpo_mapping
    .merge(umls_concepts, left_on='CUI', right_on='cui', how='inner')
    .drop(columns=['CUI', 'cui'])
    .rename(columns={'CODE': 'cui'})
)
umls_hpo_concepts.head()

### Merge translations and UMLS/SNOMED names

In [None]:
# Size of the translation table: distinct concept IDs and name rows
number_cui_translations = len(hpo_translations_fs['cui'].unique())
number_names_translations = hpo_translations_fs.shape[0]
print(f'Number of concepts from translations: {number_cui_translations}')
print(f'Number of names from translations: {number_names_translations}')

In [None]:
# Size of the UMLS/SNOMED table: distinct concept IDs and name rows
number_cui_umls = len(umls_hpo_concepts['cui'].unique())
number_names_umls = umls_hpo_concepts.shape[0]
print(f'Number of concepts from UMLS and SNOMED: {number_cui_umls}')
print(f'Number of names from UMLS: {number_names_umls}')

In [None]:
# Merge tables. The manually corrected translations (hpo_translations_mc) are
# used here; merging hpo_translations_fs would leave the manual corrections
# out of the final concept table.
hpo_merged = pd.concat([hpo_translations_mc, umls_hpo_concepts])

# Collapse rows with the same concept ID and name, joining their ontologies
# and name statuses with '|'
hpo_merged = hpo_merged.groupby(['cui', 'name'], as_index=False).agg({'ontologies' : '|'.join, 'name_status' : '|'.join})

# Clean name_status column
hpo_merged['name_status'] = hpo_merged['name_status'].apply(clean_name_status_column)
hpo_merged = (
    hpo_merged
    .sort_values(by=['cui', 'name_status'], ascending=[True, False])
    .reset_index(drop=True)
)

# Print statistics
number_cui_merged = len(hpo_merged.cui.unique())
number_names_merged = len(hpo_merged)
print(f'Number of concepts: {number_cui_merged} (+{number_cui_merged - number_cui_translations})')
print(f'Number of names: {number_names_merged} (+{number_names_merged - number_names_translations})')
hpo_merged.head()

## 3. Manual UMCU additions

In [None]:
# Load the manual UMCU additions; all columns are read as strings
hpo_umcu_names = pd.read_csv(hpo_umcu_names_file, dtype='str')

# Clean table: apply the title-case helper once per split character, then
# label every row with the UMCU ontology and name status 'A'
for split_char in (' ', '-'):
    hpo_umcu_names['name'] = hpo_umcu_names['name'].apply(convert_title_to_lowercase, split_char=split_char)
hpo_umcu_names = hpo_umcu_names.assign(ontologies='UMCU', name_status='A')

print('Number of concepts: ', len(hpo_umcu_names.cui.unique()))
print('Number of names: ', len(hpo_umcu_names))
hpo_umcu_names.head()

In [None]:
# Add extra names: append the UMCU names to the merged table
hpo_merged = pd.concat([hpo_merged, hpo_umcu_names])

# Collapse identical (cui, name, name_status) rows and join their ontologies
# with '|'. Rows that differ only in name_status stay separate.
grouped_ontologies = hpo_merged.groupby(['cui', 'name', 'name_status']).agg({'ontologies': '|'.join})
hpo_merged = (
    grouped_ontologies
    .sort_values(['cui', 'name_status', 'name'], ascending=[True, False, True])
    .reset_index()
)

# Statistics, with the change relative to the table before the additions
number_cui_merged_2 = len(hpo_merged.cui.unique())
number_names_merged_2 = len(hpo_merged)
print(f'Number of concepts: {number_cui_merged_2} (+{number_cui_merged_2 - number_cui_merged})')
print(f'Number of names: {number_names_merged_2} (+{number_names_merged_2 - number_names_merged})')
hpo_merged.head()

## 4. Removal of UMCU blacklisted names

In [None]:
# Load the UMCU blacklist; all columns are read as strings
blacklisted_names = pd.read_csv(filepath_or_buffer=blacklisted_names_file, dtype='str')
blacklisted_names.head()

In [None]:
# Case-insensitive removal of every row whose name is on the blacklist
blacklist_lowercase = [blacklisted.lower() for blacklisted in blacklisted_names['name'].tolist()]
is_blacklisted = hpo_merged['name'].str.lower().isin(blacklist_lowercase)
hpo_merged = hpo_merged[~is_blacklisted]

# Statistics, with the change relative to the table before the removal
number_cui_merged_3 = len(hpo_merged.cui.unique())
number_names_merged_3 = len(hpo_merged)
print(f'Number of concepts: {number_cui_merged_3} (-{abs(number_cui_merged_3 - number_cui_merged_2)})')
print(f'Number of names: {number_names_merged_3} (-{abs(number_names_merged_3 - number_names_merged_2)})')

## Write output

In [None]:
# Write the final concept table to CSV without the index column
hpo_merged.to_csv(path_or_buf=output_file, index=False)