# Dutch SNOMED to concept table
This notebook describes how to create a SNOMED concept table containing Dutch names, to be used in a named entity recognition and linking tool such as MedCAT. 

In [2]:
import json
import os
import re
import numpy as np
import pandas as pd
from pathlib import Path

# Input
nl_terms = Path('01_Download/SnomedCT_ManagedServiceNL_PRODUCTION_NL1000146_20230331T120000Z/Snapshot/Terminology/sct2_Description_Snapshot-nl_NL1000146_20230331.txt')

# Output
output_file_name = '04_ConceptDB/snomedct-dutch_v1.3.csv'
output_file_name_unfiltered = '04_ConceptDB/snomedct-dutch_v1.3-unfiltered.csv'

## Source files

### SNOMED September 2020 release
| Edition | Total names | NL names | Description |
| - | - | - | - |
| Edition | 2263140 | 736619 | Include international SNOMED |
| Extension | 770016 | 736619 | Some terms are only in English |
| Patient Friendly | 1437 | 1437 | Small but potentially useful list of synonyms |

### SNOMED March 2021 release
| Edition | Total names | NL names | Description |
| - | - | - | - |
| Edition | 2422738 | 880806 | Include international SNOMED |
| Extension | 916553 | 880806 | Some terms are only in English |
| Patient Friendly | 2004 | 2004 | Small but potentially useful list of synonyms |

### SNOMED September 2021 release

| Edition | Total names | NL names | Description |
| - | - | - | - |
| Edition | 2469845 | 910228 | Include international SNOMED |
| Extension | 948571 | 910228 | Some terms are only in English |
| Patient Friendly | 2385 | 2385 | Small but potentially useful list of synonyms |


### SNOMED September 2022 release
To count the total number of records in the SNOMED files, use `wc -l` and subtract 1 for the header line.
```bash
wc -l SnomedCT_*/Snapshot/Terminology/sct2_Description_*.txt
```

| File | Language | Names |
| - | - | - |
| sct2_Description_Snapshot-en_NL1000146_20220930.txt | English | 1611131 |
| sct2_Description_Snapshot-nl_NL1000146_20220930.txt | Dutch | 945292 |


### Description type
The SNOMED description table contains 3 types of records:

|Type id|Term|
|-|-|
|900000000000003001|Fully specified name (FSN)|
|900000000000013009|Synonym|
|900000000000550004|Definition|

For the purpose of creating a list of names for entity recognition, terms must be:
- FSN or Synonym
- Active

In [3]:
def parse_file(filename):
    with open(filename, encoding='utf-8') as f:
        entities = [[n.strip() for n in line.split('\t')] for line in f]
        return pd.DataFrame(entities[1:], columns=entities[0])

In [4]:
# Select active terms
df_terms = parse_file(nl_terms)
df_terms_active = df_terms.loc[df_terms.active == '1'].copy()
df_terms_inactive = df_terms.loc[df_terms.active != '1'].copy()
print(f'Active terms: {df_terms_active.shape[0]}')
print(f'Inactive terms: {df_terms_inactive.shape[0]}\n')

# Extract fully specified names
df_fsn = df_terms_active[(df_terms_active.typeId == '900000000000003001')].copy()
print(f'Active FSN: {df_fsn.shape[0]}')

# Extract synonyms
df_terms_synonyms = df_terms_active[(df_terms_active.typeId == '900000000000013009')].copy()
print(f'Active Synonyms: {df_terms_synonyms.shape[0]}\n')

Active terms: 911846
Inactive terms: 75611

Active FSN: 287974
Active Synonyms: 623872



### Primary concepts

In [5]:
df_primary_concepts = df_fsn.copy()
df_primary_concepts['tui'] = df_primary_concepts['term'].str.extract(r"\(([^)]*)\)[^(]*$")
df_primary_concepts['str'] = df_primary_concepts['term'].str.extract(r"(^[^\(]+)")
df_primary_concepts['str'] = df_primary_concepts['str'].str.strip()
df_primary_concepts = df_primary_concepts[['conceptId', 'typeId', 'tui', 'str']]

# Create CUI-TUI mapping
cui_tui_mapping = dict(zip(df_primary_concepts.conceptId, df_primary_concepts.tui))
df_primary_concepts.head()

Unnamed: 0,conceptId,typeId,tui,str
8689,716203000,900000000000003001,aandoening,gedecompenseerde levercirrose
8692,33064008,900000000000003001,aandoening,aandoening van sclera
8694,83921000119106,900000000000003001,aandoening,complex regionaal pijnsyndroom type 1 van hoof...
8696,301000146105,900000000000003001,verrichting,fundus autofluorescentie
8698,284196006,900000000000003001,aandoening,brandwond van huid


### Synonyms

In [6]:
# Clean synonym table
df_synonyms = df_terms_synonyms.copy()
df_synonyms = df_synonyms[['conceptId', 'term', 'typeId']]
df_synonyms.rename({'term': 'str'}, inplace=True, axis=1)

# Add TUI to synonyms
df_synonyms['tui'] = df_synonyms.conceptId.map(cui_tui_mapping)
# df_synonyms[df_synonyms['tui'].isna()].shape
# 67 synonyms without type

df_synonyms.head()

Unnamed: 0,conceptId,str,typeId,tui
1,72124005,afwijkende pupilfunctie,900000000000013009,aandoening
3,232140004,accommodatieve insufficiëntie,900000000000013009,aandoening
7,31053003,aandoening van traanapparaat,900000000000013009,aandoening
8,70992005,anterieure synechiae,900000000000013009,aandoening
9,410692006,anterieure uveïtis,900000000000013009,aandoening


### Combined

In [7]:
# Create combined 
df_all = pd.concat([df_primary_concepts, df_synonyms]).reset_index(drop=True)
df_all.rename({'typeId': 'tty', 'conceptId': 'cui'}, inplace=True, axis=1)

# Map to MedCAT's P (Preferred term) & A values
# See https://github.com/CogStack/MedCAT/blob/master/examples/README.md
df_all.tty.replace({'900000000000003001': 'P',
                    '900000000000013009': 'A'}, inplace=True)

# Use convention in UMLS where default English SNOMED is called SNOMEDCT_US
df_all['sab'] = 'SNOMEDCT_NL'
df_all = df_all[['cui', 'str', 'tty', 'tui', 'sab']]

# Drop synonyms that are the same as primary name
print(f'Records before dropping duplicates: {df_all.shape[0]}')
df_all_unique = df_all.drop(df_all[(df_all.duplicated(subset=['cui', 'str', 'tui'], keep=False)) & (df_all.tty=='A')].index)
print(f'Records after dropping duplicates: {df_all_unique.shape[0]}')

# Sort column on cui and tty
df_all_unique['cui'] = df_all_unique['cui'].astype(int)
df_all_unique.sort_values(['cui', 'tty'], ascending=[True, False], inplace=True)

df_all_unique.head(25)

Records before dropping duplicates: 911846
Records after dropping duplicates: 624446


Unnamed: 0,cui,str,tty,tui,sab
134310,103007,eekhoorn-fibroomvirus,P,organisme,SNOMEDCT_NL
66207,104001,excisie van afwijkend weefsel van patella,P,verrichting,SNOMEDCT_NL
419005,104001,excisie van laesie van knieschijf,A,verrichting,SNOMEDCT_NL
28611,106004,structuur van posterieure carpale regio,P,lichaamsstructuur,SNOMEDCT_NL
335161,106004,posterieur gebied van handwortel,A,lichaamsstructuur,SNOMEDCT_NL
335162,106004,posterieur carpaal gebied,A,lichaamsstructuur,SNOMEDCT_NL
122840,107008,structuur van pars foetalis placentae,P,lichaamsstructuur,SNOMEDCT_NL
534082,107008,foetaal deel van placenta,A,lichaamsstructuur,SNOMEDCT_NL
549892,107008,pars foetalis placentae,A,lichaamsstructuur,SNOMEDCT_NL
84799,108003,gehele vena emissaria condylaris,P,lichaamsstructuur,SNOMEDCT_NL


In [8]:
# A few rows contain NaN
# Easiest way to deal with it is drop them. 
display(df_all_unique[df_all_unique.isnull().any(axis=1)].head())
print(len(df_all_unique))
df_all_unique.dropna(inplace=True)
print(len(df_all_unique))

Unnamed: 0,cui,str,tty,tui,sab
837977,35471006,Curvularia spicifera,A,,SNOMEDCT_NL
881462,35471006,Conchiobolus spicifer,A,,SNOMEDCT_NL
288592,51823001,Hydrogenibacillus schlegelii,A,,SNOMEDCT_NL
902200,58000006,ontslag van patiënt (verrichting),A,,SNOMEDCT_NL
902201,58000006,ontslag van patiënt,A,,SNOMEDCT_NL


624446
624398


### Examples

In [9]:
df_all_unique[df_all_unique.str == 'ALS']

Unnamed: 0,cui,str,tty,tui,sab
532859,86044005,ALS,A,aandoening,SNOMEDCT_NL


In [10]:
df_all_unique[df_all_unique.cui == 86044005]

Unnamed: 0,cui,str,tty,tui,sab
20916,86044005,amyotrofische laterale sclerose,P,aandoening,SNOMEDCT_NL
532859,86044005,ALS,A,aandoening,SNOMEDCT_NL


In [11]:
df_all_unique[df_all_unique.str == 'longkanker']

Unnamed: 0,cui,str,tty,tui,sab
489886,93880001,longkanker,A,aandoening,SNOMEDCT_NL


In [12]:
df_all_unique[df_all_unique.cui == 93880001]

Unnamed: 0,cui,str,tty,tui,sab
37353,93880001,primair maligne neoplasma van long,P,aandoening,SNOMEDCT_NL
489886,93880001,longkanker,A,aandoening,SNOMEDCT_NL


In [13]:
df_all_unique[df_all_unique.cui == 22298006]

Unnamed: 0,cui,str,tty,tui,sab
11184,22298006,myocardinfarct,P,aandoening,SNOMEDCT_NL
309564,22298006,hartinfarct,A,aandoening,SNOMEDCT_NL
309565,22298006,MI,A,aandoening,SNOMEDCT_NL
309566,22298006,hartaanval,A,aandoening,SNOMEDCT_NL


In [14]:
df_all_unique[df_all_unique.str == 'methotrexaat']

Unnamed: 0,cui,str,tty,tui,sab
107980,387381009,methotrexaat,P,substantie,SNOMEDCT_NL


In [15]:
df_all_unique[df_all_unique.cui == 387381009]

Unnamed: 0,cui,str,tty,tui,sab
107980,387381009,methotrexaat,P,substantie,SNOMEDCT_NL
518119,387381009,MTX,A,substantie,SNOMEDCT_NL


## Evaluation of SNOMED types

To select the types relevant for named entity linking, we assessed the performance of a MedCAT model on a set of example documents using the unfiltered SNOMED terms. We noticed some types are not useful for our general purpose (named entity recognition), and introduce false positives and ambiguity. We exclude the less useful types from our concept table.

In [16]:
df_all_unique.tui.value_counts()

tui
aandoening                       200406
verrichting                      125668
lichaamsstructuur                102603
bevinding                         68631
organisme                         32822
fysiek object                     21447
substantie                        13155
situatie                          12524
afwijkende morfologie             10679
waarneembare entiteit              6380
regime/therapie                    5725
kwalificatiewaarde                 4373
monster                            4025
gebeurtenis                        3695
beroep                             3510
omgeving                           1664
cel                                1618
celstructuur                       1116
farmaceutisch product               987
persoon                             709
eigenschap                          565
metadata                            556
gegevensobject                      507
fysische kracht                     286
religie/filosofie                   

In [20]:
# Preview the combined concept table
df_all_unique

Unnamed: 0,cui,str,tty,tui,sab
134310,103007,eekhoorn-fibroomvirus,P,organisme,SNOMEDCT_NL
66207,104001,excisie van afwijkend weefsel van patella,P,verrichting,SNOMEDCT_NL
419005,104001,excisie van laesie van knieschijf,A,verrichting,SNOMEDCT_NL
28611,106004,structuur van posterieure carpale regio,P,lichaamsstructuur,SNOMEDCT_NL
335161,106004,posterieur gebied van handwortel,A,lichaamsstructuur,SNOMEDCT_NL
...,...,...,...,...,...
850293,987840791000119102,DADA2,A,aandoening,SNOMEDCT_NL
196371,989065441000087103,Leptotrichia amnionii,P,organisme,SNOMEDCT_NL
136674,998010041000087101,Legionella hackeliae serogroep 2,P,organisme,SNOMEDCT_NL
196335,999480551000087103,Aspergillus japonicus,P,organisme,SNOMEDCT_NL


| tui | usefulness for NER | useful examples for NER | useless examples for NER |
| :- | :- | :- | :-|
|aandoening |good|hypertensie, boezemfibrilleren, av-blok| |
|monster|good|trombocyten, leukocyten,basofiele granulocyten||
|regime/therapie|good|fysiotherapie, hartrevalidatie, therapie||
|waarneembare entiteit|good|leeftijd, bloeddruk, hartas, LVEF||
|bevinding|good|koorts, zwelling, tachycardie||
|attribuut|good|bij, na||
|kwalificatiewaarde|good|ontslag, beloop, gestaakt, geen||
|afwijkende morfologie|good|thermisch letsel, blaar, luxatie||
|cel|good, but rare|erythrocytes||
|gegevensobject|good, but rare| Echocardiogram, operatieverslag||
|sociaal concept|good, but rare|familie||
|situatie|good, but includes negation|geen pijn, geen dispneu, geen hoesten||
|verrichting |decent|lokale anesthesie, lichamelijk onderzoek, palpatie|erg (elektroretinografie), weken, post (peritoneale transfer van eicel en sperma), beleid (management)|
|substantie|decent|nebivolol, amlodipine, ceftriaxon|wortel, PM (fijnstof)|
|omgeving|decent|ziekenhuis, polikliniek, huis, afdeling cardiologie, afdeling fysiologie|meer, stroop, plaats|
|lichaamsstructuur|decent|pols, aortaklep, AV-knoop|mid (mesioincisodistale vlakken van gebitselement)|
|persoon|decent|patient, dochter, vader|bekende (kennis)|
|fysische kracht|decent, but rare|druk||
|fysiek object|debatable, but rare|pacemaker|verband|
|beroep|bad||rechter, belang (behanger), herkende (werkende)|
|metadata|bad||beeld|
|gebeurtenis|bad, only 1 term matched||het (hoog energetisch trauma)|

Terms that were never found: omgeving/locatie, organisme, physical object: fout, religie/filosofie, product, disorder, navigatieconcept, lifestyle, procedure, gradering, tumorgradering, beoordelingsschaal, inactief concept, speciaal concept, ras, foundation metadata concept, physical object, eigenschap, celstructuur

In [17]:
# Total number of concepts
df_all_unique.shape[0]

624398

In [18]:
# Number of primary concepts
df_all_unique[df_all_unique.tty == 'P'].shape

(287973, 5)

In [19]:
# Number of synonyms
df_all_unique[df_all_unique.tty == 'A'].shape

(336425, 5)

In [21]:
# Types (tui values) that are kept in the filtered concept table
relevant_tuis = [
    'aandoening',
    'monster',
    'regime/therapie',
    'bevinding',
    'afwijkende morfologie',
    'cel',
    'gegevensobject',
    'verrichting',
    'substantie',
    'lichaamsstructuur',
]

In [22]:
df_all_unique[df_all_unique.tui.isin(relevant_tuis)].tui.value_counts()

tui
aandoening               200406
verrichting              125668
lichaamsstructuur        102603
bevinding                 68631
substantie                13155
afwijkende morfologie     10679
regime/therapie            5725
monster                    4025
cel                        1618
gegevensobject              507
Name: count, dtype: int64

In [23]:
df_all_unique[df_all_unique.tui.isin(relevant_tuis)].shape[0]

533017

## Output

In [24]:
# Unfiltered concepts
print(f'Number of concepts: {len(df_all_unique.cui.unique())}')
print(f'Number of names: {len(df_all_unique)}')
df_all_unique.to_csv(output_file_name_unfiltered, index=False)

Number of concepts: 287974
Number of names: 624398


In [25]:
# Filtered concepts
filtered_concepts = df_all_unique[df_all_unique.tui.isin(relevant_tuis)]
print(f'Number of concepts: {len(filtered_concepts.cui.unique())}')
print(f'Number of names: {len(filtered_concepts)}')
filtered_concepts.to_csv(output_file_name, index=False)

Number of concepts: 235210
Number of names: 533017
