# Create a MedCAT CDB

In [53]:
from medcat.cdb import CDB
from medcat.cat import CAT
from medcat.utils.vocab import Vocab
from medcat.cdb_maker import CDBMaker
from medcat.config import Config
import os
import pandas as pd

import spacy
import en_core_sci_md

In [15]:
# Input: path of the de-identification terminology CSV.
csv_path = 'cui_05082021.csv'
# Output: file name under which the built CDB is saved.
output_cdb = '05082021_basic_deid_cdb_wtout_names.dat'

## Load / inspect data

In [16]:
# Read the terminology CSV at `csv_path` into a DataFrame and display it.
# NOTE(review): the frame is bound to the name `csv`, which would shadow the
# standard-library `csv` module if that were ever imported in this notebook.
csv = pd.read_csv(csv_path)
csv

Unnamed: 0,str,cui,onto,tty,tui,sty,desc
0,de-identification_root_concept,R0000,cat_anon,PN,,,root concept of de-identification
1,name,N1000,cat_anon,PN,,,surname and forename
2,contact_details,C2000,cat_anon,PN,,,non hospital identification and contact details
3,healthcare_identifier,H3000,cat_anon,PN,,,hospital derived ID
4,date,D4000,cat_anon,PN,,,personal dates
5,fore_name,N1100,cat_anon,PN,,,"given name including middle names (each name, ..."
6,surname,N1200,cat_anon,PN,,,all surnames
7,initials,N1300,cat_anon,PN,,,all initials (initials that aren't seperated b...
8,address,C2100,cat_anon,PN,,,address and postcode (including a comma or ful...
9,address_line,C2110,cat_anon,PN,,,all address line items including city and country


In [17]:
# Show the frame again; nothing has modified it since it was loaded, so the
# output is the same as in the previous cell.
csv

Unnamed: 0,str,cui,onto,tty,tui,sty,desc
0,de-identification_root_concept,R0000,cat_anon,PN,,,root concept of de-identification
1,name,N1000,cat_anon,PN,,,surname and forename
2,contact_details,C2000,cat_anon,PN,,,non hospital identification and contact details
3,healthcare_identifier,H3000,cat_anon,PN,,,hospital derived ID
4,date,D4000,cat_anon,PN,,,personal dates
5,fore_name,N1100,cat_anon,PN,,,"given name including middle names (each name, ..."
6,surname,N1200,cat_anon,PN,,,all surnames
7,initials,N1300,cat_anon,PN,,,all initials (initials that aren't seperated b...
8,address,C2100,cat_anon,PN,,,address and postcode (including a comma or ful...
9,address_line,C2110,cat_anon,PN,,,all address line items including city and country


In [18]:
# Remove the `sty` column (shown as empty in the preview above).
# `errors='ignore'` keeps this cell re-runnable: a later cell writes the
# transformed frame back over the source CSV, so on a second run the column is
# already absent and a strict drop would raise KeyError.
csv.drop(columns=['sty'], errors='ignore', inplace=True)

In [19]:
# Old header -> new header for every column that is kept.
# NOTE(review): the new names are presumably the column names expected by
# MedCAT's CDBMaker.prepare_csvs — confirm against the MedCAT documentation.
column_renames = {
    "str": "name",
    "cui": "cui",  # identity mapping: this header is unchanged
    "onto": "ontologies",
    "tty": "name_status",
    "tui": "type_ids",
    "desc": "description",
}
csv.rename(columns=column_renames, inplace=True)

In [20]:
# Recode the name-status flag: every "PN" value becomes "P".
csv['name_status'] = csv['name_status'].replace({"PN": "P"})

In [25]:
# Inspect the frame after the column drop, the renames and the status recoding.
csv

Unnamed: 0,name,cui,ontologies,name_status,type_ids,description
0,de-identification_root_concept,R0000,cat_anon,P,,root concept of de-identification
1,name,N1000,cat_anon,P,,surname and forename
2,contact_details,C2000,cat_anon,P,,non hospital identification and contact details
3,healthcare_identifier,H3000,cat_anon,P,,hospital derived ID
4,date,D4000,cat_anon,P,,personal dates
5,fore_name,N1100,cat_anon,P,,"given name including middle names (each name, ..."
6,surname,N1200,cat_anon,P,,all surnames
7,initials,N1300,cat_anon,P,,all initials (initials that aren't seperated b...
8,address,C2100,cat_anon,P,,address and postcode (including a comma or ful...
9,address_line,C2110,cat_anon,P,,all address line items including city and country


In [26]:
# Persist the transformed frame back to the file the CDB is built from.
# - The target is `csv_path` rather than a repeated file-name literal, so the
#   file that is written and the file that is read later cannot drift apart.
# - Rows without a `name` are dropped before writing: in top-to-bottom order
#   this cell runs before the in-memory `dropna` cell below, so without this
#   the saved file (which is what `prepare_csvs` reads) would keep those rows.
# NOTE: this overwrites the source CSV with the renamed columns.
csv.dropna(subset=['name']).to_csv(csv_path, index=False)

In [23]:
# Discard every row that has no value in `name`; the frame is modified in place.
csv.dropna(subset=['name'], inplace=True)

In [24]:
# Sanity check: list any rows still missing `name` (expected result: no rows).
csv.loc[csv['name'].isna()]

Unnamed: 0,name,cui,ontologies,name_status,type_ids,description


In [42]:
# Export a copy of the cleaned terminology to the current user's Downloads
# folder. The folder is resolved from the home directory instead of a
# hard-coded /Users/<name>/ path, so the cell also works on other machines;
# it is created if it does not exist yet.
downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")
os.makedirs(downloads_dir, exist_ok=True)
csv.to_csv(os.path.join(downloads_dir, "full_cui_medcat040221.csv"), index=False)

## Create CDB

In [27]:
# Instantiate a MedCAT Config with no arguments
# (presumably the library defaults — confirm in the MedCAT docs).
config = Config()

In [54]:
# Create the CDBMaker used to build the concept database, passing it `config`.
maker = CDBMaker(config)

In [55]:
# Build the CDB from the CSV file on disk. This reads `csv_path`, not the
# in-memory `csv` DataFrame, so only changes that were written back to that
# file are reflected in the resulting CDB.
# NOTE(review): `full_build=True` presumably also stores non-essential concept
# metadata such as descriptions — confirm against the CDBMaker documentation.
cdb = maker.prepare_csvs([csv_path], full_build=True)

Started importing concepts from: cui_05082021.csv
Current progress: 0% at 0.000s per 0 rows
Current progress: 4% at 0.277s per 0 rows
Current progress: 8% at 0.006s per 0 rows
Current progress: 12% at 0.010s per 0 rows
Current progress: 17% at 0.009s per 0 rows
Current progress: 21% at 0.007s per 0 rows
Current progress: 25% at 0.013s per 0 rows
Current progress: 29% at 0.011s per 0 rows
Current progress: 33% at 0.013s per 0 rows
Current progress: 38% at 0.008s per 0 rows
Current progress: 42% at 0.011s per 0 rows
Current progress: 46% at 0.010s per 0 rows
Current progress: 50% at 0.010s per 0 rows
Current progress: 54% at 0.011s per 0 rows
Current progress: 58% at 0.010s per 0 rows
Current progress: 62% at 0.010s per 0 rows
Current progress: 67% at 0.010s per 0 rows
Current progress: 71% at 0.014s per 0 rows
Current progress: 75% at 0.011s per 0 rows
Current progress: 79% at 0.009s per 0 rows
Current progress: 83% at 0.013s per 0 rows
Current progress: 88% at 0.009s per 0 rows
Current

In [49]:
# Remove all synonyms of concepts (only needed in rare use cases)
# cdb.addl_info['cui2original_names'] = {}

In [56]:
# Inspect the CDB's name -> CUI lookup. In the output the `_`/`-` separators of
# the source names appear as `~`, and some names have an extra variant
# (e.g. `initial` next to `initials`).
cdb.name2cuis

{'de~identification~root~concept': ['R0000'],
 'name': ['N1000'],
 'contact~details': ['C2000'],
 'contact~detail': ['C2000'],
 'healthcare~identifier': ['H3000'],
 'date': ['D4000'],
 'fore~name': ['N1100'],
 'surname': ['N1200'],
 'initials': ['N1300'],
 'initial': ['N1300'],
 'address': ['C2100'],
 'address~line': ['C2110'],
 'postcode': ['C2120'],
 'telephone~number': ['C2200'],
 'email': ['C2300'],
 'identification': ['C2400'],
 'passport~number': ['C2410'],
 'driving~licence~number': ['C2420'],
 'national~insurance': ['C2430'],
 'nhs~number': ['H3100'],
 'hospital~number': ['H3200'],
 'emergency~department~number': ['H3300'],
 'lab~number': ['H3400'],
 'gmc~number': ['H3500'],
 'date~of~birth': ['H4100'],
 'url': ['C2500']}

In [57]:
# Display the source frame again, presumably to compare it with the CDB names
# listed in the previous cell.
csv

Unnamed: 0,name,cui,ontologies,name_status,type_ids,description
0,de-identification_root_concept,R0000,cat_anon,P,,root concept of de-identification
1,name,N1000,cat_anon,P,,surname and forename
2,contact_details,C2000,cat_anon,P,,non hospital identification and contact details
3,healthcare_identifier,H3000,cat_anon,P,,hospital derived ID
4,date,D4000,cat_anon,P,,personal dates
5,fore_name,N1100,cat_anon,P,,"given name including middle names (each name, ..."
6,surname,N1200,cat_anon,P,,all surnames
7,initials,N1300,cat_anon,P,,all initials (initials that aren't seperated b...
8,address,C2100,cat_anon,P,,address and postcode (including a comma or ful...
9,address_line,C2110,cat_anon,P,,all address line items including city and country


In [58]:
# Write the built CDB to the file named by `output_cdb`.
cdb.save(output_cdb)

## Load CDB and inspect

In [59]:
# Load the CDB back from the file that was just written, into a separate
# variable so the freshly built `cdb` stays available.
cdb2 = CDB.load(output_cdb)

In [60]:
# Display the reloaded object; its repr only shows the type and memory address.
cdb2

<medcat.cdb.CDB at 0x7f94ca5e03a0>