# Create a MedCAT CDB

In [30]:
from medcat.cdb import CDB
from medcat.cat import CAT
from medcat.utils.vocab import Vocab
from medcat.cdb_maker import CDBMaker
from medcat.config import Config
import os
import pandas as pd

In [31]:
# Path to the de-identification terminology CSV
csv_path = './cui_040320201.csv'
# File name the built CDB is written to
output_cdb = '20210304_basic_deid_cdb_wtout_names.dat'

## Load / inspect data

In [37]:
# Read the terminology CSV at csv_path into a DataFrame
csv = pd.read_csv(filepath_or_buffer=csv_path)

In [40]:
# Recode name_status values equal to "PN" as "P"
csv['name_status'] = csv['name_status'].replace(to_replace="PN", value="P")

In [41]:
# Display the terminology frame for inspection
csv

Unnamed: 0,name,cui,ontologies,name_status,type_ids,description
0,de-identification_root_concept,R0000,cat_anon,P,,root concept of de-identification
1,name,N1000,cat_anon,P,,surname and forename
2,contact_details,C2000,cat_anon,P,,non hospital identification and contact details
3,healthcare_identifier,H3000,cat_anon,P,,hospital derived ID
4,date,D4000,cat_anon,P,,personal dates
5,fore_name,N1100,cat_anon,P,,"given name including middle names (each name, ..."
6,surname,N1200,cat_anon,P,,all surnames
7,address,C2100,cat_anon,P,,address and postcode (including a comma or ful...
8,address_line,C2110,cat_anon,P,,all address line items including city and country
9,postcode,C2120,cat_anon,P,,all postcodes


In [15]:
# Rename the source column headers (str, onto, tty, tui, desc) to the
# column names used elsewhere in this notebook.
csv.rename(
    columns=dict(
        str="name",
        cui="cui",
        onto="ontologies",
        tty="name_status",
        tui="type_ids",
        desc="description",
    ),
    inplace=True,
)

In [16]:
# Drop the 'sty' column. errors='ignore' keeps this cell re-runnable: once the
# cleaned frame has been written back over the source CSV the column no longer
# exists, and a plain drop would raise KeyError on a fresh run.
csv.drop(columns=['sty'], errors='ignore', inplace=True)

In [21]:
# Write the cleaned frame back over the source file, without the index column.
# Uses csv_path instead of repeating the path literal so the two cannot drift apart.
csv.to_csv(csv_path, index=False)

In [None]:
# Discard rows whose 'name' value is missing
csv.dropna(subset=['name'], axis='index', inplace=True)

In [None]:
# Show the rows whose 'name' value is missing
csv.loc[csv.name.isna()]

In [42]:
# Export a copy of the frame to the current user's Downloads folder, without
# the index column. The home directory is resolved at run time rather than
# hard-coding one user's absolute path, so the cell works on other machines.
csv.to_csv(
    os.path.join(os.path.expanduser("~"), "Downloads", "full_cui_medcat040221.csv"),
    index=False,
)

## Create CDB

In [43]:
# Instantiate the MedCAT Config with no arguments; nothing is overridden here
config = Config()

In [44]:
# Create the CDB builder, passing it `config`
maker = CDBMaker(config)  

In [57]:
# Build the concept database from the CSV at csv_path.
# NOTE(review): full_build=True presumably makes the maker also store the
# optional/additional concept info — confirm against the MedCAT docs.
cdb = maker.prepare_csvs([csv_path], full_build=True)

Started importing concepts from: ./cui_040320201.csv
Current progress: 0% at 0.000s per 0 rows
Current progress: 5% at 0.003s per 0 rows
Current progress: 9% at 0.017s per 0 rows
Current progress: 14% at 0.016s per 0 rows
Current progress: 18% at 0.010s per 0 rows
Current progress: 23% at 0.021s per 0 rows
Current progress: 27% at 0.036s per 0 rows
Current progress: 32% at 0.026s per 0 rows
Current progress: 36% at 0.009s per 0 rows
Current progress: 41% at 0.005s per 0 rows
Current progress: 45% at 0.005s per 0 rows
Current progress: 50% at 0.014s per 0 rows
Current progress: 55% at 0.003s per 0 rows
Current progress: 59% at 0.004s per 0 rows
Current progress: 64% at 0.025s per 0 rows
Current progress: 68% at 0.005s per 0 rows
Current progress: 73% at 0.005s per 0 rows
Current progress: 77% at 0.005s per 0 rows
Current progress: 82% at 0.005s per 0 rows
Current progress: 86% at 0.006s per 0 rows
Current progress: 91% at 0.005s per 0 rows
Current progress: 95% at 0.004s per 0 rows


In [49]:
# Remove all synonyms of concepts (only in rare use cases):
# resets the 'cui2original_names' entry of cdb.addl_info to an empty dict.
cdb.addl_info['cui2original_names'] = {}

In [58]:
# Inspect the CDB's mapping from names to lists of CUIs
cdb.name2cuis

{'de~identification~root~concept': ['R0000'],
 'name': ['N1000'],
 'contact~detail': ['C2000'],
 'contact~details': ['C2000'],
 'healthcare~identifier': ['H3000'],
 'date': ['D4000'],
 'fore~name': ['N1100'],
 'surname': ['N1200'],
 'address': ['C2100'],
 'address~line': ['C2110'],
 'postcode': ['C2120'],
 'telephone~numb': ['C2200'],
 'telephone~number': ['C2200'],
 'email': ['C2300'],
 'identification': ['C2400'],
 'passport~numb': ['C2410'],
 'passport~number': ['C2410'],
 'drive~licence~numb': ['C2420'],
 'driving~licence~number': ['C2420'],
 'national~insurance': ['C2430'],
 'nhs~numb': ['H3100'],
 'nhs~number': ['H3100'],
 'hospital~numb': ['H3200'],
 'hospital~number': ['H3200'],
 'emergency~department~numb': ['H3300'],
 'emergency~department~number': ['H3300'],
 'lab~numb': ['H3400'],
 'lab~number': ['H3400'],
 'date~of~birth': ['H4100'],
 'url': ['C2500']}

In [59]:
# Display the terminology frame
csv

Unnamed: 0,name,cui,ontologies,name_status,type_ids,description
0,de-identification_root_concept,R0000,cat_anon,P,,root concept of de-identification
1,name,N1000,cat_anon,P,,surname and forename
2,contact_details,C2000,cat_anon,P,,non hospital identification and contact details
3,healthcare_identifier,H3000,cat_anon,P,,hospital derived ID
4,date,D4000,cat_anon,P,,personal dates
5,fore_name,N1100,cat_anon,P,,"given name including middle names (each name, ..."
6,surname,N1200,cat_anon,P,,all surnames
7,address,C2100,cat_anon,P,,address and postcode (including a comma or ful...
8,address_line,C2110,cat_anon,P,,all address line items including city and country
9,postcode,C2120,cat_anon,P,,all postcodes


In [47]:
# Save the CDB to the file named by output_cdb
cdb.save(output_cdb)

## Load CDB and inspect

In [11]:
# Load the CDB back from output_cdb into a separate variable
cdb2 = CDB.load(output_cdb)

In [12]:
# Display the loaded object (the output is only its default repr)
cdb2

<medcat.cdb.CDB at 0x7fd82e5ae4f0>