In [1]:
import duckdb
import yaml
import umls_api
from tqdm import tqdm
import pandas as pd



In [2]:
# Load notebook settings from config.yaml; later cells read the UMLS token and
# the OMOP Postgres credentials from PARAM.
# A broken config is fatal: raise immediately instead of printing the error and
# continuing with PARAM undefined (or left over from an earlier kernel run).
with open("config.yaml", "r") as stream:
    try:
        PARAM = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        raise RuntimeError("config.yaml could not be parsed as YAML") from exc

# safe_load returns None for an empty file and a scalar/list for non-mapping
# documents; both would fail confusingly on the key lookups below.
if not isinstance(PARAM, dict):
    raise RuntimeError("config.yaml must contain a top-level key/value mapping")

umls_token = PARAM["umls_token"]

In [3]:
# In-memory DuckDB session shared by every query in this notebook.
conn = duckdb.connect(database=":memory:")

# Optional extension setup / version check, currently disabled:
# conn.sql("install duckpgq from community;")
# conn.sql("load duckpgq;")
# print(duckdb.__version__)

In [4]:
# conn.sql("install ducklake;")
# conn.sql("load ducklake;")

In [5]:
# Install, then load, DuckDB's postgres extension (needed for the ATTACH below).
for extension_stmt in ("install postgres;", "LOAD postgres;"):
    conn.sql(extension_stmt)

In [6]:
def _libpq_quote(value):
    """Return ``value`` as a single-quoted libpq connection-string value.

    Backslashes and single quotes are backslash-escaped so that credentials
    containing spaces, quotes or backslashes are parsed as one value.
    """
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# Connection string for the OMOP Postgres instance; user and password come
# from config.yaml and are quoted so special characters cannot break it.
pg_conninfo = " ".join([
    "host=192.168.1.29",
    "port=5432",
    f"user={_libpq_quote(PARAM['omop_postgres_user'])}",
    f"password={_libpq_quote(PARAM['omop_postgres_password'])}",
    "dbname=postgres",
])

# The connection string is embedded in a SQL string literal, so every single
# quote inside it has to be doubled.
pg_conninfo_sql = pg_conninfo.replace("'", "''")

# Expose the Postgres database to DuckDB under the alias broadsea_db.
conn.sql(f"ATTACH '{pg_conninfo_sql}' AS broadsea_db (TYPE POSTGRES);")

In [7]:


# Distinct condition concepts referenced by condition_era whose vocabulary is
# SNOMED, together with their concept_code (aliased snomed_id) and name.
SNOMED_CONDITIONS_SQL = """
SELECT DISTINCT 
    ce.condition_concept_id, 
    c.concept_code AS snomed_id,
    c.concept_name
FROM broadsea_db.demo_cdm.condition_era ce
JOIN broadsea_db.demo_cdm.concept c 
  ON ce.condition_concept_id = c.concept_id
WHERE c.vocabulary_id = 'SNOMED'
"""

snomed_relation = conn.sql(SNOMED_CONDITIONS_SQL)
snomed_mapping_df = snomed_relation.df()

print(snomed_mapping_df.head())

   condition_concept_id  snomed_id                          concept_name
0               4155034  283371005                 Laceration of forearm
1                261325   87433001                   Pulmonary emphysema
2               4166224   47693006                   Rupture of appendix
3              40479768  444470001  Injury of anterior cruciate ligament
4              40481087  444814009                       Viral sinusitis


In [8]:
# Number of distinct SNOMED condition concepts to look up.
snomed_mapping_df.shape[0]

71

In [9]:
# Try the UMLS lookup on one SNOMED CT code before running it over every row.
probe_snomed_code, probe_snomed_vocab = "128613002", "SNOMEDCT_US"
umls_api.get_cui_from_subcategory(probe_snomed_code, probe_snomed_vocab, umls_token)

{'cui': 'C0014544', 'name': 'Seizure disorder'}

In [10]:
# Columns of the mapping rows built below, in the order they are exported.
# Passing them to the DataFrame constructor keeps the schema fixed even when no
# lookup succeeds (an empty list alone would give a frame with no columns).
mapping_columns = [
    'source_code',
    'source_concept_id',
    'source_vocabulary_id',
    'source_code_description',
    'target_concept_id',
    'target_vocabulary_id',
    'valid_start_date',
    'valid_end_date',
    'invalid_reason',
]

mapping_records = []

# One UMLS lookup per SNOMED condition concept; rows without a CUI are skipped.
for _, row in tqdm(snomed_mapping_df.iterrows(), total=snomed_mapping_df.shape[0]):
    snomed_id = str(row['snomed_id'])
    concept_id = row['condition_concept_id']

    res = umls_api.get_cui_from_subcategory(snomed_id, "SNOMEDCT_US", umls_token)

    if res and res.get('cui'):
        # The UMLS CUI is the source code; the OMOP concept is the target.
        mapping_records.append({
            'source_code': res['cui'],
            'source_concept_id': 0,
            'source_vocabulary_id': 'UMLS',
            'source_code_description': res['name'],
            'target_concept_id': concept_id,
            'target_vocabulary_id': 'SNOMED',
            'valid_start_date': '1970-01-01',
            'valid_end_date': '2099-12-31',
            'invalid_reason': None
        })

# Collect the rows into a DataFrame with a fixed column order for export.
upload_df = pd.DataFrame(mapping_records, columns=mapping_columns)

100%|██████████| 71/71 [00:58<00:00,  1.22it/s]


In [11]:
# Preview the first five mapping rows before exporting them.
upload_df.head(n=5)

Unnamed: 0,source_code,source_concept_id,source_vocabulary_id,source_code_description,target_concept_id,target_vocabulary_id,valid_start_date,valid_end_date,invalid_reason
0,C0561248,0,UMLS,Laceration of forearm,4155034,SNOMED,1970-01-01,2099-12-31,
1,C0034067,0,UMLS,Pulmonary emphysema,261325,SNOMED,1970-01-01,2099-12-31,
2,C0267628,0,UMLS,Rupture of appendix,4166224,SNOMED,1970-01-01,2099-12-31,
3,C1456574,0,UMLS,Injury of anterior cruciate ligament,40479768,SNOMED,1970-01-01,2099-12-31,
4,C0748731,0,UMLS,Viral sinusitis,40481087,SNOMED,1970-01-01,2099-12-31,


In [12]:
# Write the condition mappings as a tab-separated file with a header row and
# without the DataFrame index.
condition_tsv_path = "condition_era_umls.tsv"
upload_df.to_csv(condition_tsv_path, sep="\t", index=False)

In [13]:
# Append the condition mappings from the TSV to source_to_concept_map.
# Columns are listed explicitly on both sides so the insert does not depend on
# the TSV column order matching the table's physical column order.
# NOTE: this is a plain append; re-running the cell inserts the same rows again
# unless the target table enforces uniqueness.
conn.sql("""
INSERT INTO broadsea_db.demo_cdm.source_to_concept_map (
    source_code,
    source_concept_id,
    source_vocabulary_id,
    source_code_description,
    target_concept_id,
    target_vocabulary_id,
    valid_start_date,
    valid_end_date,
    invalid_reason
)
SELECT
    source_code,
    source_concept_id,
    source_vocabulary_id,
    source_code_description,
    target_concept_id,
    target_vocabulary_id,
    valid_start_date,
    valid_end_date,
    invalid_reason
FROM read_csv('condition_era_umls.tsv', delim='\t', header=true)
""")

In [14]:


# Distinct drug concepts referenced by drug_era whose vocabulary is RxNorm,
# together with their concept_code (aliased rxnorm_id) and name.
RXNORM_DRUGS_SQL = """
SELECT DISTINCT 
    de.drug_concept_id, 
    c.concept_code AS rxnorm_id,
    c.concept_name
FROM broadsea_db.demo_cdm.drug_era de
JOIN broadsea_db.demo_cdm.concept c 
  ON de.drug_concept_id = c.concept_id
WHERE c.vocabulary_id = 'RxNorm'
"""

rxnorm_relation = conn.sql(RXNORM_DRUGS_SQL)
rxnorm_mapping_df = rxnorm_relation.df()

print(rxnorm_mapping_df.head())

   drug_concept_id rxnorm_id concept_name
0          1539403     36567  Simvastatin
1          1551099      8640   Prednisone
2           738818      3642   Doxylamine
3          1137529     36117   salmeterol
4           757627      4637  Galantamine


In [15]:
# Try the UMLS lookup on one RxNorm code before running it over every row.
probe_rxnorm_code, probe_rxnorm_vocab = "32968", "RXNORM"
umls_api.get_cui_from_subcategory(probe_rxnorm_code, probe_rxnorm_vocab, umls_token)

{'cui': 'C0070166', 'name': 'clopidogrel'}

In [16]:
# Columns of the mapping rows built below, in the order they are exported.
# Passing them to the DataFrame constructor keeps the schema fixed even when no
# lookup succeeds (an empty list alone would give a frame with no columns).
mapping_columns = [
    'source_code',
    'source_concept_id',
    'source_vocabulary_id',
    'source_code_description',
    'target_concept_id',
    'target_vocabulary_id',
    'valid_start_date',
    'valid_end_date',
    'invalid_reason',
]

mapping_records = []

# One UMLS lookup per RxNorm drug concept; rows without a CUI are skipped.
for _, row in tqdm(rxnorm_mapping_df.iterrows(), total=rxnorm_mapping_df.shape[0]):
    rxnorm_id = str(row['rxnorm_id'])
    concept_id = row['drug_concept_id']

    # Resolve the RxNorm code to a UMLS CUI via the project's umls_api helper.
    res = umls_api.get_cui_from_subcategory(rxnorm_id, "RXNORM", umls_token)

    if res and res.get('cui'):
        # The UMLS CUI is the source code; the OMOP concept is the target.
        mapping_records.append({
            'source_code': res['cui'],
            'source_concept_id': 0,
            'source_vocabulary_id': 'UMLS',
            'source_code_description': res['name'],
            'target_concept_id': concept_id,
            'target_vocabulary_id': 'RxNorm',
            'valid_start_date': '1970-01-01',
            'valid_end_date': '2099-12-31',
            'invalid_reason': None
        })

# Collect the rows into a DataFrame with a fixed column order for export.
upload_df = pd.DataFrame(mapping_records, columns=mapping_columns)

100%|██████████| 83/83 [00:58<00:00,  1.41it/s]


In [17]:
# Write the drug mappings as a tab-separated file with a header row and without
# the DataFrame index.
upload_df.to_csv("drug_era_umls.tsv", sep="\t", index=False)

# Append the drug mappings from the TSV to source_to_concept_map.
# Columns are listed explicitly on both sides so the insert does not depend on
# the TSV column order matching the table's physical column order.
# NOTE: this is a plain append; re-running the cell inserts the same rows again
# unless the target table enforces uniqueness.
conn.sql("""
INSERT INTO broadsea_db.demo_cdm.source_to_concept_map (
    source_code,
    source_concept_id,
    source_vocabulary_id,
    source_code_description,
    target_concept_id,
    target_vocabulary_id,
    valid_start_date,
    valid_end_date,
    invalid_reason
)
SELECT
    source_code,
    source_concept_id,
    source_vocabulary_id,
    source_code_description,
    target_concept_id,
    target_vocabulary_id,
    valid_start_date,
    valid_end_date,
    invalid_reason
FROM read_csv('drug_era_umls.tsv', delim='\t', header=true)
""")