## UMLS-Graph Extract Scripts
#### Assumes accessible Oracle database of UMLS Metathesauras and Semantic Network
#### Generates a set of CSV files for ingest into neo4j

In [1]:
import sys
import numpy as np
import pandas as pd
import cx_Oracle
import sqlalchemy

#### Establish a connection - hide password in same directory as this notebook (e.g. in ~/work/Jupyter/)
conn_string.txt file contains one line like: oracle+cx_oracle://user:pass@server-address:port/database

In [19]:
_ = open('conn_string.txt', 'r'); conn_string = _.read().replace('\n',''); _.close()
engine = sqlalchemy.create_engine(conn_string, arraysize=100000, max_identifier_length=128)

#### Set the UMLSversion (in Pitt/Neptune implementation this is the Oracle Schema/User)


In [4]:
UMLSversion = 'UMLS2020AB'

### TUIs.csv

In [5]:
query = "SELECT DISTINCT UI, STY_RL, STN_RTN, DEF FROM {0}.SRDEF WHERE RT = 'STY'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =['TUI:ID', 'name', 'STN', 'DEF']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/TUIs.csv', header=True, index=False)
df.tail()

Unnamed: 0,TUI:ID,name,STN,DEF
122,T072,Physical Object,A1,An object perceptible to the sense of vision o...
123,T114,"Nucleic Acid, Nucleoside, or Nucleotide",A1.4.1.2.1.5,A complex compound of high molecular weight oc...
124,T171,Language,A2.5,The system of communication used by a particul...
125,T195,Antibiotic,A1.4.1.1.1.1,A pharmacologically active compound produced b...
126,T201,Clinical Attribute,A2.3.1,An observable or measurable property or state ...


### TUIrel.csv

In [6]:
query = "WITH Semantics as (SELECT DISTINCT UI from {0}.SRDEF WHERE RT = 'STY') SELECT DISTINCT UI3, UI1 FROM {0}.SRSTRE1 INNER JOIN Semantics ON {0}.SRSTRE1.UI1 = Semantics.UI WHERE UI2 = 'T186'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =[':END_ID', ':START_ID']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/TUIrel.csv', header=True, index=False)
df.tail()

Unnamed: 0,:END_ID,:START_ID
459,T051,T191
460,T071,T194
461,T121,T195
462,T032,T201
463,T071,T204


### CUIs.csv

In [7]:
query = "SELECT DISTINCT CUI from {0}.MRCONSO where {0}.MRCONSO.ISPREF = 'Y' AND {0}.MRCONSO.STT = 'PF' AND {0}.MRCONSO.TS = 'P' and {0}.MRCONSO.LAT = 'ENG'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =['CUI:ID']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/CUIs.csv', header=True, index=False)
df.tail()

Unnamed: 0,CUI:ID
4362832,C5399565
4362833,C5399589
4362834,C5399671
4362835,C5399708
4362836,C5399731


### CUI-TUIs.csv

In [8]:
query = "SELECT DISTINCT CUI, TUI FROM {0}.MRSTY".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =[':START_ID',':END_ID']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/CUI-TUIs.csv', header=True, index=False)
df.tail()

Unnamed: 0,:START_ID,:END_ID
4681732,C5399609,T074
4681733,C5399617,T074
4681734,C5399694,T170
4681735,C5399712,T121
4681736,C5399737,T121


### CUI-CUIs.csv

In [9]:
query = "WITH SABlist as (SELECT DISTINCT SAB from {0}.MRCONSO where {0}.MRCONSO.LAT = 'ENG') SELECT DISTINCT CUI2, CUI1, NVL(RELA, REL), {0}.MRREL.SAB from {0}.MRREL inner join SABlist on {0}.MRREL.SAB = SABlist.SAB where {0}.MRREL.SUPPRESS <> 'O' and CUI1 <> CUI2 and REL <> 'SIB'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =[':START_ID',':END_ID',':TYPE','SAB']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/CUI-CUIs.csv', header=True, index=False)
df.tail()

Unnamed: 0,:START_ID,:END_ID,:TYPE,SAB
24274665,C3643466,C5399718,CHD,MED-RT
24274666,C2955435,C5399721,CHD,MED-RT
24274667,C3205593,C5399721,CHD,MED-RT
24274668,C3500481,C5399727,CHD,MED-RT
24274669,C3536831,C5399733,CHD,MED-RT


### CODEs.csv

In [16]:
query = "With CUIlist as (SELECT DISTINCT CUI from {0}.MRCONSO where {0}.MRCONSO.ISPREF = 'Y' AND {0}.MRCONSO.STT = 'PF' AND {0}.MRCONSO.TS = 'P' and {0}.MRCONSO.LAT = 'ENG') SELECT DISTINCT ({0}.MRCONSO.SAB||' '||{0}.MRCONSO.CODE), {0}.MRCONSO.SAB, {0}.MRCONSO.CODE from {0}.MRCONSO inner join CUIlist on {0}.MRCONSO.CUI = CUIlist.CUI where {0}.MRCONSO.LAT = 'ENG' and SUPPRESS <> 'O'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =['CodeID:ID','SAB','CODE']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/CODEs.csv', header=True, index=False)
df.tail()

Unnamed: 0,CodeID:ID,SAB,CODE
5035842,MTHSPL 78562-013,MTHSPL,78562-013
5035843,MTHSPL 80208-111,MTHSPL,80208-111
5035844,VANDF 4039244,VANDF,4039244
5035845,MED-RT N0000193903,MED-RT,N0000193903
5035846,MED-RT N0000193884,MED-RT,N0000193884


### CUI-CODEs.csv

In [15]:
query = "SELECT DISTINCT CUI, (SAB||' '||CODE) FROM {0}.MRCONSO WHERE LAT = 'ENG' AND SUPPRESS <> 'O'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =[':START_ID',':END_ID']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/CUI-CODEs.csv', header=True, index=False)
df.tail()

Unnamed: 0,:START_ID,:END_ID
5589045,C5399385,RXNORM 2387354
5589046,C5399429,RXNORM 2390649
5589047,C5399557,RXNORM 2390643
5589048,C5399696,MTH NOCODE
5589049,C5399706,MTH NOCODE


### SUIs.csv

In [17]:
query = "SELECT DISTINCT {0}.MRCONSO.SUI, {0}.MRCONSO.STR FROM {0}.MRCONSO WHERE {0}.MRCONSO.LAT = 'ENG'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =['SUI:ID','name']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/SUIs.csv', header=True, index=False)
df.tail()

Unnamed: 0,SUI:ID,name
9249162,S20180044,"Lotion, Multi Ingredient Topical application L..."
9249163,S20177672,ALCOHOL 63 kg in 100 kg TOPICAL GEL [Triton Ha...
9249164,S20184221,vancomycin 1.75 GM in 350 ML Injection
9249165,S20180056,MALUS DOMESTICA LEAF
9249166,S20181395,acetaminophen 325 MG / phenyltoloxamine citrat...


### CODE-SUIs.csv

In [20]:
query = "SELECT DISTINCT SUI, (SAB||' '||CODE), TTY, CUI FROM {0}.MRCONSO WHERE LAT = 'ENG' AND SUPPRESS <> 'O'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =[':END_ID',':START_ID',':TYPE','CUI']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/CODE-SUIs.csv', header=True, index=False)
df.tail()

Unnamed: 0,:END_ID,:START_ID,:TYPE,CUI
9672895,S20184101,RXNORM 2390940,SY,C5399379
9672896,S20178868,VANDF 4039290,CD,C5399590
9672897,S20180934,VANDF 4039516,CD,C5399648
9672898,S20180974,VANDF 4039569,AB,C5399655
9672899,S20184636,MED-RT N0000193909,FN,C5399709


### CUI-SUIs.csv

In [21]:
query = "SELECT DISTINCT CUI, SUI FROM {0}.MRCONSO WHERE {0}.MRCONSO.ISPREF = 'Y' AND {0}.MRCONSO.STT = 'PF' AND {0}.MRCONSO.TS = 'P' and {0}.MRCONSO.LAT = 'ENG'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =[':START_ID',':END_ID']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/CUI-SUIs.csv', header=True, index=False)
df.tail()

Unnamed: 0,:START_ID,:END_ID
4362832,C5399631,S20180336
4362833,C5399667,S20181268
4362834,C5399673,S11152989
4362835,C5399678,S20184507
4362836,C5399680,S20184458


### DEFs.csv

In [22]:
query = "With CUIlist as (SELECT DISTINCT CUI from {0}.MRCONSO where {0}.MRCONSO.ISPREF = 'Y' AND {0}.MRCONSO.STT = 'PF' AND {0}.MRCONSO.TS = 'P' and {0}.MRCONSO.LAT = 'ENG') SELECT DISTINCT {0}.MRDEF.ATUI, {0}.MRDEF.SAB, {0}.MRDEF.DEF FROM {0}.MRDEF inner join CUIlist on {0}.MRDEF.CUI = CUIlist.CUI WHERE SUPPRESS <> 'O' AND NOT ({0}.MRDEF.SAB LIKE 'MSH%' AND {0}.MRDEF.SAB <> 'MSH') AND NOT ({0}.MRDEF.SAB LIKE 'MDR%' AND {0}.MRDEF.SAB <> 'MDR')".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =['ATUI:ID','SAB','DEF']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/DEFs.csv', header=True, index=False)
df.tail()

Unnamed: 0,ATUI:ID,SAB,DEF
183355,AT100259128,MSH,A therapeutic method involving the use of sand...
183356,AT198129498,NCI,A human IgG2 monoclonal antibody directed agai...
183357,AT198076749,NCI,Serine/threonine-protein kinase/endoribonuclea...
183358,AT234297435,NCI,BMP/retinoic acid-inducible neural-specific pr...
183359,AT210368697,NCI,"Albumin protein in urine samples, which is a m..."


### DEFrel.csv

In [23]:
query = "SELECT DISTINCT ATUI, CUI FROM {0}.MRDEF WHERE SUPPRESS <> 'O'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =[':END_ID',':START_ID']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/DEFrel.csv', header=True, index=False)
df.tail()

Unnamed: 0,:END_ID,:START_ID
230890,AT198095991,C2361850
230891,AT210368697,C2362049
230892,AT205727209,C2362313
230893,AT205727210,C2362313
230894,AT230075797,C2362314


### NDCs.csv

In [24]:
query = "SELECT DISTINCT ATUI, ATV FROM {0}.MRSAT WHERE SAB = 'RXNORM' and ATN = 'NDC' and SUPPRESS <> 'O'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =['ATUI:ID','NDC']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/NDCs.csv', header=True, index=False)
df.tail()

Unnamed: 0,ATUI:ID,NDC
239398,AT260482524,71308002101
239399,AT260512670,71308002110
239400,AT260467440,71308030101
239401,AT260482525,71308030110
239402,AT260437071,63824011766


### NDCrel.csv

In [25]:
query = "SELECT DISTINCT ATUI, (SAB||' '||CODE) FROM {0}.MRSAT WHERE SAB = 'RXNORM' and ATN = 'NDC' and SUPPRESS <> 'O'".format(UMLSversion)
df = pd.read_sql_query(query, engine)
df.columns =[':END_ID',':START_ID']
df.to_csv(path_or_buf='UMLS-Graph-Extracts/NDCrel.csv', header=True, index=False)
df.tail()

Unnamed: 0,:END_ID,:START_ID
239398,AT260482524,RXNORM 2392243
239399,AT260512670,RXNORM 2392243
239400,AT260467440,RXNORM 2392245
239401,AT260482525,RXNORM 2392245
239402,AT260437071,RXNORM 2390630
