# Notebook: Create OPA2VEC associations file - Phenotypes
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Get phenotypes associated with diseases and chemicals. Create association file between entity and phenotypes, for use in opa2vec. Also output finalclasses.lst, a file that tells opa2vec which entities you would like the vectors for <br>
(i) creates map of Disease IDs to DOIDS

In [53]:
import pandas as pd
import numpy as np
import subprocess
import pickle
import ast
import re

##### Create CTD to CID map for all chemicals
The conversion takes a long time to run, so build the full map once and reuse it instead of regenerating it every time it is needed.

In [54]:
# # Add PubChem CIDs to chems -- ONLY have to run this if pkl object not already made, takes a while you see
# df = pd.read_csv('allchemsthathavegenesID.txt', names=['ID'])

# # Export CTD IDs for conversion
# np.savetxt(r'CTDids.csv', df_chems.ID.unique(), fmt='%s')

# Run a pre-made script to convert them via an API --> Creates CTD:CID map as ctd_cid_map.pkl
# subprocess.check_output('python3 CTDtoCID.py allchemsthathavegenesID.txt', shell=True)


In [55]:
# This creates map as pickle object --> a python dict that you can load like so:
# # Load the dictionary object output by the above-run script
# def load_obj(name):
#     with open(name + '.pkl', 'rb') as f:
#         return pickle.load(f)

# ctd_cid_map = load_obj('ctd_cid_map')

### 1. Read in Disease and Chemical IDs
And map in the relevant ID format (Pubchem CID for chems, Disease ontology ID for Diseases)

In [56]:
# Read in all of the CTD diseases that have positive gene association(s)
diseases = pd.read_csv('../ctd-to-nt/all-diseases-w-genes-ctd.txt', names=['MESH'])

In [57]:
# Read in the ctd map of various disease identifiers
# skiprows=27 skips the first 27 lines of the file, so the line after them is used as the column header
# (presumably those lines are CTD's comment preamble -- confirm against the file)
ctd_dis = pd.read_csv('../ctd-to-nt/csvs/CTD_diseases.csv', skiprows=27)
# Cast to str so that the substring test ('DOID' in x) applied later works on every row;
# empty cells (NaN) become the string 'nan'
ctd_dis['AltDiseaseIDs'] = ctd_dis.AltDiseaseIDs.astype(str)
# Drop the row labelled 0 (presumably a '#' separator line that follows the header -- confirm)
ctd_dis = ctd_dis.drop(0)

In [58]:
ctd_dis.sample(3)

Unnamed: 0,# DiseaseName,DiseaseID,AltDiseaseIDs,Definition,ParentIDs,TreeNumbers,ParentTreeNumbers,Synonyms,SlimMappings
9430,"Pancreatic cancer, adult",MESH:C535836,,,MESH:D002277|MESH:D010190,C04.557.470.200/C535836|C04.588.274.761/C53583...,C04.557.470.200|C04.588.274.761|C04.588.322.47...,,Cancer|Digestive system disease|Endocrine syst...
6210,"Ichthyosis, X-Linked, without Steroid Sulfatas...",MESH:C564729,,,MESH:D016114,C16.131.831.512.420/C564729|C16.320.322.241/C5...,C16.131.831.512.420|C16.320.322.241|C16.320.56...,,Congenital abnormality|Genetic disease (inborn...
11517,"Spinal Muscular Atrophy, Distal, Autosomal Rec...",MESH:C564626,OMIM:607088,,MESH:D009134,C10.228.854.468/C564626|C10.574.562.500/C56462...,C10.228.854.468|C10.574.562.500|C10.668.467.500,DHMN3|DHMN4|DSMA3|HMN3|HMN4|HMN III|HMN IV|Neu...,Nervous system disease


In [59]:
# Isolate only rows with DOID
# .copy() makes the filtered frame independent of the original, so the column
# assignment performed on it afterwards does not trigger pandas' SettingWithCopyWarning
ctd_dis = ctd_dis[ctd_dis.AltDiseaseIDs.map(lambda x: 'DOID' in x)].copy()

# Pattern for a Disease Ontology ID: the literal 'DOID:' followed by any run of digits
r = re.compile('DOID:[0-9]*')


def get_doid(x, r):
    """Return the last match of the compiled pattern ``r`` found in the string ``x``.

    Raises IndexError when ``x`` contains no match at all.
    """
    matches = r.findall(x)
    return matches[-1]

# Replace each AltDiseaseIDs string with the single DOID that get_doid picks out of it
ctd_dis['AltDiseaseIDs'] = ctd_dis['AltDiseaseIDs'].map(lambda alt_ids: get_doid(alt_ids, r))

In [60]:
# map ctd's mesh to doid
# Keys are the DiseaseID values, values are the DOIDs now held in AltDiseaseIDs;
# if a DiseaseID occurs on more than one row, the last row's DOID is the one kept
ctd_dis2doid = dict(zip(ctd_dis.DiseaseID, ctd_dis.AltDiseaseIDs))

In [98]:
diseases.sample(3)

Unnamed: 0,MESH,DOID
878,MESH:C536436,DOID:1925
1331,OMIM:616907,DOID:0110821
2426,MESH:C537104,DOID:0060232


In [86]:
# # Download a doid-mesh file offered by human disease ontology in order to make a second map from this source
# url = "https://raw.githubusercontent.com/obophenotype/human-disease-ontology/master/src/experimental/align-doid-to-mesh.tsv"
# !wget -O doid_mesh.tsv $url

In [85]:
# Load the DOID/MeSH alignment table (tab-separated). Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
# nrows=6768 reads only the first 6768 rows -- NOTE(review): hard-coded; confirm it still matches the file
doid_mesh = pd.read_csv('doid_mesh.tsv', sep='\t', names=['DOID', 'doid-dis', 'MESH', 'mesh-dis', 'bool', 'nans']
                       , nrows=6768)
# Build the MESH -> DOID lookup. A MESH ID can appear on several rows of the table;
# the dict keeps only the DOID from the last such row.
mesh_to_doid = doid_mesh.set_index('MESH').DOID.to_dict()

In [87]:
# Look every disease ID up in each of the two maps; dict.get returns None for IDs a map lacks.
# (The maps supplied by Mona were tried as well, but they add nothing beyond these two.)
diseases['DOID'] = diseases['MESH'].map(ctd_dis2doid.get)
diseases['DOID2'] = diseases['MESH'].map(mesh_to_doid.get)

In [88]:
print(diseases.DOID.nunique())
print(diseases.DOID2.nunique())

2166
1586


In [97]:
# Merge the two columns of DOIDs: keep the CTD-derived DOID and fall back to DOID2 where it is missing
diseases['DOID'] = diseases['DOID'].map(lambda x: np.nan if x is None else x)
# Assign the filled column back instead of calling fillna(inplace=True) on the attribute-accessed
# column: that chained in-place call operates on a temporary object under pandas Copy-on-Write
# and would leave `diseases` unchanged
diseases['DOID'] = diseases['DOID'].fillna(diseases['DOID2'])
del diseases['DOID2']

In [None]:
# Export this as a csv to be imported into opa-nn
# NOTE(review): no live cell above this point assigns `df` (the DOID column was added to `diseases`);
# presumably `diseases` is the frame meant to be written here -- confirm before running
df.to_csv('chem_dis_to_CID_DOID.csv', index=False)

In [None]:
## Everything below here is junked 

In [74]:
for a in f

SyntaxError: invalid syntax (<ipython-input-74-3adefc579fc1>, line 1)

In [7]:
# df = pd.read_csv('entities.lst') #, names=['ID']) # List of chems and diseases...should this incl vecs?

In [70]:
# df = pd.read_csv('../ctd-to-nt/all-ctd_chems_dis.txt', names=['ID']) # import all chems and dis in one col

In [71]:
# pd.read_csv('../ctd-to-nt/all-diseases-w-genes-ctd.txt')

In [72]:
# df.head()

In [73]:
# Split df into a disease frame and a chemical frame (used to generate additional rows for df1).
# An ID counts as a disease when it contains 'MESH' or 'OMIM'; every other ID is a chemical.
dis = df.ID.map(lambda x: 'MESH' in x or 'OMIM' in x)
chems = df.ID.map(lambda x: 'MESH' not in x and 'OMIM' not in x)

# Select each subset and renumber its rows from 0
df_chems = df[chems].reset_index(drop=True)
df_dis = df[dis].reset_index(drop=True)

In [28]:
df_chems.shape # 9554

(9554, 1)

In [29]:
df_dis.shape # 3191

(3191, 1)

In [30]:
df_dis.head()

Unnamed: 0,ID
0,MESH:D054198
1,MESH:D006948
2,MESH:D012640
3,MESH:D004827
4,MESH:D006331


In [14]:
## Add PubChem CIDs to chems -- ONLY have to run this if pkl object not already made, takes a while you see
# Export CTD IDs for conversion
# np.savetxt(r'CTDids.csv', df_chems.ID.unique(), fmt='%s')

# # Run a pre-made script to convert them via an API
# subprocess.check_output('python3 CTDtoCID.py CTDids.csv', shell=True)

In [31]:
# Load the dictionary object output by the above-run script
def load_obj(name):
    """Unpickle and return the object stored in the file ``name + '.pkl'``."""
    pkl_path = name + '.pkl'
    with open(pkl_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

ctd_cid_map = load_obj('ctd_cid_map')

In [32]:
len(ctd_cid_map)

409

In [33]:
# Attach the CID for each chemical ID; IDs that are not keys of ctd_cid_map get NaN
df_chems['CID'] = df_chems['ID'].map(lambda ctd_id: ctd_cid_map.get(ctd_id, np.nan))

# Clean: turn every value into text, then remove quote characters and every letter 'b'
df_chems['CID'] = df_chems['CID'].map(str)
df_chems['CID'] = df_chems['CID'].map(lambda text: text.replace("'", "").replace("b", ""))
df_chems.columns = ['CTDid', 'CID']

In [34]:
df_chems.head()

Unnamed: 0,CTDid,CID
0,C046983,
1,C112297,656732.0
2,C039775,114709.0
3,C425777,
4,C476756,


In [35]:
df_chems.CID.nunique() #165

165

In [36]:
## Add Disease Ontology ID to Diseases
# Import Mona's MESH to DOID dict
# Every line of the file is parsed as a Python literal and REPLACES mesh_doid, so only the
# value parsed from the last line is kept (presumably the file holds one dict on a single
# line -- confirm)
mesh_doid = {}
with open("../../multi-drug-embedding/data/mesh2doid.dict") as f:
    for line in f:
       mesh_doid = ast.literal_eval(line)

# Same procedure for the OMIM to DOID dict: the last line parsed wins
omim_doid = {}
with open("../../multi-drug-embedding/data/omim2doid.dict") as f:
    for line in f:
       omim_doid = ast.literal_eval(line)

In [37]:
def dis_doid(dis):
    """Translate a prefixed disease ID into a DOID via the module-level lookup dicts.

    Parameters:
        dis: disease identifier string containing 'MESH' or 'OMIM'
            (for example 'MESH:D012640').

    Returns:
        The value stored in ``mesh_doid`` (for MESH IDs) or ``omim_doid`` (for OMIM IDs)
        under the ID with its 'MESH:' / 'OMIM:' prefix removed. ``np.nan`` is returned
        when the ID is not a key of the dict, and also when the ID carries neither
        prefix, so every "no mapping" case yields the same missing-value marker.
    """
    if 'MESH' in dis:
        # Look up the bare ID; .get() only covers the "key absent" case, so unrelated
        # errors are no longer silently swallowed as they were by a bare except
        return mesh_doid.get(dis.replace('MESH:', ''), np.nan)
    if 'OMIM' in dis:
        return omim_doid.get(dis.replace('OMIM:', ''), np.nan)
    # Neither prefix present: report it and fall back to the missing-value marker
    print('big whoopsies')
    return np.nan

df_dis['doid'] = df_dis.ID.map(lambda x: dis_doid(x))

In [38]:
df_dis.head()

Unnamed: 0,ID,doid
0,MESH:D054198,
1,MESH:D006948,
2,MESH:D012640,DOID_11832
3,MESH:D004827,DOID_1826
4,MESH:D006331,DOID_114


In [40]:
df_dis.doid.nunique() # 1584 which is ~ half 

1584

In [15]:
df_dis.shape

(2567, 2)

In [16]:
'D015419' in mesh_doid

True

In [17]:
mesh_doid

{'D015419': 'DOID_2476',
 'D010930': 'DOID_3482',
 'D015140': 'DOID_8725',
 'D015417': 'DOID_2477',
 'D010390': 'DOID_11656',
 'D017099': 'DOID_14176',
 'D000308': 'DOID_3947',
 'D001787': 'DOID_4176',
 'D009771': 'DOID_10933',
 'C563491': 'DOID_0060380',
 'D000307': 'DOID_9553',
 'C563256': 'DOID_0060448',
 'C562385': 'DOID_11252',
 'C562386': 'DOID_10825',
 'C537914': 'DOID_0060387',
 'D007835': 'DOID_9537',
 'C567232': 'DOID_0060397',
 'D010392': 'DOID_9182',
 'D007787': 'DOID_10604',
 'D000542': 'DOID_841',
 'D000236': 'DOID_657',
 'D003229': 'DOID_4251',
 'D005879': 'DOID_11119',
 'C536201': 'DOID_0050802',
 'D000544': 'DOID_10652',
 'D008554': 'DOID_5052',
 'D008557': 'DOID_4253',
 'D008556': 'DOID_1761',
 'D000238': 'DOID_3828',
 'D005870': 'DOID_200',
 'D005873': 'DOID_10718',
 'C536194': 'DOID_14720',
 'C536196': 'DOID_14757',
 'D000783': 'DOID_178',
 'D011469': 'DOID_47',
 'D053358': 'DOID_14793',
 'C536198': 'DOID_14775',
 'D000789': 'DOID_8805',
 'D002422': 'DOID_3222',
 'D

In [None]:
## Attempt2

In [41]:
# Download a doid-mesh file offered by human disease ontology
url = "https://raw.githubusercontent.com/obophenotype/human-disease-ontology/master/src/experimental/align-doid-to-mesh.tsv"
!wget -O doid_mesh.tsv $url

--2019-02-13 10:29:23--  https://raw.githubusercontent.com/obophenotype/human-disease-ontology/master/src/experimental/align-doid-to-mesh.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.36.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.36.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1082256 (1.0M) [text/plain]
Saving to: ‘doid_mesh.tsv’


2019-02-13 10:29:23 (6.53 MB/s) - ‘doid_mesh.tsv’ saved [1082256/1082256]



In [76]:
doid_mesh = pd.read_csv('doid_mesh.tsv', sep='\t', names=['DOID', 'doid-dis', 'MESH', 'mesh-dis', 'bool', 'nans']
                       , nrows=6768)

In [77]:
doid_mesh.head()

Unnamed: 0,DOID,doid-dis,MESH,mesh-dis,bool,nans
0,DOID:0001816,angiosarcoma,MESH:D006394,Hemangiosarcoma,False,
1,DOID:0001816,angiosarcoma,MESH:D006394,Hemangiosarcoma,True,
2,DOID:0002116,pterygium,MESH:D011625,Pterygium,False,
3,DOID:0002116,pterygium,MESH:D011625,Pterygium,True,
4,DOID:0014667,disease of metabolism,MESH:D008659,Metabolic Diseases,False,


In [78]:
# doid_mesh[['MESH', 'DOID']].to_dict()
# doid_mesh['MESH'] = doid_mesh.MESH.map(lambda x: x.strip('MESH:'))
# doid_mesh['DOID'] = doid_mesh.DOID.map(lambda x: x.strip('DOID:'))

mesh_to_doid = doid_mesh.set_index('MESH').DOID.to_dict()
# mesh_to_doid = {v: k for k, v in mesh_to_doid.items()}

In [79]:
df_dis.doid.isna().sum()

NameError: name 'df_dis' is not defined

In [46]:
len(df_dis.doid)

3191

In [47]:
len(mesh_to_doid)

3153

In [48]:
mesh_to_doid

{'MESH:D006394': 'DOID:0001816',
 'MESH:D011625': 'DOID:10526',
 'MESH:D008659': 'DOID:0014667',
 'MESH:D065632': 'DOID:0050012',
 'MESH:D012373': 'DOID:0050052',
 'MESH:D004887': 'DOID:0050061',
 'MESH:C536166': 'DOID:0050083',
 'MESH:D003141': 'DOID:0050117',
 'MESH:D004670': 'DOID:0050118',
 'MESH:D051359': 'DOID:0050120',
 'MESH:C535601': 'DOID:0050125',
 'MESH:D019595': 'DOID:12206',
 'MESH:D012852': 'DOID:0050127',
 'MESH:D010854': 'DOID:13902',
 'MESH:D007619': 'DOID:9563',
 'MESH:D059249': 'DOID:0050147',
 'MESH:D011015': 'DOID:3240',
 'MESH:D012678': 'DOID:0050155',
 'MESH:D054990': 'DOID:0050156',
 'MESH:D018549': 'DOID:2798',
 'MESH:C562470': 'DOID:0050158',
 'MESH:C562489': 'DOID:0050159',
 'MESH:C571912': 'DOID:0050160',
 'MESH:C538275': 'DOID:0050167',
 'MESH:D016884': 'DOID:14040',
 'MESH:D008178': 'DOID:0050169',
 'MESH:D004675': 'DOID:0050175',
 'MESH:D004892': 'DOID:0050185',
 'MESH:D006478': 'DOID:0050195',
 'MESH:D015624': 'DOID:0050214',
 'MESH:C535275': 'DOID:0050

In [49]:
df_dis['DisOntID'] = df_dis.ID.map(lambda x: mesh_to_doid.get(x, np.nan))

In [50]:
df_dis.sample(13)

Unnamed: 0,ID,doid,DisOntID
2026,MESH:D010019,,DOID:1019
385,MESH:D017880,,
2991,MESH:C535935,,
297,MESH:D009021,DOID_2560,DOID:2560
2894,MESH:D019052,DOID_9478,DOID:9478
1163,MESH:D006938,DOID_13810,DOID:13810
278,MESH:D016471,DOID_5425,DOID:5425
957,MESH:D012872,DOID_8508,DOID:8508
803,MESH:D005157,,
1904,MESH:D014178,,


In [52]:
# df_dis.doid.isna().sum()  # Mona's map
df_dis.DisOntID.isna().sum() # Map I created

1570

In [None]:
## Okay so I've gotten DOID from two places and the latter one is better (DisOntID)

### Based on above I will proceed with my map disease to phen
Export the new CID and DOID DFs

In [53]:
df.head()

Unnamed: 0,ID
0,C046983
1,C112297
2,C039775
3,C425777
4,C476756


In [54]:
# Use the created map to add the CID IDs to the df
# IDs that are not keys of ctd_cid_map get NaN
df['CID'] = df.ID.map(lambda x: ctd_cid_map[x] if x in ctd_cid_map else np.nan)

# Clean
# str() turns every value into text, so missing values become the string 'nan'
df['CID'] = df.CID.map(lambda x: str(x))
# Remove quote characters and every letter 'b'
# (presumably remnants of a bytes repr such as b'123' -- confirm)
df['CID'] = df.CID.map(lambda x: x.replace("'", "").replace("b", ""))
# NOTE(review): df presumably already has exactly the columns ID and CID at this point,
# which would make this assignment a no-op -- confirm
df.columns = ['ID', 'CID']

In [55]:
# Add Disease IDs
df['DOID'] = df.ID.map(lambda x: mesh_to_doid.get(x, np.nan))

In [56]:
df.sample(13)

Unnamed: 0,ID,CID,DOID
8412,C067114,,
6820,D008771,,
7914,D011794,,
12741,MESH:D016097,,
4441,D005279,,
8595,C041711,,
5984,C527236,,
2725,D002220,,
9596,MESH:D006930,,
8976,C402769,,


In [57]:
df.to_csv('chem_dis_to_CID_DOID.csv', index=False)

In [58]:
df.CID.nunique()

165

### Below is a graveyard of code that was a start on parsing Disease ontology (human) from scratch for optimal maps

In [None]:
# def doider (id):
#     if id in mesh_to_doid:
#         return mesh_to_doid[id]
#     else:
#         return np.nan

# df_dis['doid'] = df_dis.ID.map(lambda x: doider(x))

In [None]:
# mesh_to_doid = {v: k for k, v in mesh_to_doid.items()}

In [None]:
# mesh_to_doid

In [None]:
# Let's try to manually parse the obo for human disease ontology, then MPO separately

In [None]:
# Download the obo 
# !wget -O ../../ontologies/humanDO.obo https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/master/src/ontology/HumanDO.obo

In [None]:
# with open('../../ontologies/humanDO.obo', ) as filer:
#     full_file_string = ''
#     for line in filer:
#         full_file_string += line

In [None]:
# full_file_list = full_file_string.split('[Term]')

In [None]:
# full_file_list[3:6]

In [None]:
# # Regx
# import re
# # (DOID|MESH|OMIM|UMLS)[^\\ ]*

In [None]:
# full_file_list[5]

In [None]:
# re.findall('(?:DOID|MESH|OMIM|UMLS_CUI)[.:0-9A-Za-z]*', full_file_list[5])

In [None]:
# m.group(1)

In [None]:
# processed_hdo = []
# for item in full_file_list:
#     processed_hdo.append(re.findall('(?:DOID|MESH|OMIM|UMLS_CUI)[.:0-9A-Za-z]*', item))

In [None]:
# processed_hdo[1]

In [None]:
# regex = re.compile(r'OMIM|MESH')
# filtered = [processed_hdo[1][0]] + [i for i in processed_hdo[1][1:] if regex.search(i)]

In [None]:
# filtered

In [None]:
# regex = re.compile(r'OMIM|MESH')
# doid_mesh_omim = []
# for lister in processed_hdo:
#         if len(lister) < 2:
#             doid_mesh_omim.append(np.nan)
#         else:
#             doid_mesh_omim.append([i for i in lister[1:] if regex.search(i)])
    

In [None]:
# doid_mesh_omim

In [None]:
# pd.DataFrame(processed_hdo)