# Notebook: Create OPA2VEC associations file - Phenotypes
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Get phenotypes associated with diseases and chemicals. Create association file between entity and phenotypes, for use in opa2vec. Also output finalclasses.lst, a file that tells opa2vec which entities you would like the vectors for

In [16]:
import pandas as pd
import numpy as np
import subprocess
import pickle
import ast


### 1. Read in Disease and Chemical IDs
And map in the relevant ID format (Pubchem CID for chems, Disease ontology ID for Diseases)

In [8]:
df = pd.read_csv('entities.lst', names=['ID'])

In [9]:
df.head()

Unnamed: 0,ID
0,MESH:D012559
1,MESH:D009404
2,MESH:D001749
3,MESH:D011471
4,MESH:D008106


In [10]:
# Create separate dfs of dis-vecs and chem-vecs ( in order to generate additional rows for df1)
dis = df.ID.map(lambda x: ('MESH' in x) | ('OMIM' in x))
chems = df.ID.map(lambda x: ('MESH' not in x) & ('OMIM' not in x))

df_chems = df[chems]
df_dis = df[dis]
df_chems = df_chems.reset_index(drop=True)
df_dis = df_dis.reset_index(drop=True)

In [13]:
df_chems.head()

Unnamed: 0,ID
0,D014635
1,D014800
2,C016837
3,C104536
4,C088658


In [14]:
## Add PubChem CIDs to chems
# Export CTD IDs for conversion
np.savetxt(r'CTDids.csv', df_chems.ID.unique(), fmt='%s')

# Run a pre-made script to convert them via an API
subprocess.check_output('python3 CTDtoCID.py CTDids.csv', shell=True)

b"Exception caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>\nException caught:  <class 'Exception'>

In [17]:
# Load the dictionary object output by the above-run script
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

ctd_cid_map = load_obj('ctd_cid_map')

In [19]:
# Use the created map to add the CID IDs to the df
df_chems['CID'] = df_chems.ID.map(lambda x: ctd_cid_map[x] if x in ctd_cid_map else np.nan)

# Clean
df_chems['CID'] = df_chems.CID.map(lambda x: str(x))
df_chems['CID'] = df_chems.CID.map(lambda x: x.replace("'", "").replace("b", ""))
df_chems.columns = ['CTDid', 'CID']

In [34]:
## Add Disease Ontology ID to Diseases
# Import Mona's MESH to DOID dict
mesh_doid = {}
with open("../../multi-drug-embedding/data/mesh2doid.dict") as f:
    for line in f:
       mesh_doid = ast.literal_eval(line)

omim_doid = {}
with open("../../multi-drug-embedding/data/omim2doid.dict") as f:
    for line in f:
       omim_doid = ast.literal_eval(line)

In [43]:
def dis_doid (dis):
    if 'MESH' in dis:
        dis = dis.replace('MESH:', '')
        return mesh_doid[dis]
    elif 'OMIM' in dis:
        dis = dis.replace('OMIM:', '')
        return omim_doid[dis]
    else:
        print('big whoopsies')

df_dis['doid'] = df_dis.ID.map(lambda x: dis_doid(x))

KeyError: 'D012559'

In [44]:
df_dis.head()

Unnamed: 0,ID
0,MESH:D012559
1,MESH:D009404
2,MESH:D001749
3,MESH:D011471
4,MESH:D008106


In [48]:
'D015419' in mesh_doid

True

In [47]:
mesh_doid

{'D015419': 'DOID_2476',
 'D010930': 'DOID_3482',
 'D015140': 'DOID_8725',
 'D015417': 'DOID_2477',
 'D010390': 'DOID_11656',
 'D017099': 'DOID_14176',
 'D000308': 'DOID_3947',
 'D001787': 'DOID_4176',
 'D009771': 'DOID_10933',
 'C563491': 'DOID_0060380',
 'D000307': 'DOID_9553',
 'C563256': 'DOID_0060448',
 'C562385': 'DOID_11252',
 'C562386': 'DOID_10825',
 'C537914': 'DOID_0060387',
 'D007835': 'DOID_9537',
 'C567232': 'DOID_0060397',
 'D010392': 'DOID_9182',
 'D007787': 'DOID_10604',
 'D000542': 'DOID_841',
 'D000236': 'DOID_657',
 'D003229': 'DOID_4251',
 'D005879': 'DOID_11119',
 'C536201': 'DOID_0050802',
 'D000544': 'DOID_10652',
 'D008554': 'DOID_5052',
 'D008557': 'DOID_4253',
 'D008556': 'DOID_1761',
 'D000238': 'DOID_3828',
 'D005870': 'DOID_200',
 'D005873': 'DOID_10718',
 'C536194': 'DOID_14720',
 'C536196': 'DOID_14757',
 'D000783': 'DOID_178',
 'D011469': 'DOID_47',
 'D053358': 'DOID_14793',
 'C536198': 'DOID_14775',
 'D000789': 'DOID_8805',
 'D002422': 'DOID_3222',
 'D