# Notebook: Create OPA2VEC associations file - Phenotypes
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Get the phenotypes associated with diseases and chemicals, and create an association file between each entity and its phenotypes for use in opa2vec. Also output finalclasses.lst, a file that tells opa2vec which entities you want vectors for. <br>
(i) creates map of Disease IDs to DOIDS

In [1]:
import pandas as pd
import numpy as np
import subprocess
import pickle
import ast
import re

##### Create CTD to CID map for all chemicals
This takes a long time to run, so it is better to build the full map once and reuse it than to regenerate it each time it is needed.

In [2]:
# # Add PubChem CIDs to chems -- ONLY have to run this if pkl object not already made, takes a while you see
# df = pd.read_csv('allchemsthathavegenesID.txt', names=['ID'])

# # Export CTD IDs for conversion
# np.savetxt(r'CTDids.csv', df.ID.unique(), fmt='%s')

# Run a pre-made script to convert them via an API --> Creates CTD:CID map as ctd_cid_map.pkl
# subprocess.check_output('python3 CTDtoCID.py allchemsthathavegenesID.txt', shell=True)


In [3]:
# This creates map as pickle object --> a python dict that you can load like so:
# # Load the dictionary object output by the above-run script
# def load_obj(name):
#     with open(name + '.pkl', 'rb') as f:
#         return pickle.load(f)

# ctd_cid_map = load_obj('ctd_cid_map')

### 1. Read in Disease and Chemical IDs
Then map each to the relevant ID format (PubChem CID for chemicals, Disease Ontology ID for diseases).

In [4]:
# Load every CTD disease ID that has at least one positive gene association.
# The file is read as headerless and its column is labelled 'MESH'.
diseases = pd.read_csv(
    '../ctd-to-nt/all-diseases-w-genes-ctd.txt',
    header=None,
    names=['MESH'],
)

In [5]:
# Load CTD's table of disease identifiers, skipping the first 27 lines of the
# file (presumably a comment preamble ahead of the header row -- confirm).
ctd_dis = (
    pd.read_csv('../ctd-to-nt/csvs/CTD_diseases.csv', skiprows=27)
    # Cast to str so that later substring and regex tests work on every row.
    .assign(AltDiseaseIDs=lambda frame: frame.AltDiseaseIDs.astype(str))
    # Discard the row labelled 0 (presumably a non-data line -- confirm).
    .drop(0)
)

In [6]:
# Peek at three randomly chosen rows to sanity-check the parsed columns
ctd_dis.sample(3)

Unnamed: 0,# DiseaseName,DiseaseID,AltDiseaseIDs,Definition,ParentIDs,TreeNumbers,ParentTreeNumbers,Synonyms,SlimMappings
1688,Calcific Aortic Disease with Immunologic Abnor...,MESH:C566182,,,MESH:D001018|MESH:D002114,C14.907.109/C566182|C18.452.174.130/C566182,C14.907.109|C18.452.174.130,,Cardiovascular disease|Metabolic disease
10973,"Sarcoidosis, Early-Onset",MESH:C563714,,,MESH:D012507,C15.604.515.827/C563714,C15.604.515.827,,Lymphatic disease
12938,X-Linked Chondrodysplasia Punctata 1,MESH:C580533,,,MESH:D002806|MESH:D040181,C05.116.099.708.195/C580533|C16.320.322/C580533,C05.116.099.708.195|C16.320.322,Arylsulfatase E Deficiency|Cdpx1|Chondrodyspla...,Genetic disease (inborn)|Musculoskeletal disease


In [108]:
# Isolate only rows with DOID.
# .copy() makes the result an independent frame, so the AltDiseaseIDs column
# assignment later in this cell does not raise SettingWithCopyWarning.
ctd_dis = ctd_dis[ctd_dis.AltDiseaseIDs.map(lambda x: 'DOID' in x)].copy()

# Regex that extracts a DOID. '+' (rather than '*') requires at least one
# digit, so a bare 'DOID:' prefix is never mistaken for an identifier.
r = re.compile(r'DOID:[0-9]+')


def get_doid(x, r):
    """Return the last DOID found in ``x``, or None if there is none.

    Args:
        x: String of alternative disease IDs to search.
        r: Compiled pattern whose matches are the candidate DOIDs.

    Returns:
        The final match of ``r`` in ``x``. None when ``r`` does not match,
        rather than raising IndexError on the empty match list.
    """
    matches = r.findall(x)
    return matches[-1] if matches else None

# Replace each alternative-ID string with the DOID that get_doid picks from it
ctd_dis['AltDiseaseIDs'] = ctd_dis['AltDiseaseIDs'].apply(get_doid, args=(r,))

In [109]:
# Build the lookup from CTD DiseaseID to the DOID extracted for that row
ctd_dis2doid = {
    disease_id: doid
    for disease_id, doid in zip(ctd_dis['DiseaseID'], ctd_dis['AltDiseaseIDs'])
}

In [110]:
# Peek at six randomly chosen disease IDs
diseases.sample(6)

Unnamed: 0,MESH
2885,OMIM:615473
3893,OMIM:613672
4847,OMIM:616402
5783,OMIM:616469
3309,MESH:D016883
1348,MESH:C565833


In [111]:
# # Download a doid-mesh file offered by human disease ontology in order to make a second map from this source
# url = "https://raw.githubusercontent.com/obophenotype/human-disease-ontology/master/src/experimental/align-doid-to-mesh.tsv"
# !wget -O doid_mesh.tsv $url

In [112]:
# Load the DOID/MeSH alignment table; only the first 6768 rows are read.
doid_mesh = pd.read_csv(
    'doid_mesh.tsv',
    sep='\t',
    names=['DOID', 'doid-dis', 'MESH', 'mesh-dis', 'bool', 'nans'],
    nrows=6768,
)
# Second lookup: keyed by the table's MESH column, yielding the DOID
mesh_to_doid = doid_mesh.set_index('MESH')['DOID'].to_dict()

In [113]:
# Look each disease ID up in both maps; dict.get yields None for IDs a map lacks.
# Note I also tried the maps mona gave me but they don't seem to add anything beyond these two
diseases['DOID'] = diseases['MESH'].map(ctd_dis2doid.get)
diseases['DOID2'] = diseases['MESH'].map(mesh_to_doid.get)

In [114]:
# Report how many distinct DOIDs each of the two maps produced
for doid_column in ('DOID', 'DOID2'):
    print(diseases[doid_column].nunique())

2166
1586


In [115]:
# Merge the two columns of DOIDs, keeping DOID where present and falling back to DOID2.
# Turn None into NaN so that every missing value uses the same marker.
diseases['DOID'] = diseases['DOID'].map(lambda x: np.nan if x is None else x)
# Assign the filled column back instead of calling fillna(inplace=True) on
# diseases.DOID: that chained in-place call acts on a temporary Series, which
# pandas warns about and, with Copy-on-Write enabled, leaves `diseases` unchanged.
diseases['DOID'] = diseases['DOID'].fillna(diseases['DOID2'])
del diseases['DOID2']

In [116]:
# Export this as a csv to be imported into opa-nn
# index=False leaves the row index out of the file, so only the frame's columns are written
diseases.to_csv('ctd_dis_to_doid.csv', index=False)