# Notebook: Create OPA2VEC associations file - Phenotypes
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Get phenotypes associated with diseases and chemicals. Create association file between entity and phenotypes, for use in opa2vec. Also output finalclasses.lst, a file that tells opa2vec which entities you would like the vectors for <br>
(i) Creates a map from CTD disease IDs to Disease Ontology IDs (DOIDs)

In [53]:
import pandas as pd
import numpy as np
import subprocess
import pickle
import ast
import re

##### Create CTD to CID map for all chemicals... 
This takes a very long time to run, so it is better to build the full map once and reuse it than to regenerate it every time it is needed.

In [54]:
# # Add PubChem CIDs to chems -- ONLY have to run this if pkl object not already made, takes a while you see
# df = pd.read_csv('allchemsthathavegenesID.txt', names=['ID'])

# # Export CTD IDs for conversion
# np.savetxt(r'CTDids.csv', df_chems.ID.unique(), fmt='%s')

# Run a pre-made script to convert them via an API --> Creates CTD:CID map as ctd_cid_map.pkl
# subprocess.check_output('python3 CTDtoCID.py allchemsthathavegenesID.txt', shell=True)


In [55]:
# This creates map as pickle object --> a python dict that you can load like so:
# # Load the dictionary object output by the above-run script
# def load_obj(name):
#     with open(name + '.pkl', 'rb') as f:
#         return pickle.load(f)

# ctd_cid_map = load_obj('ctd_cid_map')

### 1. Read in Disease and Chemical IDs
Then map each ID to the relevant format (PubChem CID for chemicals, Disease Ontology ID for diseases).

In [56]:
# Load every CTD disease that has at least one positive gene association.
# No header row is expected: the first line is read as data and the single
# column is named 'MESH'.
diseases = pd.read_csv(
    '../ctd-to-nt/all-diseases-w-genes-ctd.txt',
    names=['MESH'],
)

In [57]:
# Load CTD's disease vocabulary, which lists alternative identifiers for each DiseaseID.
# skiprows=27 skips the leading lines of the file so the column-name line becomes the header.
ctd_dis = pd.read_csv('../ctd-to-nt/csvs/CTD_diseases.csv', skiprows=27)

# Cast AltDiseaseIDs to str so that missing values become the string 'nan' and
# substring tests can be run on every row, then drop the row labelled 0
# (NOTE(review): presumably a leftover comment/separator line after the header — confirm).
ctd_dis = ctd_dis.assign(
    AltDiseaseIDs=ctd_dis['AltDiseaseIDs'].astype(str),
).drop(index=0)

In [58]:
# Display three random rows of ctd_dis as a quick visual check of the loaded columns
ctd_dis.sample(3)

Unnamed: 0,# DiseaseName,DiseaseID,AltDiseaseIDs,Definition,ParentIDs,TreeNumbers,ParentTreeNumbers,Synonyms,SlimMappings
9430,"Pancreatic cancer, adult",MESH:C535836,,,MESH:D002277|MESH:D010190,C04.557.470.200/C535836|C04.588.274.761/C53583...,C04.557.470.200|C04.588.274.761|C04.588.322.47...,,Cancer|Digestive system disease|Endocrine syst...
6210,"Ichthyosis, X-Linked, without Steroid Sulfatas...",MESH:C564729,,,MESH:D016114,C16.131.831.512.420/C564729|C16.320.322.241/C5...,C16.131.831.512.420|C16.320.322.241|C16.320.56...,,Congenital abnormality|Genetic disease (inborn...
11517,"Spinal Muscular Atrophy, Distal, Autosomal Rec...",MESH:C564626,OMIM:607088,,MESH:D009134,C10.228.854.468/C564626|C10.574.562.500/C56462...,C10.228.854.468|C10.574.562.500|C10.668.467.500,DHMN3|DHMN4|DSMA3|HMN3|HMN4|HMN III|HMN IV|Neu...,Nervous system disease


In [59]:
# Isolate only rows with DOID: keep the rows whose AltDiseaseIDs string mentions 'DOID'.
# .copy() makes the result an independent frame, so the later assignment to its
# AltDiseaseIDs column does not raise pandas' SettingWithCopyWarning on a filtered slice.
has_doid = ctd_dis['AltDiseaseIDs'].str.contains('DOID', regex=False)
ctd_dis = ctd_dis[has_doid].copy()

# Regex matching a Disease Ontology identifier: 'DOID:' followed by zero or more digits
r = re.compile(r'DOID:[0-9]*')


def get_doid(x, r):
    """Return the last match of the compiled regex ``r`` found in the string ``x``.

    x: string to search (here, a row's alternative disease IDs).
    r: compiled regular expression used to locate the DOIDs.
    Raises IndexError when ``r`` matches nothing in ``x``.
    """
    found = r.findall(x)
    # When several DOIDs are present, the last one listed is used
    return found[-1]

# Replace each AltDiseaseIDs string with just the DOID extracted from it
ctd_dis['AltDiseaseIDs'] = ctd_dis['AltDiseaseIDs'].apply(get_doid, args=(r,))

In [60]:
# Lookup from CTD DiseaseID to DOID; if a DiseaseID occurs more than once, the last row wins
ctd_dis2doid = ctd_dis.set_index('DiseaseID')['AltDiseaseIDs'].to_dict()

In [98]:
# Display three random rows of diseases.
# NOTE: the output recorded below already contains a DOID column, i.e. this cell
# (execution count 98) was run after the DOID mapping cells further down.
diseases.sample(3)

Unnamed: 0,MESH,DOID
878,MESH:C536436,DOID:1925
1331,OMIM:616907,DOID:0110821
2426,MESH:C537104,DOID:0060232


In [86]:
# # Download a doid-mesh file offered by human disease ontology in order to make a second map from this source
# url = "https://raw.githubusercontent.com/obophenotype/human-disease-ontology/master/src/experimental/align-doid-to-mesh.tsv"
# !wget -O doid_mesh.tsv $url

In [85]:
# Load the DOID/MESH alignment table (doid_mesh.tsv) as a second source of mappings.
# The TSV is read without a header row, and only its first 6768 rows are read
# (NOTE(review): presumably rows beyond that are not usable alignments — confirm).
doid_mesh = pd.read_csv(
    'doid_mesh.tsv',
    sep='\t',
    names=['DOID', 'doid-dis', 'MESH', 'mesh-dis', 'bool', 'nans'],
    nrows=6768,
)

# Second MESH -> DOID lookup; if a MESH value occurs more than once, the last row wins
mesh_to_doid = doid_mesh.set_index('MESH')['DOID'].to_dict()

In [87]:
# Look up each disease ID in both maps; IDs missing from a map yield None.
# DOID comes from the CTD vocabulary, DOID2 from the DOID/MESH alignment file.
diseases['DOID'] = diseases['MESH'].map(ctd_dis2doid.get)
diseases['DOID2'] = diseases['MESH'].map(mesh_to_doid.get)
# Note I also tried the maps mona gave me but they don't seem to add anything beyond these two

In [88]:
# Compare the two maps: print the number of distinct DOIDs each one produced
for doid_column in ('DOID', 'DOID2'):
    print(diseases[doid_column].nunique())

2166
1586


In [97]:
# Merge the two columns of DOIDs: keep the CTD-derived DOID where present and
# fall back to DOID2 otherwise.
# Normalise missing look-ups (None) to NaN first.
diseases['DOID'] = diseases['DOID'].map(lambda x: np.nan if x is None else x)
# Assign the result back instead of calling fillna(inplace=True) on the column
# accessor: that chained in-place form is deprecated and does not update the
# DataFrame when pandas Copy-on-Write is enabled.
diseases['DOID'] = diseases['DOID'].fillna(diseases['DOID2'])
# DOID2 has been folded into DOID and is no longer needed
del diseases['DOID2']

In [None]:
# Export this as a csv to be imported into opa-nn
# NOTE(review): `df` is only assigned in commented-out code in the visible part of this
# notebook, so this cell raises NameError unless `df` is defined elsewhere — confirm
# whether `diseases` (the table with the MESH and DOID columns) is what should be
# exported here.
df.to_csv('chem_dis_to_CID_DOID.csv', index=False)

In [None]:
# full_file_list = full_file_string.split('[Term]')

In [None]:
# full_file_list[3:6]

In [None]:
# # Regx
# import re
# # (DOID|MESH|OMIM|UMLS)[^\\ ]*

In [None]:
# full_file_list[5]

In [None]:
# re.findall('(?:DOID|MESH|OMIM|UMLS_CUI)[.:0-9A-Za-z]*', full_file_list[5])

In [None]:
# m.group(1)

In [None]:
# processed_hdo = []
# for item in full_file_list:
#     processed_hdo.append(re.findall('(?:DOID|MESH|OMIM|UMLS_CUI)[.:0-9A-Za-z]*', item))

In [None]:
# processed_hdo[1]

In [None]:
# regex = re.compile(r'OMIM|MESH')
# filtered = [processed_hdo[1][0]] + [i for i in processed_hdo[1][1:] if regex.search(i)]

In [None]:
# filtered

In [None]:
# regex = re.compile(r'OMIM|MESH')
# doid_mesh_omim = []
# for lister in processed_hdo:
#         if len(lister) < 2:
#             doid_mesh_omim.append(np.nan)
#         else:
#             doid_mesh_omim.append([i for i in lister[1:] if regex.search(i)])
    

In [None]:
# doid_mesh_omim

In [None]:
# pd.DataFrame(processed_hdo)