# Notebook: Create OPA2VEC associations file - Phenotypes
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Get phenotypes associated with diseases and chemicals. Create association file between entity and phenotypes, for use in opa2vec. Also output finalclasses.lst, a file that tells opa2vec which entities you would like the vectors for

In [2]:
import pandas as pd
import numpy as np
import subprocess
import pickle
import ast

##### Create CTD to CID map for all chemicals... 
will take ages to run, so might as well create full map instead of running each time I need it

In [None]:
# # Add PubChem CIDs to chems -- ONLY have to run this if pkl object not already made, takes a while you see
# df = pd.read_csv('allchemsthathavegenesID.txt', names=['ID'])

# # Export CTD IDs for conversion
# np.savetxt(r'CTDids.csv', df_chems.ID.unique(), fmt='%s')

# Run a pre-made script to convert them via an API
subprocess.check_output('python3 CTDtoCID.py allchemsthathavegenesID.txt', shell=True)
# This creates map as pickle object --> a python dict that you can load like so:
# # Load the dictionary object output by the above-run script
# def load_obj(name):
#     with open(name + '.pkl', 'rb') as f:
#         return pickle.load(f)

# ctd_cid_map = load_obj('ctd_cid_map')

### 1. Read in Disease and Chemical IDs
And map in the relevant ID format (Pubchem CID for chems, Disease ontology ID for Diseases)

In [132]:
df = pd.read_csv('entities.lst', names=['ID'])

In [133]:
df.head()

Unnamed: 0,Unnamed: 1,ID
ID,CID,DOID
MESH:D012559,,DOID:5419
MESH:D009404,,DOID:2590
MESH:D001749,,DOID:11054
MESH:D011471,,DOID:10283


In [134]:
# Create separate dfs of dis-vecs and chem-vecs ( in order to generate additional rows for df1)
dis = df.ID.map(lambda x: ('MESH' in x) | ('OMIM' in x))
chems = df.ID.map(lambda x: ('MESH' not in x) & ('OMIM' not in x))

df_chems = df[chems]
df_dis = df[dis]
df_chems = df_chems.reset_index(drop=True)
df_dis = df_dis.reset_index(drop=True)

TypeError: argument of type 'float' is not iterable

In [None]:
df_chems.head()

In [None]:
## Add PubChem CIDs to chems -- ONLY have to run this if pkl object not already made, takes a while you see
# Export CTD IDs for conversion
# np.savetxt(r'CTDids.csv', df_chems.ID.unique(), fmt='%s')

# # Run a pre-made script to convert them via an API
# subprocess.check_output('python3 CTDtoCID.py CTDids.csv', shell=True)

In [None]:
# Load the dictionary object output by the above-run script
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

ctd_cid_map = load_obj('ctd_cid_map')

In [None]:
# Use the created map to add the CID IDs to the df
df_chems['CID'] = df_chems.ID.map(lambda x: ctd_cid_map[x] if x in ctd_cid_map else np.nan)

# Clean
df_chems['CID'] = df_chems.CID.map(lambda x: str(x))
df_chems['CID'] = df_chems.CID.map(lambda x: x.replace("'", "").replace("b", ""))
df_chems.columns = ['CTDid', 'CID']

In [None]:
## Add Disease Ontology ID to Diseases
# Import Mona's MESH to DOID dict
mesh_doid = {}
with open("../../multi-drug-embedding/data/mesh2doid.dict") as f:
    for line in f:
       mesh_doid = ast.literal_eval(line)

omim_doid = {}
with open("../../multi-drug-embedding/data/omim2doid.dict") as f:
    for line in f:
       omim_doid = ast.literal_eval(line)

In [None]:
def dis_doid (dis):
    if 'MESH' in dis:
        dis = dis.replace('MESH:', '')
        try: return mesh_doid[dis]
        except: return np.nan
    elif 'OMIM' in dis:
        dis = dis.replace('OMIM:', '')
        try: return omim_doid[dis]
        except: return np.nan
    else:
        print('big whoopsies')

df_dis['doid'] = df_dis.ID.map(lambda x: dis_doid(x))

In [None]:
df_dis.head()

In [None]:
df_dis.doid.isna().sum()

In [None]:
df_dis.shape

In [None]:
'D015419' in mesh_doid

In [None]:
mesh_doid

In [None]:
## Attempt2

In [None]:
# Download a doid-mesh file offered by human disease ontology
url = "https://raw.githubusercontent.com/obophenotype/human-disease-ontology/master/src/experimental/align-doid-to-mesh.tsv"
!wget -O doid_mesh.tsv $url

In [None]:
doid_mesh = pd.read_csv('doid_mesh.tsv', sep='\t', names=['DOID', 'doid-dis', 'MESH', 'mesh-dis', 'bool', 'nans']
                       , nrows=6768)

In [None]:
doid_mesh.head()

In [None]:
# doid_mesh[['MESH', 'DOID']].to_dict()
# doid_mesh['MESH'] = doid_mesh.MESH.map(lambda x: x.strip('MESH:'))
# doid_mesh['DOID'] = doid_mesh.DOID.map(lambda x: x.strip('DOID:'))

mesh_to_doid = doid_mesh.set_index('MESH').DOID.to_dict()
# mesh_to_doid = {v: k for k, v in mesh_to_doid.items()}

In [None]:
df_dis.doid.isna().sum()

In [None]:
len(df_dis.doid)

In [None]:
len(mesh_to_doid)

In [None]:
mesh_to_doid

In [None]:
df_dis['DisOntID'] = df_dis.ID.map(lambda x: mesh_to_doid.get(x, np.nan))

In [None]:
df_dis.sample(13)

In [None]:
# df_dis.doid.isna().sum()  # Mona's map
df_dis.DisOntID.isna().sum() # Map I created

### Based on above I will proceed with my map disease to phen
Export the new CID and DOID DFs

In [None]:
df.head()

In [None]:
# Use the created map to add the CID IDs to the df
df['CID'] = df.ID.map(lambda x: ctd_cid_map[x] if x in ctd_cid_map else np.nan)

# Clean
df['CID'] = df.CID.map(lambda x: str(x))
df['CID'] = df.CID.map(lambda x: x.replace("'", "").replace("b", ""))
df.columns = ['ID', 'CID']

In [None]:
# Add Disease IDs
df['DOID'] = df.ID.map(lambda x: mesh_to_doid.get(x, np.nan))

In [None]:
df.sample(13)

In [None]:
df.to_csv('entities.lst', index=False)

### Below is a graveyard of code that was a start on parsing Disease ontology (human) from scratch for optimal maps

In [None]:
# def doider (id):
#     if id in mesh_to_doid:
#         return mesh_to_doid[id]
#     else:
#         return np.nan

# df_dis['doid'] = df_dis.ID.map(lambda x: doider(x))

In [None]:
# mesh_to_doid = {v: k for k, v in mesh_to_doid.items()}

In [None]:
# mesh_to_doid

In [None]:
# Let's try to manually parse the obo for human disease ontology, then MPO separately

In [None]:
# Download the obo 
# !wget -O ../../ontologies/humanDO.obo https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/master/src/ontology/HumanDO.obo

In [None]:
# with open('../../ontologies/humanDO.obo', ) as filer:
#     full_file_string = ''
#     for line in filer:
#         full_file_string += line

In [None]:
# full_file_list = full_file_string.split('[Term]')

In [None]:
# full_file_list[3:6]

In [None]:
# # Regx
# import re
# # (DOID|MESH|OMIM|UMLS)[^\\ ]*

In [None]:
# full_file_list[5]

In [None]:
# re.findall('(?:DOID|MESH|OMIM|UMLS_CUI)[.:0-9A-Za-z]*', full_file_list[5])

In [None]:
# m.group(1)

In [None]:
# processed_hdo = []
# for item in full_file_list:
#     processed_hdo.append(re.findall('(?:DOID|MESH|OMIM|UMLS_CUI)[.:0-9A-Za-z]*', item))

In [None]:
# processed_hdo[1]

In [None]:
# regex = re.compile(r'OMIM|MESH')
# filtered = [processed_hdo[1][0]] + [i for i in processed_hdo[1][1:] if regex.search(i)]

In [None]:
# filtered

In [None]:
# regex = re.compile(r'OMIM|MESH')
# doid_mesh_omim = []
# for lister in processed_hdo:
#         if len(lister) < 2:
#             doid_mesh_omim.append(np.nan)
#         else:
#             doid_mesh_omim.append([i for i in lister[1:] if regex.search(i)])
    

In [None]:
# doid_mesh_omim

In [None]:
# pd.DataFrame(processed_hdo)