# Notebook: Create OPA2Vec vectors for diseases using the Disease Ontology
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Embed every disease in CTD in the context of the Disease Ontology

In [10]:
# Libraries
import pandas as pd
import numpy as np
import subprocess

In [19]:
### 1. Import all diseases that I want to embed
# Embed every disease in CTD once and keep the resulting file, rather than
# regenerating vectors each time the set of diseases in use changes.
# The input file has no header row: each line is a single disease ID.
diseases = pd.read_csv(
    '../ctd-to-nt/all-diseases-w-genes-ctd.txt',
    header=None,
    names=['ID'],
)

In [20]:
# Preview the first five disease IDs
diseases.head(n=5)

Unnamed: 0,ID
0,MESH:D003920
1,MESH:D003924
2,MESH:D008113
3,MESH:D009369
4,MESH:D009765


In [14]:
### 2. Map Disease IDs to Disease Ontology IDs (DOIDs)
# 'entities.lst' is a comma-separated table whose first row holds the column
# names; the columns used below are ID and DOID.
mapper = pd.read_csv('entities.lst', header=0)

In [15]:
# Preview the first five mapping rows (some rows have no DOID)
mapper.head(n=5)

Unnamed: 0,ID,CID,DOID
0,MESH:D012559,,DOID:5419
1,MESH:D009404,,DOID:2590
2,MESH:D001749,,DOID:11054
3,MESH:D011471,,DOID:10283
4,MESH:D008106,,


In [16]:
# Lookup table from disease ID to DOID; if an ID occurs more than once in
# mapper, the last occurrence wins.
id_to_doid = {
    disease_id: doid
    for disease_id, doid in zip(mapper['ID'], mapper['DOID'])
}

In [21]:
# Apply the mapping to create the DOID column; IDs that are not keys of
# id_to_doid end up as NaN.
diseases['DOID'] = diseases['ID'].map(id_to_doid)

In [22]:
# Check the first five rows now that DOID has been added
diseases.head(n=5)

Unnamed: 0,ID,DOID
0,MESH:D003920,DOID:9351
1,MESH:D003924,DOID:9352
2,MESH:D008113,DOID:916
3,MESH:D009369,DOID:162
4,MESH:D009765,DOID:9970


In [23]:
### 3. Create Associations file
# Turn each DOID (e.g. DOID:9351) into its angle-bracketed OBO PURL
# (<http://purl.obolibrary.org/obo/DOID_9351>). Rows whose DOID is NaN
# stay NaN in obo_url.
diseases['obo_url'] = (
    '<' + 'http://purl.obolibrary.org/obo/'
    + diseases['DOID'].str.replace(':', '_')
    + '>'
)

In [24]:
# Check the first five rows now that obo_url has been added
diseases.head(n=5)

Unnamed: 0,ID,DOID,obo_url
0,MESH:D003920,DOID:9351,<http://purl.obolibrary.org/obo/DOID_9351>
1,MESH:D003924,DOID:9352,<http://purl.obolibrary.org/obo/DOID_9352>
2,MESH:D008113,DOID:916,<http://purl.obolibrary.org/obo/DOID_916>
3,MESH:D009369,DOID:162,<http://purl.obolibrary.org/obo/DOID_162>
4,MESH:D009765,DOID:9970,<http://purl.obolibrary.org/obo/DOID_9970>


In [28]:
# Output an association file: one "<disease ID> <ontology class URL>" pair per line.
# Diseases with no DOID mapping have a NaN obo_url, which np.savetxt with
# fmt='%s' would write as the literal text 'nan' (a bogus association).
# Drop those rows so that every written association points at a real class.
diseases_mapped = diseases.dropna(subset=['obo_url'])
np.savetxt(r'associations_doid.txt', diseases_mapped[['ID', 'obo_url']].values, fmt='%s')

# And entities file: only the diseases that actually have an association,
# so no vector is requested for an entity that is absent from the associations.
np.savetxt(r'entities_doid.lst', diseases_mapped['ID'].unique(), fmt='%s')

In [29]:
### 4. Run OPA2Vec

In [30]:
# Run OPA2Vec from inside its own directory.
# The command is passed as an argument list with cwd= instead of a shell string
# using "cd ... ;": if the directory is missing this raises immediately rather
# than running the script from the wrong location, and no shell quoting is involved.
# All file arguments are relative to the OPA2Vec directory.
# check_output raises CalledProcessError on a non-zero exit status and returns
# the script's stdout as bytes, which is displayed as the cell output.
subprocess.check_output(
    [
        'python2', 'runOPA2Vec.py',
        '-ontology', '../ontologies/humanDO.obo',
        '-associations', '../msc-thesis/opa/associations_doid.txt',
        '-entities', '../msc-thesis/opa/entities_doid.lst',
        '-outfile', '../msc-thesis/opa/do-vecs.lst',
    ],
    cwd='../../opa2vec/',
)

b'Loading of Axioms ...\nLoading ...\n    1%\n    2%\n    3%\n    5%\n    7%\n    10%\n    13%\n    17%\n    19%\n    23%\n    26%\n    30%\n    35%\n    41%\n    46%\n    51%\n    57%\n    63%\n    70%\n    76%\n    82%\n    90%\n    95%\n    ... finished\n    ... finished\nProperty Saturation Initialization ...\n    ... finished\nReflexive Property Computation ...\n    ... finished\nObject Property Hierarchy and Composition Computation ...\n    ... finished\nContext Initialization ...\n    ... finished\nConsistency Checking ...\n    100%\n    ... finished\nClass Taxonomy Computation ...\n    16%\n    50%\n    93%\n    ... finished\n***********OPA2Vec Running ...***********\n\n***********Ontology Processing ...***********\n\n***********Ontology Processing Complete ...***********\n\n***********Metadata Extraction ...***********\n\n***********Metadata Extraction Complete ...***********\n\n***********Propagate Associations through hierarchy ...***********\n\n***********Association propag