# Notebook: Create OPA2VEC vectors for proteins on Protein Ontology
<b>Author</b>: Ian Coleman <br/>
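<b>Function</b>: For every chemical and disease in CTD, build the OPA2Vec input files (associations and entities) that link it to the Protein Ontology classes of the proteins encoded by its associated genes, then run OPA2Vec to embed the chemicals and diseases in the context of the Protein Ontology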

In [32]:
import pandas as pd
import numpy as np
import scipy as sp
import subprocess
import math
import re

In [33]:
# Find out what format Protein ontology accepts

In [34]:
# Load the positive gene/protein association tables for chemicals and for
# diseases, discarding every row that contains a missing value.
chem_g_uni = pd.read_csv('CHEM_GENE_UNIPROT.csv').dropna()
dis_g_uni = pd.read_csv('DIS_GENE_UNIPROT.csv').dropna()

# Report how many complete rows remain in each table.
for label, frame in (('chem shape', chem_g_uni), ('dis shape', dis_g_uni)):
    print(label, frame.shape)

chem shape (47611, 3)
dis shape (14945, 3)


In [35]:
chem_g_uni.sample(2)

Unnamed: 0,GeneID,UniprotID,ChemicalID
10334,7148,O95680,C574336
16420,799352,O13144,D014266


In [36]:
dis_g_uni.sample(2)

Unnamed: 0,GeneID,UniprotID,DiseaseID
2469,1586,P05093,MESH:D020018
3430,2222,B4DWP0,MESH:D010623


In [37]:
# Example of the Protein Ontology URI format for a UniProt accession:
# http://purl.obolibrary.org/obo/PR_E9PDJ7

In [38]:
# Rewrite every UniProt accession in both tables as a Protein Ontology URI by
# prepending the PRO base URI to it.
PRO_URI_PREFIX = 'http://purl.obolibrary.org/obo/PR_'


def _to_pro_uri(uniprot_id):
    """Return the Protein Ontology URI for a single UniProt accession."""
    return PRO_URI_PREFIX + uniprot_id


chem_g_uni['UniprotID'] = chem_g_uni['UniprotID'].map(_to_pro_uri)
dis_g_uni['UniprotID'] = dis_g_uni['UniprotID'].map(_to_pro_uri)

In [39]:
# Create the association files: one per entity type. Each line holds an entity
# ID (DiseaseID or ChemicalID) followed by the value of the UniprotID column,
# separated by a single space (np.savetxt's default delimiter).
association_files = ['associations_disPRO.txt', 'associations_chemPRO.txt']
np.savetxt(association_files[0], dis_g_uni[['DiseaseID', 'UniprotID']].values, fmt='%s')
np.savetxt(association_files[1], chem_g_uni[['ChemicalID', 'UniprotID']].values, fmt='%s')

# Merge the two files (diseases first, then chemicals) into PROAssociations.
# This is done in Python rather than by shelling out to `cat`, so it does not
# depend on a POSIX shell and any I/O failure raises an exception instead of
# being lost in an ignored exit status.
with open('PROAssociations', 'wb') as merged:
    for part_path in association_files:
        with open(part_path, 'rb') as part:
            merged.write(part.read())

In [40]:
# Build the entities file: every distinct disease ID followed by every distinct
# chemical ID, written one per line.
disease_ids = dis_g_uni['DiseaseID'].unique().tolist()
chemical_ids = chem_g_uni['ChemicalID'].unique().tolist()
entities = disease_ids + chemical_ids
np.savetxt('PROentities.lst', entities, fmt='%s')

In [None]:
# Run OPA2Vec on the Protein Ontology. The script is started from inside the
# opa2vec checkout, so every path argument is relative to that directory.
# NOTE(review): the ../msc-thesis/opa/ paths presumably point back at this
# notebook's directory (where PROAssociations and PROentities.lst are written)
# - confirm against the repository layout.
#
# An argument list plus `cwd` replaces the former shell string
# `(cd ... ; python2 ...)`: no shell is involved, and a missing opa2vec
# directory now raises immediately instead of letting python2 run from the
# wrong working directory after a failed `cd`.
# As before, the call returns the captured stdout and raises
# CalledProcessError if the script exits with a non-zero status.
subprocess.check_output(
    [
        'python2', 'runOPA2Vec.py',
        '-ontology', '../ontologies/pro_nonreasoned.owl',
        '-associations', '../msc-thesis/opa/PROAssociations',
        '-entities', '../msc-thesis/opa/PROentities.lst',
        '-outfile', '../msc-thesis/opa/PROVecs.lst',
    ],
    cwd='../../opa2vec/',
)