# Notebook: Create OPA2VEC vectors for proteins on Protein Ontology
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: For all the chemicals/diseases in CTD, embed them in the context of the Protein Ontology via the proteins of their associated genes

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import subprocess
import math
import re

In [3]:
# Find out what format Protein ontology accepts

In [4]:
# Load the chemical-gene-UniProt and disease-gene-UniProt association tables,
# discarding every row that contains a missing value.
chem_g_uni = pd.read_csv('CHEM_GENE_UNIPROT.csv').dropna()
dis_g_uni = pd.read_csv('DIS_GENE_UNIPROT.csv').dropna()

# Report the (rows, columns) left in each table after the NaN filter.
print('chem shape', chem_g_uni.shape)
print('dis shape', dis_g_uni.shape)

chem shape (47611, 3)
dis shape (14945, 3)


In [5]:
chem_g_uni.sample(2)

Unnamed: 0,GeneID,UniprotID,ChemicalID
14508,10293,Q9BWF2,D005492
33856,56063,B4DHR3,D002945


In [6]:
dis_g_uni.sample(2)

Unnamed: 0,GeneID,UniprotID,DiseaseID
15266,10801,Q9UHD8,MESH:D009361
30301,51300,Q9NPL8,MESH:C537475


In [7]:
# Example Protein Ontology URI: http://purl.obolibrary.org/obo/PR_E9PDJ7

In [8]:
# Rewrite the UniprotID column of both tables in place so that each value
# becomes a full URI: the PURL prefix below followed by the original ID.
def _to_pro_uri(accession):
    """Return *accession* prefixed with the obolibrary 'PR_' PURL base."""
    return 'http://purl.obolibrary.org/obo/PR_' + accession


chem_g_uni['UniprotID'] = chem_g_uni['UniprotID'].map(_to_pro_uri)
dis_g_uni['UniprotID'] = dis_g_uni['UniprotID'].map(_to_pro_uri)

In [9]:
# Create associations files
# Output an association file for each of chem and dis: one line per table row,
# holding the entity ID followed by the UniprotID value, each formatted with
# '%s' (np.savetxt separates the two columns with its default delimiter).
np.savetxt(r'associations_disPRO.txt', dis_g_uni[['DiseaseID', 'UniprotID']].values, fmt='%s')
np.savetxt(r'associations_chemPRO.txt', chem_g_uni[['ChemicalID', 'UniprotID']].values, fmt='%s')

# Merge the two into PROAssociations. The first command uses '>' so the target
# is truncated on every run; the second appends with '>>'.
# check_call (rather than call) is used so that a failing `cat` raises
# CalledProcessError instead of returning an exit code that nobody inspects;
# on success it still returns 0, so the cell's displayed value is unchanged.
subprocess.check_call('cat associations_disPRO.txt > PROAssociations', shell=True)
subprocess.check_call('cat associations_chemPRO.txt >> PROAssociations', shell=True)

0

In [10]:
# Build the entities file: the distinct disease IDs first, then the distinct
# chemical IDs, written one per line.
disease_ids = dis_g_uni['DiseaseID'].unique().tolist()
chemical_ids = chem_g_uni['ChemicalID'].unique().tolist()
entities = disease_ids + chemical_ids
np.savetxt(r'PROentities.lst', entities, fmt='%s')

In [11]:
# Add HINO: append the VMH HINO associations and entities from ../validation
# to the two files built so far.
# NOTE: both commands append with '>>', so running this cell again without
# first regenerating PROAssociations and PROentities.lst duplicates the
# appended lines.
# check_call (rather than call) makes a missing or unreadable input file raise
# CalledProcessError instead of silently leaving the targets incomplete; it
# still returns 0 on success.
subprocess.check_call('cat ../validation/VMHhinoAssociations >> PROAssociations', shell=True)
subprocess.check_call('cat ../validation/VMHHINOentities.lst >> PROentities.lst', shell=True)

0

In [12]:
# Run OPA2Vec (python2 runOPA2Vec.py) from the ../../opa2vec/ directory and
# return everything it prints to stdout as bytes.
# The command is passed as an argument list with cwd= instead of a shell
# string of the form "(cd DIR ; python2 ...)": with ';' the script would still
# be launched from the wrong directory if the cd failed, whereas cwd= raises
# immediately when the directory is missing. A non-zero exit status raises
# CalledProcessError, as before.
# All paths below are resolved relative to cwd.
# NOTE(review): the -associations/-entities paths presumably point at the
# PROAssociations and PROentities.lst files written by the earlier cells,
# which requires this notebook to live in msc-thesis/opa — confirm.
subprocess.check_output(
    [
        'python2', 'runOPA2Vec.py',
        '-ontology', '../ontologies/pro_nonreasoned.owl',
        '-associations', '../msc-thesis/opa/PROAssociations',
        '-entities', '../msc-thesis/opa/PROentities.lst',
        '-outfile', '../msc-thesis/opa/PROVecs.lst',
    ],
    cwd='../../opa2vec/',
)

b'Loading of Axioms ...\nLoading ...\n    1%\n    2%\n    3%\n    4%\n    5%\n    6%\n    7%\n    8%\n    9%\n    10%\n    11%\n    12%\n    13%\n    14%\n    15%\n    16%\n    17%\n    18%\n    19%\n    20%\n    21%\n    22%\n    23%\n    24%\n    25%\n    26%\n    27%\n    28%\n    29%\n    30%\n    31%\n    32%\n    33%\n    34%\n    35%\n    36%\n    37%\n    38%\n    39%\n    40%\n    41%\n    42%\n    43%\n    44%\n    45%\n    46%\n    47%\n    48%\n    49%\n    50%\n    51%\n    52%\n    53%\n    54%\n    55%\n    56%\n    57%\n    58%\n    59%\n    60%\n    61%\n    62%\n    63%\n    64%\n    65%\n    66%\n    67%\n    68%\n    69%\n    70%\n    71%\n    72%\n    73%\n    74%\n    75%\n    76%\n    77%\n    78%\n    79%\n    80%\n    81%\n    82%\n    83%\n    84%\n    85%\n    86%\n    87%\n    88%\n    89%\n    90%\n    91%\n    92%\n    93%\n    94%\n    95%\n    96%\n    97%\n    98%\n    99%\n    ... finished\n    ... finished\nProperty Saturation Initialization ...\n    