In [1]:
import collections
import re

In [2]:
def getClassNameHelper(astring):
    """
    Build a CamelCase name such as ThisIsAnUnusualName from a space delimited string.

    Each word is title-cased, the spaces are dropped, and every remaining
    non-word character (regex \\W, i.e. anything but letters, digits and "_")
    is removed. The result is also used for instance names, so a name that
    starts with a digit is returned unchanged.
    """
    titled = astring.title()
    withoutSpaces = "".join(titled.split(" "))
    return re.sub(r'[\W]+', '', withoutSpaces)
    
def getClassName(astring):
    """Return the class name for a space delimited string, as produced by getClassNameHelper."""
    className = getClassNameHelper(astring)
    return className
def getPropertyName(astring):
    """
    Take a space delimited string, return a property name such as thisIsAnUnusualName.

    The name is built by getClassNameHelper and then its first character is
    lower-cased. If nothing is left after cleaning (empty or all-punctuation
    input) an empty string is returned.
    """
    className = getClassNameHelper(astring)
    # Slice instead of indexing so an empty name does not raise IndexError.
    return className[:1].lower() + className[1:]

In [31]:
def getPubicationUrl(pubId):
    """
    Return the web URL for a reference id such as "PMID:14755292".

    The text before the first ":" selects the resource and the rest is the
    identifier within it. Recognised prefixes are PMID/pmid, GO, RESID and doi.
    Any other prefix (or an id without ":") yields an empty string, so callers
    can always concatenate the result.
    """
    # IntAct website where the imex applies to doesn't work properly
    # mint publication number doesn't work properly
    pub, _, idNum = pubId.partition(":")
    if pub == "PMID" or pub == "pmid":
        return "https://pubmed.ncbi.nlm.nih.gov/" + idNum + "/"
    if pub == "GO":
        return "http://amigo.geneontology.org/amigo/term/GO:" + idNum
    if pub == "RESID":
        return "https://annotation.dbi.udel.edu/cgi-bin/resid?id=" + idNum
    if pub == "doi":
        return "https://doi.org/" + idNum
    # Explicit fallback: the function previously fell off the end and returned None.
    return ""


In [None]:
class Node:
    """A term in the ontology graph, linked to its parent and child nodes."""

    def __init__(self, value):
        # payload stored on the node (the notebook stores the term id string)
        self.value = value
        # one node can have multiple child nodes
        self.childList = []
        self.parentList = []
def getParentIdList(termList):
    """
    Collect the parent ids named in the lines of one term.

    termList is a list of "tag: value" strings, for example:
    term = ['id: MI:0001',
     'name: interaction detection method',
     'relationship: part_of MI:0000 ! molecular interaction']

    A parent is given either as "is_a: MI:0013 ! biophysical" or as
    "relationship: part_of MI:1349 ! chembl"; all other lines are ignored.
    Returns the parent id strings in the order they appear.
    """
    parentIds = []
    for line in termList:
        # The id is the 2nd space-separated token of an is_a line and the 3rd
        # token of a relationship line (after the relationship type).
        if line.startswith("is_a"):
            tokenIndex = 1
        elif line.startswith("relationship"):
            tokenIndex = 2
        else:
            continue
        parentIds.append(line.split(" ")[tokenIndex])
    return parentIds

class GetTreeValues():
    """Collects the values of every node reachable from a root through childList."""

    def __init__(self):
        self.nodeValuesSet = set()

    def getSubsetId(self, node):
        """
        Take the root node of a subtree, return the values of all its nodes as a set.

        The root's own value is included. A new set object is created on every
        call, so sets returned by earlier calls are never modified afterwards.
        """
        # rebind (not clear) so previously returned sets stay intact
        self.nodeValuesSet = set()
        self.dfs(node)
        return self.nodeValuesSet

    def dfs(self, node):
        """
        Add the value of node and of all its descendants to self.nodeValuesSet.

        The traversal is iterative and remembers the nodes it has expanded, so
        a node shared by several parents is expanded once and a cycle in the
        child links cannot cause endless recursion. Falsy nodes (e.g. None)
        are skipped.
        """
        pending = [node]
        # Track identity rather than value: two distinct nodes carrying the
        # same value are still both expanded.
        expanded = set()
        while pending:
            current = pending.pop()
            if not current or id(current) in expanded:
                continue
            expanded.add(id(current))
            self.nodeValuesSet.add(current.value)
            pending.extend(current.childList)

In [None]:
def getSchemaFromText(term, id2node, classNames=None, enumIdSets=None):
    """
    Build the MCF schema text for one ontology term.

    term is the list of "tag: value" lines of the term, for example:
    term = ['id: MI:0000',
     'name: molecular interaction',
     'def: "Controlled vocabularies originally created for protein protein interactions, extended to other molecules interactions." [PMID:14755292]',
     'subset: Drugable',
     'subset: PSI-MI_slim']

    Parameters:
        term: list of strings as shown above; it must contain an "id" line.
        id2node: dictionary {term id: node}; the node's parentList gives the
            parents written to "specializationOf".
        classNames: optional dictionary {term id: class name}. Defaults to the
            notebook-level id2className.
        enumIdSets: optional iterable of (enum name, set of term ids) pairs,
            checked in order; the first set containing the term id decides the
            "typeOf" line. Defaults to the notebook-level interactionTypeIdSet,
            detectionMethodIdSet and interactionSourceIdset.

    Returns:
        A tuple (schema text, term id, dcid), or None when the term id is in
        none of the enum sets.

    Raises:
        IndexError: if the term has no "id" line.
        KeyError: if the term id or a parent id has no entry in classNames, or
            the term id is missing from id2node.
    """
    if classNames is None:
        classNames = id2className
    if enumIdSets is None:
        enumIdSets = (
            ("InteractionTypeEnum", interactionTypeIdSet),
            ("InteractionDetectionMethodEnum", detectionMethodIdSet),
            ("InteractionSourceEnum", interactionSourceIdset),
        )

    # A tag may occur several times (e.g. subset), so every tag maps to a list.
    termDic = collections.defaultdict(list)
    for line in term:
        key, _, value = line.partition(": ")
        termDic[key].append(value)

    idString = termDic['id'][0]
    enumName = None
    for candidateName, idSet in enumIdSets:
        if idString in idSet:
            enumName = candidateName
            break
    if enumName is None:
        return None
    dcid = classNames[idString]

    if termDic['def']:
        description, references = _splitDefinition(termDic['def'][0])
        termDic['def'] = [description]
        if references:
            termDic['references'] = references
    else:
        # A term without a definition simply gets no description/references.
        print("No def attribute", term)

    termDic["parentClassName"] = [classNames[node.value] for node in id2node[idString].parentList]

    '''
    termDic at this point, e.g.:
    id:  ['MI:0001']
    name:  ['interaction detection method']
    def:  ['Method to determine the interaction']
    subset:  ['Drugable', 'PSI-MI_slim']
    synonym:  ['"interaction detect" EXACT PSI-MI-short []']
    relationship:  ['part_of MI:0000 ! molecular interaction']
    references:  ['PMID:14755292']
    parentClassName:  ['MolecularInteraction']
    '''
    schemaPieceList = [
        "Node: dcid:" + dcid,
        "typeOf: dcs:" + enumName,
        "name: \"" + dcid + "\"",
    ]
    # Only these tags are written, in this order. _formatSchemaProperty also
    # knows "subset", "synonym" and "is_obsolete" should they be added here.
    keyList = ["id", "def", "references", "parentClassName"]
    for key in keyList:
        curLine = _formatSchemaProperty(key, termDic[key])
        if curLine is not None:
            schemaPieceList.append(curLine)

    return "\n".join(schemaPieceList), idString, dcid


def _splitDefinition(defLong):
    """Split a def value like '"Some text." [PMID:1, GO:2]' into (description, list of reference ids)."""
    idStart = defLong.rfind('[')
    if idStart == -1:
        # No bracketed reference list: the whole value is description text.
        quoted, refText = defLong, ""
    else:
        idEnd = defLong.find(']', idStart)
        if idEnd == -1:
            idEnd = len(defLong)
        quoted, refText = defLong[:idStart], defLong[idStart + 1:idEnd]
    description = quoted.strip()
    # Remove the surrounding quotes and one trailing period only when they are
    # really there, instead of cutting a fixed number of characters.
    if description.startswith('"'):
        description = description[1:]
    if description.endswith('"'):
        description = description[:-1]
    description = description.rstrip()
    if description.endswith('.'):
        description = description[:-1]
    references = refText.split(", ") if refText else []
    return description, references


def _formatSchemaProperty(key, values):
    """Render the values of one termDic key as an MCF property line; None means nothing to write."""
    if not values:
        return None
    if key == "id":
        return "identifier: \"" + values[0] + "\""
    if key == "def":
        text = values[0]
        if not text:
            return None
        return "description: \"" + text[:1].upper() + text[1:] + "\""
    if key == "references":
        # References whose prefix has no known URL are left out.
        urls = [getPubicationUrl(refId) for refId in values]
        quoted = ["\"" + url + "\"" for url in urls if url]
        if not quoted:
            return None
        return "references: " + ",".join(quoted)
    if key == "subset":
        # NOTE(review): only the first value gets the "dcs:" prefix - confirm before enabling this key.
        return "subsetOf: dcs:" + ",".join(values)
    if key == "synonym":
        return "alias: " + ",".join("\"" + item + "\"" for item in values)
    if key == "is_obsolete":
        return "isObsolete: " + ",".join(values)
    if key == "parentClassName":
        return "specializationOf: " + ",".join("dcs:" + item for item in values)
    return None

In [3]:
"""
June 10, 2020
Parsing Steps:
1. build the tree by the psi-mi number. A dictionary {psi-mi: node} is used 
to access nodes as well.
2. save all the tree nodes in the subtrees of the following three root nodes into three sets:
    id: MI:0001 name: interaction detection method
    id: MI:0190 name: interaction type
    id: MI:0444 name: database citation
3. save the nodes in the three sets to the corresponding enumeration schema
"""

'\nJune 10, 2020\nParsing Steps:\n1. build the tree by the psi-mi number. A dictionary {psi-mi: node} is used \nto access nodes as well.\n2. save all the tree nodes in the subtree of the three nodes into three set:\n    id: MI:0001 name: interaction detection method\n    id: MI:0190 name: interaction type\n    id: MI:0444 name: database citation\n3. save the nodes in the three sets to the corresponding enumearation schema\n'

In [4]:
# Read the whole ontology file into memory as a single string.
with open('mi.owl') as ontologyFile:
    file = ontologyFile.read()

In [5]:
# The file header is everything before the first blank line;
# it is turned into a dictionary in the next cell.
fileHeader = file.partition('\n\n')[0]

In [6]:
# Map every header tag to the list of values given for it; a tag that occurs
# on several lines keeps all of its values. Only the first ":" separates the
# tag from the value, so values may themselves contain ":".
fileHeaderDic = collections.defaultdict(list)
pairs = fileHeader.split("\n")
for pair in pairs:
    key, value = pair.partition(":")[::2]
    fileHeaderDic[key].append(value)

In [7]:
# Every blank-line separated section after the header, as raw text.
fileTerms = file.split('\n\n')[1:]

In [8]:
# Show which tags were found in the file header.
fileHeaderDic.keys()

dict_keys(['format-version', 'date', 'saved-by', 'auto-generated-by', 'subsetdef', 'synonymtypedef', 'default-namespace', 'remark', 'ontology'])

In [9]:
# First pass over the terms: create one node per term and record its class name.
# id2node:      term id -> Node
# id2className: term id -> class name derived from the term's name line
id2node = {}
id2className = {}
for termText in fileTerms:
    if termText.startswith("[Term]"):
        # The line after "[Term]" is read as the id line, e.g. "id: MI:0000",
        # and the one after that as the name line.
        idString = termText.split("\n")[1].split(" ")[1]
        className = getClassName(termText.split("\n")[2].split(": ")[1])
        id2className[idString] = className
        id2node[idString] = Node(idString)

In [10]:
# build the parent-child relation at the second iteration
for termText in fileTerms:
    if not termText.startswith("[Term]"):
        continue
    termList = termText.split("\n")
    idString = termList[1].split(" ")[1]
    childNode = id2node[idString]
    for pId in getParentIdList(termList[1:]):
        parentNode = id2node[pId]
        # Add each edge only once, so that re-running this cell (or a parent
        # named twice by the same term) does not duplicate list entries.
        if childNode not in parentNode.childList:
            parentNode.childList.append(childNode)
        if parentNode not in childNode.parentList:
            childNode.parentList.append(parentNode)

In [11]:
# get the idStrings for the three target set
dfsCaller = GetTreeValues()
interactionTypeIdSet = dfsCaller.getSubsetId(id2node["MI:0001"]) # root id: MI:0001 
detectionMethodIdSet = dfsCaller.getSubsetId(id2node["MI:0190"])# root id: MI:0190
interactionSourceIdset = dfsCaller.getSubsetId(id2node["MI:0444"]) # root id: MI:0444

In [12]:
# delete root node value from the set
interactionTypeIdSet.remove("MI:0001")
detectionMethodIdSet.remove("MI:0190")
interactionSourceIdset.remove("MI:0444")

In [30]:
# Sanity check: print how many term ids each of the three sets holds.
setList = [interactionTypeIdSet, detectionMethodIdSet, interactionSourceIdset]
print(list(map(len, setList)))

[333, 109, 180]


In [33]:
# Definitions of the three enumeration classes; they open the output file.
schema='''Node: dcid:InteractionTypeEnum\ntypeOf: schema:Class\nsubClassOf: schema:Enumeration\nname: "InteractionTypeEnum"\ndescription: "The interaction type enumeration in biomedical ontologies."\n\nNode: dcid:InteractionDetectionMethodEnum\ntypeOf: schema:Class\nsubClassOf: schema:Enumeration\nname: "InteractionDetectionMethodEnum"\ndescription: "The detection method enumeration in biomedical ontologies."\n\nNode: dcid:InteractionSourceEnum\ntypeOf: schema:Class\nsubClassOf: schema:Enumeration\nname: "InteractionSourceEnum"\ndescription: "The interaction source database enumeration in biomedical ontologies."'''
schemaList = [schema]
# "term id: dcid" lines, one per term written to the schema
psimi2dcid = []
for termText in fileTerms:
    if termText.startswith("[Term]"):
        term = termText.split("\n")[1:]
        schemaRes = getSchemaFromText(term, id2node)
        if not schemaRes:
            # term belongs to none of the three enumerations
            continue
        schema, psimi, dcid = schemaRes
        schemaList.append(schema)
        psimi2dcid.append("{}: {}".format(psimi, dcid))
# Schema nodes are separated by one blank line.
schemaEnumText = "\n\n".join(schemaList)
with open('BioOntologySchema.mcf','w') as fp:
    fp.write(schemaEnumText)
with open('psimi2dcid.txt','w') as fp:
    fp.write("\n".join(psimi2dcid))

In [34]:
# dev browser imported name: BioOntologySchema