In [89]:
import collections
import re
# TODO: divide the terms into others, interactionType, detectionMethod, interactionSource

In [144]:
def getClassNameHelper(astring):
    """
    Take a space delimited string, return a class name such as ThisIsAnUnusualName.

    Each word is title-cased, the spaces are removed, and every character that
    is not a letter, digit or underscore is dropped. An empty string (or one
    made only of punctuation) yields an empty string instead of raising.
    """
    jointName = astring.title().replace(" ", "")
    # Keep only word characters (letters, digits, underscore).
    # Names that start with a digit are returned unchanged: the earlier
    # version prepended an empty string to them, which had no effect.
    return re.sub(r'[\W]+', '', jointName)


def getClassName(astring):
    """
    Return the class name for a space delimited string, prefixed with "Bio/".
    """
    return "Bio/" + getClassNameHelper(astring)


def getPropertyName(astring):
    """
    Take a space delimited string, return a property name such as thisIsAnUnusualName.

    Returns an empty string when no usable character remains.
    """
    className = getClassNameHelper(astring)
    # Slicing (rather than indexing) keeps this safe for an empty className.
    return className[:1].lower() + className[1:]

In [145]:
"""
June 10, 2020
Parsing Steps:
1. Build the tree keyed by PSI-MI id. A dictionary {psi-mi id: node} is also kept
so that nodes can be looked up directly.
2. Collect every node in the subtrees rooted at these three terms into three sets:
    id: MI:0001 name: interaction detection method
    id: MI:0190 name: interaction type
    id: MI:0444 name: database citation
3. Save the nodes in the three sets to the corresponding enumeration schema.
"""

'\nJune 10, 2020\nParsing Steps:\n1. Build the tree keyed by PSI-MI id. A dictionary {psi-mi id: node} is also kept\nso that nodes can be looked up directly.\n2. Collect every node in the subtrees rooted at these three terms into three sets:\n    id: MI:0001 name: interaction detection method\n    id: MI:0190 name: interaction type\n    id: MI:0444 name: database citation\n3. Save the nodes in the three sets to the corresponding enumeration schema.\n'

In [146]:
# Read the whole ontology file into memory. The cells below treat its content as
# blank-line-separated stanzas ("[Term]", "id: ...", "name: ...").
# The encoding is pinned so the result does not depend on the platform locale.
# NOTE(review): assumes the file is UTF-8 encoded - confirm against the download.
with open('mi.owl', 'r', encoding='utf-8') as fp:
    file = fp.read()

In [147]:
# The header block is everything that precedes the first blank line of the file.
fileHeader = file.partition('\n\n')[0]

In [148]:
# Turn the header into {tag: [values]}; values of a repeated tag accumulate in a list.
# Only the first ":" separates tag from value, so later colons stay in the value.
fileHeaderDic = collections.defaultdict(list)
for pair in fileHeader.split("\n"):
    key, _sep, value = pair.partition(":")
    fileHeaderDic[key].append(value)

In [149]:
# Every blank-line-separated stanza after the header block is kept for parsing.
stanzas = file.split('\n\n')
fileTerms = stanzas[1:]

In [150]:
# Display the header tags that were parsed (shown as this cell's output).
fileHeaderDic.keys()

dict_keys(['format-version', 'date', 'saved-by', 'auto-generated-by', 'subsetdef', 'synonymtypedef', 'default-namespace', 'remark', 'ontology'])

In [151]:
#In this version synonym is saved as a string and is not linked to each synonym type.

In [152]:
# we don't import subset property in this version

#Three subsets are saved to enumeration schema 

# schemaPiece = """Node: dcid:BiomedicalOntologySubsetEnum
# typeOf: schema:Enumeration
# name: "BiomedicalOntologySubsetEnum"
# description: \"The subset enumeration in biomedical ontologies\""""
# schemaEnum = [schemaPiece]

# for subsetPair in fileHeaderDic['subsetdef']:
#     subsetList = subsetPair.split()  
#     dcid = subsetList[0]
#     description = ' '.join(subsetList[1:])
#     schemaPiece = 'Node: dcid:' + dcid + '\n'+\
# 'typeOf: dcs:BiomedicalOntologySubsetEnum\nname: “'+dcid+'”\ndescription: '+\
# description
#     schemaEnum.append(schemaPiece)
# schemaEnumText = '\n\n'.join(schemaEnum)

In [153]:
class Node:
    """
    One vertex of the ontology graph.

    value      -- the payload stored for this vertex (the callers store an id string)
    parentList -- Node objects directly above this one
    childList  -- Node objects directly below this one
    """

    def __init__(self, value):
        # both directions start empty; a node may have several parents and children
        self.childList, self.parentList = [], []
        self.value = value
def getParentIdList(termList):
    """
    Takes a list with each item containing one line of a term, return a list of
    idString of the parent nodes.

    A parent is taken from lines of either form:
        "is_a: MI:0013 ! biophysical"                 -> "MI:0013"
        "relationship: part_of MI:1349 ! chembl"      -> "MI:1349"

    The tag must match exactly, so other tags that merely begin with "is_a"
    (for example "is_anonymous: true") are ignored. Lines too short to carry
    an id are skipped instead of raising.

    Example:
    term = ['id: MI:0001',
     'name: interaction detection method',
     'relationship: part_of MI:0000 ! molecular interaction']
    returns ['MI:0000']
    """
    idStringList = []
    for term in termList:
        fields = term.split()
        if not fields:
            continue
        tag = fields[0]
        if tag == "is_a:" and len(fields) > 1:
            idStringList.append(fields[1])
        elif tag == "relationship:" and len(fields) > 2:
            # fields[1] is the relationship type (e.g. part_of), fields[2] the target id
            idStringList.append(fields[2])

    return idStringList

class GetTreeValues():
    """
    Collects the values of all nodes reachable from a root node through childList.

    Nodes only need a `value` attribute and a `childList` attribute.
    """

    def __init__(self):
        self.nodeValuesSet = set()

    def getSubsetId(self, node):
        """
        Take the root node, return the values of all nodes in its subtree
        (the root's own value included) as a set. A fresh set is returned on every call.
        """
        # reset return set to empty
        self.nodeValuesSet = set()
        # run a DFS on the tree
        self.dfs(node)
        return self.nodeValuesSet

    def dfs(self, node):
        """
        Take a node and add the value of every node reachable from it to
        self.nodeValuesSet.

        The traversal uses an explicit stack and skips nodes whose value was
        already collected, so a node shared by several parents is expanded
        once, and a cycle or a very deep chain cannot exhaust the recursion limit.
        """
        if not node:
            return
        pending = [node]
        while pending:
            current = pending.pop()
            if not current or current.value in self.nodeValuesSet:
                continue
            self.nodeValuesSet.add(current.value)
            pending.extend(current.childList)
        
    
id2node = {}
id2className = {}
# First pass: create one Node per [Term] stanza and record its class name.
# Stanza layout relied on here: line 0 is "[Term]", line 1 is "id: MI:0000",
# line 2 is "name: molecular interaction".
for termText in fileTerms:
    if not termText.startswith("[Term]"):
        continue
    termLines = termText.split("\n")
    # idString example: "MI:0000"
    idString = termLines[1].split(" ")[1]
    # split only on the first ": " so a name that itself contains ": " is kept whole
    termName = termLines[2].split(": ", 1)[1]
    className = getClassName(termName)
    id2className[idString] = className
    id2node[idString] = Node(idString)

In [154]:
# Second pass: now that every term has a Node, wire each term to its parents
# (and each parent back to the term) using the ids named in the stanza.
for termText in fileTerms:
    if termText.startswith("[Term]"):
        termList = termText.split("\n")
        idString = termList[1].split(" ")[1]
        parentIdList = getParentIdList(termList[1:])
        for pId in parentIdList:
            parentNode = id2node[pId]
            childNode = id2node[idString]
            parentNode.childList.append(childNode)
            childNode.parentList.append(parentNode)

In [155]:
# get the idStrings for the three target set
dfsCaller = GetTreeValues()
interactionTypeIdSet = dfsCaller.getSubsetId(id2node["MI:0001"]) # root id: MI:0001 
detectionMethodIdSet = dfsCaller.getSubsetId(id2node["MI:0190"])# root id: MI:0190
interactionSourceIdset = dfsCaller.getSubsetId(id2node["MI:0444"]) # root id: MI:0444

In [156]:
# delete root node value from the set
interactionTypeIdSet.remove("MI:0001")
detectionMethodIdSet.remove("MI:0190")
interactionSourceIdset.remove("MI:0444")

In [157]:
setList = [interactionTypeIdSet, detectionMethodIdSet,interactionSourceIdset]
print ([len(s) for s in setList])

[333, 109, 180]


In [158]:
def _splitDefinition(defLong):
    """
    Split the value of a def line into (description, list of reference ids).

    Example: '"Method to determine the interaction." [PMID:14755292]'
    gives ('Method to determine the interaction', ['PMID:14755292']).
    """
    idStart = defLong.rfind('[')
    if idStart == -1:
        # no reference list at all: the whole value is the quoted text
        quoted, idText = defLong, ""
    else:
        quoted = defLong[:idStart]
        idEnd = defLong.rfind(']')
        if idEnd < idStart:
            idEnd = len(defLong)
        idText = defLong[idStart + 1:idEnd]

    description = quoted.strip()
    if len(description) >= 2 and description[0] == '"' and description[-1] == '"':
        description = description[1:-1]
    description = description.strip()
    # The generated description carries no final period; drop it only when it is
    # actually there, so text without one keeps its last character.
    if description.endswith('.'):
        description = description[:-1]

    idList = idText.split(", ") if idText else []
    return description, idList


def _formatSchemaLine(key, values):
    """
    Return the schema line for one termDic key, or None when there is nothing to write.
    """
    if not values:
        return None
    if key == "id":
        return "psimiId: \"" + values[0] + "\""
    if key == "def":
        text = values[0]
        if not text:
            return None
        return "description: \"" + text[0].upper() + text[1:] + "\""
    if key == "publicationsId":
        # NOTE(review): the header nodes written ahead of the terms declare a property
        # named "publicationId" (no "s") - confirm which spelling is intended.
        return "publicationsId: " + ",".join("\"" + item + "\"" for item in values)
    if key == "subset":
        return "subsetOf: " + ",".join("dcs:" + item for item in values)
    if key == "synonym":
        return "alias: " + ",".join("\"" + item + "\"" for item in values)
    if key == "is_obsolete":
        return "isObsolete: " + ",".join(values)
    if key == "parentClassName":
        return "specializationOf: " + ",".join("dcs:" + item for item in values)
    return None


def getSchemaFromText(term, id2node, classNames=None, enumMembership=None):

    """
    Takes a list with each item containing the information, return a data schema.
    Example:
    term = ['id: MI:0000',
     'name: molecular interaction',
     'def: "Controlled vocabularies originally created for protein protein interactions, extended to other molecules interactions." [PMID:14755292]',
     'subset: Drugable',
     'subset: PSI-MI_slim']

    Parameters:
    term           -- the lines of one term, without the "[Term]" line
    id2node        -- dictionary {idString: node}; node.parentList gives the parents
    classNames     -- optional dictionary {idString: class name}; defaults to the
                      notebook-level id2className
    enumMembership -- optional sequence of (set of idStrings, enumeration name);
                      the first set containing the term decides its subClassOf.
                      Defaults to the three notebook-level id sets.

    Returns the schema text of the term, or None when the term has no id or
    belongs to none of the enumerations. A term without a def line is still
    written (without description) after printing a notice.
    """
    if classNames is None:
        classNames = id2className
    if enumMembership is None:
        enumMembership = (
            (interactionTypeIdSet, "InteractionTypeEnum"),
            (detectionMethodIdSet, "InteractionDetectionMethodEnum"),
            (interactionSourceIdset, "InteractionSourceEnum"),
        )

    termDic = collections.defaultdict(list)
    for line in term:
        key, _sep, value = line.partition(": ")
        termDic[key].append(value)

    if not termDic['id']:
        return None
    idString = termDic['id'][0]

    # Decide the enumeration first so that terms outside the target sets are
    # skipped before any further parsing or diagnostics.
    enumName = None
    for idSet, name in enumMembership:
        if idString in idSet:
            enumName = name
            break
    if enumName is None:
        return None

    if termDic['def']:
        description, idList = _splitDefinition(termDic['def'][0])
        termDic['def'] = [description]
        if idList:
            termDic['publicationsId'] = idList
    else:
        print("No def attribute",term)

    termDic["parentClassName"] = [classNames[node.value] for node in id2node[idString].parentList]

    '''
    termDic:
    id:  ['MI:0001']
    name:  ['interaction detection method']
    def:  ['Method to determine the interaction']
    subset:  ['Drugable', 'PSI-MI_slim']
    synonym:  ['"interaction detect" EXACT PSI-MI-short []']
    relationship:  ['part_of MI:0000 ! molecular interaction']
    publicationsId:  ['PMID:14755292']
    parentClassName:  ['MolecularInteraction']
    '''

    className = classNames[idString]
    schemaPieceList = [
        "Node: dcid:" + className,
        "typeOf: schema:Class",
        "subClassOf: dcs:" + enumName,
        "name: \"" + className + "\"",
    ]

    #keyList = ["id", "def","publicationsId","subset","synonym","is_obsolete","parentClassName"]
    keyList = ["id", "def","publicationsId","parentClassName"]
    for key in keyList:
        curLine = _formatSchemaLine(key, termDic[key])
        if curLine is not None:
            schemaPieceList.append(curLine)

    return "\n".join(schemaPieceList)

In [159]:
# Hand-written nodes (properties and enumeration classes) that are written
# ahead of the generated term nodes. Content is kept exactly as before.
schema = '''Node: dcid:ProteinProteinInteraction
typeOf: schema:Class
name:"ProteinProteinInteraction"
subClassOf: dcs:Protein
description: "An interaction between proteins."

Node: dcid:psimiId
typeOf: schema:Property
name: "psimiId"
description: "The identifier for Proteomics Standards Initiative Molecular Interaction."
rangeIncludes: schema:Text
domainIncludes:dcs:InteractionTypeEnum,dcs:InteractionDetectionMethodEnum,dcs:InteractionSourceEnum

Node: dcid:publicationId
typeOf: schema:Property
name: "publicationId"
description: "Source and ID of the reference."
rangeIncludes: schema:Text
domainIncludes: dcs:InteractionTypeEnum,dcs:InteractionDetectionMethodEnum,dcs:InteractionSourceEnum,dcs:ProteinProteinInteraction

Node: dcid:InteractionTypeEnum
typeOf: schema:Class
subClassOf: schema:Enumeration
name: "InteractionTypeEnum"
description: "The interaction type enumeration in biomedical ontologies."

Node: dcid:InteractionDetectionMethodEnum
typeOf: schema:Class
subClassOf: schema:Enumeration
name: "InteractionDetectionMethodEnum"
description: "The detection method enumeration in biomedical ontologies."

Node: dcid:InteractionSourceEnum
typeOf: schema:Class
subClassOf: schema:Enumeration
name: "InteractionSourceEnum"
description: "The interaction source database enumeration in biomedical ontologies."'''
schemaList = [schema]

# One schema node per [Term] stanza; getSchemaFromText returns None for terms
# that are not written, and those are left out.
for termText in fileTerms:
    if not termText.startswith("[Term]"):
        continue
    term = termText.split("\n")[1:]
    schema = getSchemaFromText(term, id2node)
    if schema:
        schemaList.append(schema)

# Nodes are separated by one blank line in the output file.
schemaEnumText = "\n\n".join(schemaList)
# The encoding is pinned so that non-ASCII text in descriptions is written the
# same way on every platform.
with open('BioOntologySchema.mcf', 'w', encoding='utf-8') as fp:
    fp.write(schemaEnumText)

In [84]:
# Multi-line copy of the hand-written header nodes (property and enumeration
# definitions). No other visible cell reads `s`.
# NOTE(review): the first node here says "typeOf: Class", while the header string
# assigned to `schema` in the In [159] cell says "typeOf: schema:Class" - confirm
# which one is intended, or delete this cell if it is a leftover.
s ='''Node: dcid:ProteinProteinInteraction
typeOf: Class
name:"ProteinProteinInteraction"
subClassOf: dcs:Protein
description: "An interaction between proteins."

Node: dcid:psimiId
typeOf: schema:Property
name: "psimiId"
description: "The identifier for Proteomics Standards Initiative Molecular Interaction."
rangeIncludes: schema:Text
domainIncludes:dcs:InteractionTypeEnum,dcs:InteractionDetectionMethodEnum,dcs:InteractionSourceEnum

Node: dcid:publicationId
typeOf: schema:Property
name: "publicationId"
description: "Source and ID of the reference."
rangeIncludes: schema:Text
domainIncludes: dcs:InteractionTypeEnum,dcs:InteractionDetectionMethodEnum,dcs:InteractionSourceEnum,dcs:ProteinProteinInteraction

Node: dcid:InteractionTypeEnum
typeOf: schema:Class
subClassOf: schema:Enumeration
name: "InteractionTypeEnum"
description: "The interaction type enumeration in biomedical ontologies."

Node: dcid:InteractionDetectionMethodEnum
typeOf: schema:Class
subClassOf: schema:Enumeration
name: "InteractionDetectionMethodEnum"
description: "The detection method enumeration in biomedical ontologies."

Node: dcid:InteractionSourceEnum
typeOf: schema:Class
subClassOf: schema:Enumeration
name: "InteractionSourceEnum"
description: "The interaction source database enumeration in biomedical ontologies."'''