In [1]:
import collections

In [97]:
# Read both inputs fully into memory: the raw MINT dump and the text of the
# PSI-MI -> dcid mapping file. The encoding is given explicitly so decoding does
# not depend on the platform's locale default.
# NOTE(review): assumes both files are UTF-8 (or plain ASCII) - confirm.
with open('./mint_database', 'r', encoding='utf-8') as fp:
    file = fp.read()
with open('../proteinInteractionEBI/psimi2dcid.txt', 'r', encoding='utf-8') as fp:
    p2d = fp.read()

In [98]:
# One entry per line of the raw dump. A trailing newline leaves a final empty
# string in the list, which the conversion loop skips via its empty-line check.
lines = file.split('\n')

In [99]:
# Lookup table built from the mapping text in `p2d`: every non-blank line is
# split on ": " and stored as {first field: second field}.
# `p2d` itself is left untouched (still the raw text), so this cell can be
# re-run without reloading the file.
psimi2dcid = {}
for entry in p2d.split("\n"):
    # Blank lines (e.g. the one produced by a trailing newline) carry no mapping.
    if not entry.strip():
        continue
    fields = entry.split(": ")
    # A non-blank line without the ": " separator still raises IndexError here,
    # so a malformed mapping file is not silently accepted.
    psimi2dcid[fields[0]] = fields[1]

In [100]:
def getProteinDcid(mintAliases):
    """
    Derive the protein dcid from the alias column of a MINT record.

    Only the first "|"-separated alias is looked at. From that alias the text
    between the first ":" and the following ":" or "(" is taken and upper-cased.

    Args:
        mintAliases: alias column string, e.g. "db:name(display_long)|...".
    Returns:
        str: the upper-cased name used as the protein dcid.
    Raises:
        IndexError: if the first alias contains no ":" (e.g. a "-" placeholder).
    """
    firstAlias = mintAliases.split("|", 1)[0]
    # Index 1 is the segment right after the first colon; a colon-less alias
    # has no such segment and raises IndexError.
    afterPrefix = firstAlias.split(":", 2)[1]
    bareName = afterPrefix.split("(", 1)[0]
    return bareName.upper()

In [101]:
def _extractPsimiId(field):
    """Return the lookup key for psimi2dcid embedded in one controlled-vocabulary column."""
    # Takes the text after the first ':"', cuts it at the first "(" and drops the
    # final character (the closing quote).
    # NOTE(review): assumes fields look like  psi-mi:"MI:0676"(some label)  - confirm.
    return field.split(":\"")[1].split("(")[0][:-1]


def getSchemaFromText(term):
    """
    Build the MCF node text for one interaction record.

    Args:
        term: list of column strings from one tab-split line of the MINT dump.
            Columns 4, 5, 6, 8, 11, 12 and 14 (0-based) are read.
    Returns:
        str: newline-joined MCF lines for a ProteinProteinInteraction node, e.g.

            Node: dcid:RPN1_YEAST_RPN3_YEAST
            typeOf: ProteinProteinInteraction
            name: "RPN1_YEAST_RPN3_YEAST"
            interactingProtein: dcs:RPN1_YEAST,dcs:RPN3_YEAST
            interactionDetectionMethod: dcs:TandemAffinityPurification
            interactionType: dcs:PhysicalAssociation
            interactionSource: dcs:Mint
            confidence: "intact-miscore:0.76"
            references: "pubmed:16554755","imex:IM-15332","mint:MINT-5218454"
    Raises:
        IndexError: if the row has fewer than 15 columns or a column does not
            have the expected "prefix:value" layout.
        KeyError: if a parsed PSI-MI id is missing from psimi2dcid.
    """
    proteins = [getProteinDcid(term[4]), getProteinDcid(term[5])]
    detectionMethod = psimi2dcid[_extractPsimiId(term[6])]
    references = term[8].split("|")
    interactionType = psimi2dcid[_extractPsimiId(term[11])]
    interactionSource = psimi2dcid[_extractPsimiId(term[12])]
    # NOTE(review): column 13 (presumably the interaction identifiers) is not
    # emitted; add a property line below if it should be part of the node.
    confidence = term[14]

    # The node is named after its two participants, in file order.
    dcid = proteins[0] + "_" + proteins[1]
    # NOTE(review): the typeOf value carries no "dcs:" prefix while the property
    # values below do - confirm this is what the MCF consumer expects.
    schemaPieceList = [
        "Node: dcid:" + dcid,
        "typeOf: ProteinProteinInteraction",
        "name: \"" + dcid + "\"",
    ]

    # Properties whose values are references to other nodes, in output order.
    referenceProperties = (
        ("interactingProtein", proteins),
        ("interactionDetectionMethod", [detectionMethod]),
        ("interactionType", [interactionType]),
        ("interactionSource", [interactionSource]),
    )
    for propertyName, values in referenceProperties:
        schemaPieceList.append(
            propertyName + ": " + ",".join("dcs:" + value for value in values)
        )

    # Text-valued properties are written as quoted strings; confidence comes
    # before references in the emitted node.
    schemaPieceList.append("confidence: \"" + confidence + "\"")
    schemaPieceList.append(
        "references: " + ",".join("\"" + reference + "\"" for reference in references)
    )

    return "\n".join(schemaPieceList)

In [109]:
# MCF output pieces: the ontology header (class + property definitions) first,
# then one node per successfully converted record.
schemaList = []
schema='''Node: dcid:ProteinProteinInteraction\nname: "ProteinProteinInteraction"\ntypeOf: schema:Class\nsubClassOf: schema:Thing\ndescription: "The Interaction between proteins."\n\nNode: dcid:interactingProtein\ntypeOf: schema:Property\nname: "interactingProtein"\ndescription: "The participant proteins in protein-protein interaction."\nrangeIncludes: dcs:Protein\ndomainIncludes: dcs:ProteinProteinInteraction\n\nNode: dcid:interactionDetectionMethod\ntypeOf: schema:Property\nname: "interactionDetectionMethod"\ndescription: "The interaction detection method used in the experiment"\nrangeIncludes: dcs:InteractionDetectionMethodEnum\ndomainIncludes: dcs:ProteinProteinInteraction\n\nNode: dcid:interactionType\ntypeOf: schema:Property\nname: "interactionType"\ndescription: "The molecular interaction type"\nrangeIncludes: dcs:InteractionTypeEnum\ndomainIncludes: dcs:ProteinProteinInteraction\n\nNode: dcid:interactionSource\ntypeOf: schema:Property\nname: "interactionSource"\ndescription: "The database where the interaction record extracted"\nrangeIncludes: dcs:InteractionSourceEnum\ndomainIncludes: dcs:ProteinProteinInteraction'''
schemaList.append(schema)
# Raw lines that could not be converted, kept so they can be inspected later.
oneProteinInteraction = []
for line in lines:
    if not line:
        continue
    term = line.split('\t')
    try:
        schema = getSchemaFromText(term)
    except (IndexError, KeyError):
        # IndexError: short row or a column without the expected layout.
        # KeyError: PSI-MI id missing from psimi2dcid.
        # Anything else (including KeyboardInterrupt) is a real problem and is
        # allowed to propagate instead of being silently swallowed.
        oneProteinInteraction.append(line)
        continue

    if schema:
        schemaList.append(schema)

# Nodes are separated by a blank line in the final MCF text.
schemaEnumText = "\n\n".join(schemaList)

In [112]:
# Persist the assembled MCF text. The encoding is explicit so the written bytes
# do not depend on the platform's locale default.
with open('BioMINTSchema.mcf', 'w', encoding='utf-8') as fp:
    fp.write(schemaEnumText)

In [110]:
# Sanity check: number of MCF pieces collected, i.e. the ontology header plus
# one entry per record that was converted without error.
len(schemaList)

133125

In [57]:
# Ontology header as readable multi-line text: the ProteinProteinInteraction
# class followed by its four property definitions.
# NOTE(review): appears to duplicate the escaped one-line `schema` literal used
# when building the MCF output - consider keeping a single copy.
s = '''Node: dcid:ProteinProteinInteraction
name: "ProteinProteinInteraction"
typeOf: schema:Class
subClassOf: schema:Thing
description: "The Interaction between proteins."

Node: dcid:interactingProtein
typeOf: schema:Property
name: "interactingProtein"
description: "The participant proteins in protein-protein interaction."
rangeIncludes: dcs:Protein
domainIncludes: dcs:ProteinProteinInteraction

Node: dcid:interactionDetectionMethod
typeOf: schema:Property
name: "interactionDetectionMethod"
description: "The interaction detection method used in the experiment"
rangeIncludes: dcs:InteractionDetectionMethodEnum
domainIncludes: dcs:ProteinProteinInteraction

Node: dcid:interactionType
typeOf: schema:Property
name: "interactionType"
description: "The molecular interaction type"
rangeIncludes: dcs:InteractionTypeEnum
domainIncludes: dcs:ProteinProteinInteraction

Node: dcid:interactionSource
typeOf: schema:Property
name: "interactionSource"
description: "The database where the interaction record extracted"
rangeIncludes: dcs:InteractionSourceEnum
domainIncludes: dcs:ProteinProteinInteraction'''

In [111]:
# Swap typographic double quotes for plain ASCII ones: opening quotes first
# (result kept in `news`), then closing quotes (final text in `nnews`).
news = s.replace("“", '"')
nnews = news.replace("”", '"')

In [59]:
# Show the quote-normalised header text as a string repr (newlines as \n).
nnews

'Node: dcid:ProteinProteinInteraction\nname: "ProteinProteinInteraction"\ntypeOf: schema:Class\nsubClassOf: schema:Thing\ndescription: "The Interaction between proteins."\n\nNode: dcid:interactingProtein\ntypeOf: schema:Property\nname: "interactingProtein"\ndescription: "The participant proteins in protein-protein interaction."\nrangeIncludes: dcs:Protein\ndomainIncludes: dcs:ProteinProteinInteraction\n\nNode: dcid:interactionDetectionMethod\ntypeOf: schema:Property\nname: "interactionDetectionMethod"\ndescription: "The interaction detection method used in the experiment"\nrangeIncludes: dcs:InteractionDetectionMethodEnum\ndomainIncludes: dcs:ProteinProteinInteraction\n\nNode: dcid:interactionType\ntypeOf: schema:Property\nname: "interactionType"\ndescription: "The molecular interaction type"\nrangeIncludes: dcs:InteractionTypeEnum\ndomainIncludes: dcs:ProteinProteinInteraction\n\nNode: dcid:interactionSource\ntypeOf: schema:Property\nname: "interactionSource"\ndescription: "The databa