In [30]:
import collections
import re

In [44]:
def getPubicationUrl(pubId):
    """Return a URL for a publication reference, or "" when none applies.

    Args:
        pubId: a "<source>:<id>" string, e.g. "pubmed:16554755". Everything
            after the first ":" is treated as the id, so ids that themselves
            contain ":" are kept intact.

    Returns:
        The URL string for "pubmed", "doi" and "rcsb pdb" references.
        "" for "imex" and "mint" (their pages do not resolve properly) and
        for any other / unrecognised source, so callers always get a str.
    """
    pub, _, idNum = pubId.partition(":")
    if pub == "pubmed":
        return "https://pubmed.ncbi.nlm.nih.gov/" + idNum + "/"
    if pub == "doi":
        # NOTE(review): ".x" is appended unconditionally; presumably the source
        # data stores these DOIs without that suffix -- confirm against the data.
        return "https://doi.org/" + idNum + ".x"
    if pub == "rcsb pdb":
        return "https://www.rcsb.org/structure/" + idNum
    # "imex": the IntAct website the imex id applies to doesn't work properly.
    # "mint": the mint publication number doesn't work properly.
    # Anything else is unknown. All of these yield the empty string rather
    # than an implicit None, which would break string concatenation downstream.
    return ""
        
    
def getIdentifierUrl(sourceId):
    """Map an interaction identifier ("<source>:<id>") to a URL or dcid reference.

    Returns "" when the source has no usable URL (e.g. "imex", whose
    identifier URL doesn't work, "reactome", or any unknown source).
    A "psi-mi" identifier is translated through the module-level psimi2dcid
    table into a "dcs:<dcid>" reference instead of a URL.
    """
    source, _, idNum = sourceId.partition(":")

    if source == "psi-mi":
        # The id is wrapped in double quotes in the source data; strip them
        # before the lookup.
        return "dcs:" + psimi2dcid[idNum[1:-1]]

    urlPrefixBySource = {
        "intact": "https://www.ebi.ac.uk/intact/interaction/",
        "mint": "https://mint.bio.uniroma2.it/index.php/detailed-curation/?id=",
        "emdb": "https://www.ebi.ac.uk/pdbe/entry/emdb/",
        "wwpdb": "https://www.rcsb.org/structure/",
        "rcsb pdb": "https://www.rcsb.org/structure/",
        "pdbe": "https://www.ebi.ac.uk/pdbe/entry/pdb/",
    }
    urlPrefix = urlPrefixBySource.get(source)
    if urlPrefix is None:
        return ""
    return urlPrefix + idNum
        
        
        

In [45]:
def getProteinDcid(mintAliases):
    """
    Extract the protein dcid from a MINT alias column.

    The column is a "|"-separated list of "<db>:<value>(<label>)" aliases.
    The upper-cased value of the alias labelled "display_long" is the dcid of
    the participant protein. That alias is looked up by its label rather than
    assumed to be first; if no alias carries the label, the first alias is
    used as a fallback.

    Returns None when the column is empty or a one-character placeholder:
    for a self-interacting protein, one of the protein names is empty,
    denoted by "-".
    """
    if len(mintAliases) <= 1:
        return None

    aliases = mintAliases.split("|")
    chosen = aliases[0]
    for alias in aliases:
        if alias.endswith("(display_long)"):
            chosen = alias
            break
    return chosen.split(":")[1].split("(")[0].upper()

In [46]:
def checkUniprot(alias):
    """
    Tell whether an identifier column is acceptable for import.

    True when the column is a single character (the "-" placeholder) or when
    the text before the first ":" is "uniprotkb"; False otherwise.
    """
    if len(alias) == 1:
        return True
    database = alias.partition(":")[0]
    return database == "uniprotkb"

In [47]:
def checkDcid(alias):
    """
    Validate the "display_long" alias (the protein name in UniProt) of a participant.

    Args:
        alias: the alias column, a "|"-separated list of
            "<db>:<value>(<label>)" entries, or a one-character placeholder
            such as "-".

    Returns:
        1 if alias is a one-character placeholder, or if its display_long
          value has the right format: only word characters (letters, digits,
          "_") and exactly two parts separated by "_".
        0 if a display_long value is present but has the wrong format.
        2 if there is no display_long alias at all.
    """
    if len(alias) == 1:
        return 1

    aliasDic = {}
    for ali in alias.split("|"):
        parts = ali.split("(")
        if len(parts) < 2 or ":" not in parts[0]:
            # Malformed entry (no "(label)" or no "db:" prefix): ignore it
            # instead of raising IndexError in the middle of the import loop.
            continue
        key = parts[1][:-1]  # drop the closing ")"
        aliasDic[key] = parts[0].split(":")[1]

    if "display_long" not in aliasDic:
        return 2

    dcid = aliasDic["display_long"]
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    if re.search(r"\W", dcid) is not None or len(dcid.split("_")) != 2:
        return 0
    # Explicit success value; previously this path fell through and returned None.
    return 1
        

In [48]:
def _psimiFieldToDcid(field):
    """Look up the dcid for a PSI-MI column of the form psi-mi:"<id>"(<name>)."""
    # Text between ':"' and the closing '"' that precedes "(" is the PSI-MI id.
    psimiId = field.split(":\"")[1].split("(")[0][:-1]
    return psimi2dcid[psimiId]


def getSchemaFromText(term):
    """
    Build the MCF node text for one MINT record.

    Args:
        term: the tab-split columns of one record. Columns read here:
            4, 5 (aliases of the two participants), 6 (detection method),
            8 (publications), 11 (interaction type), 12 (source database),
            13 (interaction identifiers), 14 (confidence, "-" when absent).

    Returns:
        The node as newline-joined "property: value" lines, for example:

            Node: dcid:RPN1_YEAST_RPN3_YEAST
            typeOf: ProteinProteinInteraction
            name: "RPN1_YEAST_RPN3_YEAST"
            interactingProtein: dcs:bio/UniProt_RPN1_YEAST,dcs:bio/UniProt_RPN3_YEAST
            interactionDetectionMethod: dcs:TandemAffinityPurification
            interactionType: dcs:PhysicalAssociation
            interactionSource: dcs:Mint
            identifier: "https://www.ebi.ac.uk/intact/interaction/EBI-6941860"
            confidence: "intact-miscore:0.76"
            references: "https://pubmed.ncbi.nlm.nih.gov/16554755/"

        The "identifier" and "references" lines are omitted when no usable
        URL remains, instead of being emitted with an empty value.

    Raises:
        ValueError: if neither participant column yields a protein dcid.
        KeyError: if a PSI-MI id is missing from psimi2dcid.
        IndexError: if a column is missing or not in the expected format.
    """
    proteins = [
        protein
        for protein in (getProteinDcid(term[4]), getProteinDcid(term[5]))
        if protein
    ]
    if not proteins:
        raise ValueError("record has no interacting protein alias")

    # A self-interaction has a single participant; its dcid repeats the name.
    dcid = proteins[0] + "_" + proteins[-1]

    schemaLines = [
        "Node: dcid:" + dcid,
        "typeOf: ProteinProteinInteraction",
        "name: \"" + dcid + "\"",
        "interactingProtein: "
        + ",".join("dcs:bio/UniProt_" + protein for protein in proteins),
        "interactionDetectionMethod: dcs:" + _psimiFieldToDcid(term[6]),
        "interactionType: dcs:" + _psimiFieldToDcid(term[11]),
        "interactionSource: dcs:" + _psimiFieldToDcid(term[12]),
    ]

    # Truthiness filter drops both "" (no usable URL) and a stray None.
    identifiers = [
        url for url in (getIdentifierUrl(x) for x in term[13].split("|")) if url
    ]
    if identifiers:
        schemaLines.append(
            "identifier: " + ",".join("\"" + url + "\"" for url in identifiers)
        )

    confidence = term[14]
    if confidence != "-":
        schemaLines.append("confidence: \"" + confidence + "\"")

    references = [
        url for url in (getPubicationUrl(x) for x in term[8].split("|")) if url
    ]
    if references:
        schemaLines.append(
            "references: " + ",".join("\"" + url + "\"" for url in references)
        )

    return "\n".join(schemaLines)

In [49]:
# Raw MINT export: one interaction record per line, tab-separated columns.
with open('./mint_database', 'r') as mintHandle:
    file = mintHandle.read()

# Paired PSI-MI ids and DCIDs; this file was generated from the EBI MI Ontology.
with open('../proteinInteractionEBI/psimi2dcid.txt', 'r') as mappingHandle:
    p2d = mappingHandle.read()

In [50]:
# One entry per record line of the MINT file; empty entries are skipped by the loops that consume `lines`.
lines = file.split('\n')

In [51]:
# Build the PSI-MI id -> dcid lookup from the "<psi-mi id>: <dcid>" lines of p2d.
# p2d is left untouched (it stays the raw file text), so this cell can be
# re-run without reloading the file.
psimi2dcid = {}
for p2dLine in p2d.split("\n"):
    if not p2dLine.strip():
        # Tolerate blank lines, e.g. a trailing newline at the end of the file.
        continue
    # Split on the first ": " only; a line without it raises ValueError.
    psimiId, dcid = p2dLine.split(": ", 1)
    psimi2dcid[psimiId] = dcid

In [52]:
# schemaList[0] is the class/property schema header; every following entry is
# the MCF node of one imported interaction record.
schemaList = []
schema='''Node: dcid:ProteinProteinInteraction\nname: "ProteinProteinInteraction"\ntypeOf: schema:Class\nsubClassOf: schema:Thing\ndescription: "The Interaction between proteins."\n\nNode: dcid:interactingProtein\ntypeOf: schema:Property\nname: "interactingProtein"\ndescription: "The participant proteins in protein-protein interaction."\nrangeIncludes: dcs:Protein\ndomainIncludes: dcs:ProteinProteinInteraction\n\nNode: dcid:interactionDetectionMethod\ntypeOf: schema:Property\nname: "interactionDetectionMethod"\ndescription: "The interaction detection method used in the experiment"\nrangeIncludes: dcs:InteractionDetectionMethodEnum\ndomainIncludes: dcs:ProteinProteinInteraction\n\nNode: dcid:interactionType\ntypeOf: schema:Property\nname: "interactionType"\ndescription: "The molecular interaction type"\nrangeIncludes: dcs:InteractionTypeEnum\ndomainIncludes: dcs:ProteinProteinInteraction\n\nNode: dcid:interactionSource\ntypeOf: schema:Property\nname: "interactionSource"\ndescription: "The database where the interaction record extracted"\nrangeIncludes: dcs:InteractionSourceEnum\ndomainIncludes: dcs:ProteinProteinInteraction'''
schemaList.append(schema)

# Records that were not imported, grouped by the reason they were rejected.
wrongDcid = []   # display_long alias present but malformed
failed = []      # schema generation raised an error
noUniprot = []   # a participant has no UniProt identifier
for line in lines:

    if len(line) == 0:
        continue

    term = line.split('\t')

    # check if record has correct UniProt Protein Name
    # (only a return value of 0 rejects the record)
    c1, c2 = checkDcid(term[4]), checkDcid(term[5])

    if c1==0 or c2==0:
        wrongDcid.append(line)
        continue

    # check if record has Uniprot Identifier
    u1, u2 = checkUniprot(term[0]), checkUniprot(term[1])
    if not u1 or not u2:
        noUniprot.append(line)
        continue

    try:
        schema = getSchemaFromText(term)
    except Exception:
        # Record the bad line and keep going. `Exception` (not a bare except)
        # so that KeyboardInterrupt / SystemExit still stop the loop.
        failed.append(line)
        continue

    if schema:
        schemaList.append(schema)



In [53]:
# Write a small sample: the first 20 schemaList entries, separated by blank lines.
schemaEnumText = "\n\n".join(schemaList[:20])
with open('BioMINTSchema.mcf', 'w') as sampleFile:
    sampleFile.write(schemaEnumText)

In [61]:
# Number of imported records. The first schemaList entry is the class/property
# schema header rather than a record, hence the -1.
len(schemaList)-1

129585

In [57]:
# Number of records we didn't import: print one count per rejection reason
# (wrong dcid, no UniProt identifier, failed schema generation) and sum them in fCount.
fCount = 0
for skipped in (wrongDcid, noUniprot, failed):
    skippedCount = len(skipped)
    print(skippedCount)
    fCount += skippedCount

3540
42
0


In [58]:
# The whole schema is too large to upload to the dev browser at once, so it is
# written out in consecutive slices of 44375 entries (3 parts for this dataset),
# named BioMINTSchema_part1.mcf, BioMINTSchema_part2.mcf, ...
count = 1
for start in range(0, len(schemaList), 44375):
    partName = 'BioMINTSchema_part' + str(count) + '.mcf'
    schemaEnumText = "\n\n".join(schemaList[start:start + 44375])
    with open(partName, 'w') as partFile:
        partFile.write(schemaEnumText)
    count += 1

In [59]:
# Show all the publications and identifier source examples.
# publications / identifier: source name -> first id seen for that source.
# pCount / iCount: source name -> number of occurrences.

publications = {}
identifier = {}
pCount = collections.defaultdict(int)
iCount = collections.defaultdict(int)
for line in lines:
    if len(line) == 0:
        continue
    term = line.split('\t')
    if len(term) < 14:
        # Columns 8 and 13 are required; report the malformed record and skip
        # it rather than continuing with values left over from a previous line.
        print(line)
        continue
    for p in term[8].split("|"):
        # Split on the first ":" only so ids that contain ":" stay intact.
        pSource, pValue = p.partition(":")[::2]
        if pSource not in publications:
            publications[pSource] = pValue
        pCount[pSource] += 1
    for i in term[13].split("|"):
        iSource, iValue = i.partition(":")[::2]
        if iSource not in identifier:
            identifier[iSource] = iValue
        iCount[iSource] += 1


In [60]:
# Display the first example id collected for each publication source and each identifier source.
publications, identifier

({'pubmed': '16554755',
  'imex': 'IM-15332',
  'mint': 'MINT-5218454',
  'doi': '10.1046/j.1365-2443.2002.00589',
  'rcsb pdb': '4lep'},
 {'intact': 'EBI-6941860',
  'mint': 'MINT-1984371',
  'imex': 'IM-15332-8532',
  'emdb': 'EMD-1191',
  'wwpdb': '3blr',
  'psi-mi': '"MI0471"',
  'rcsb pdb': '1JL4',
  'reactome': 'REACT_3482.1',
  'pdbe': '4bht'})