In [218]:
import collections
import re

In [219]:
# Collectors for sources the converters below do not recognise (source -> id):
# getReferences fills newReference, getIdentifier fills newIdentifier and
# getConfidence fills newScore.
newReference, newIdentifier, newScore = {}, {}, {}

In [220]:
def getReferences(term):
    """
    Convert one "source:id" publication reference into an MCF property line.

    Recognised sources produce a string such as 'pubMedID: "16554755"'.
    Any other source is stored in the module-level newReference dict
    (source -> id) and None is returned.
    """
    # Everything after the first ":" is the id, so ids may contain colons.
    source, _, idNum = term.partition(":")
    propertyBySource = {
        "pubmed": "pubMedID",
        "imex": "imexID",
        "mint": "mintID",
        "doi": "digitalObjectID",
        "rcsb pdb": "rcsbPDBID",
    }
    if source not in propertyBySource:
        newReference[source] = idNum
        return None
    return propertyBySource[source] + ": " + "\"" + idNum + "\""
    
    
    
def getIdentifier(term):
    """
    Convert one "source:id" interaction identifier into an MCF property line.

    Recognised sources produce a string such as 'intActID: "EBI-6941860"'.
    For "psi-mi" the first and last characters of the id (its surrounding
    quotes in the sample data) are removed. Any other source is stored in the
    module-level newIdentifier dict (source -> id) and None is returned.
    """
    source, _, idNum = term.partition(":")
    propertyBySource = {
        "intact": "intActID",
        "mint": "mintID",
        "imex": "imexID",
        "emdb": "electronMicroscopyDataBankID",
        "wwpdb": "worldWideProteinDataBankID",
        "rcsb pdb": "rcsbPDBID",
        "psi-mi": "psimiID",
        "reactome": "reactomePathwayID",
        "pdbe": "proteinDataBankInEuropeID",
    }
    propertyName = propertyBySource.get(source)
    if propertyName is None:
        newIdentifier[source] = idNum
        return None
    if source == "psi-mi":
        # psi-mi ids arrive wrapped in double quotes, e.g. "MI:0471"
        idNum = idNum[1:-1]
    return propertyName + ": " + "\"" + idNum + "\""
    
def getConfidence(term):
    """
    Convert one "source:value" confidence entry into a bracketed MCF quantity.

    "author score" and "intact-miscore" produce e.g. '[0.55 dcs:IntactMiScore]'.
    Any other source is stored in the module-level newScore dict
    (source -> value) and None is returned.
    """
    source, _, idNum = term.partition(":")
    if source == "author score":
        unit = " dcs:AuthorScore"
    elif source == "intact-miscore":
        unit = " dcs:IntactMiScore"
    else:
        newScore[source] = idNum
        return None
    return "[" + idNum + unit + "]"

    

In [221]:
def getPubicationUrl(pubId):
    """
    Return the URL for a "source:id" publication reference, or "" if none applies.

    Args:
        pubId: string such as "pubmed:16554755"; everything after the first
            ":" is treated as the id.

    Returns:
        The URL string for pubmed, doi and rcsb pdb references. An empty
        string for imex and mint (their sites do not resolve these ids
        properly) and for any unrecognised source.
    """
    # IntAct website where the imex applies to doesn't work properly
    # mint publication number doesn't work properly
    pub, _, idNum = pubId.partition(":")
    if pub == "pubmed":
        return "https://pubmed.ncbi.nlm.nih.gov/" + idNum + "/"
    if pub == "doi":
        # NOTE(review): ".x" is appended unconditionally; presumably the DOIs in
        # the input are missing this suffix -- confirm against the data.
        return "https://doi.org/" + idNum + ".x"
    if pub == "rcsb pdb":
        return "https://www.rcsb.org/structure/" + idNum
    # imex, mint and unknown sources: honour the documented contract and return
    # "" (the previous version fell through and returned None for unknown ones).
    return ""
        
    
def getIdentifierUrl(sourceId):
    """
    Return the URL (or dcs reference) for a "source:id" interaction identifier.

    "psi-mi" ids are mapped through the module-level psimi2dcid dict and
    returned as "dcs:<dcid>". Sources without a usable URL (imex, reactome,
    anything unrecognised) yield an empty string.
    """
    # "imex" identifier url doesn't work, so it is deliberately absent below
    source, _, idNum = sourceId.partition(":")
    if source == "psi-mi":
        # strip the first/last character (the quotes in the sample data) before lookup
        return "dcs:" + psimi2dcid[idNum[1:-1]]
    urlPrefixBySource = {
        "intact": "https://www.ebi.ac.uk/intact/interaction/",
        "mint": "https://mint.bio.uniroma2.it/index.php/detailed-curation/?id=",
        "emdb": "https://www.ebi.ac.uk/pdbe/entry/emdb/",
        "wwpdb": "https://www.rcsb.org/structure/",
        "rcsb pdb": "https://www.rcsb.org/structure/",
        "pdbe": "https://www.ebi.ac.uk/pdbe/entry/pdb/",
    }
    if source in urlPrefixBySource:
        return urlPrefixBySource[source] + idNum
    return ""
        
        
        

In [222]:
def getProteinDcid(mintAliases):
    """
    Derive the protein dcid from a MINT aliases string.

    The name is taken from the first "|"-separated alias: the text between its
    first ":" and the following "(", upper-cased.
    NOTE(review): this assumes the first alias is the display_long entry --
    confirm against the data.

    Returns None when the string is at most one character long; a
    self-interacting protein has one empty participant, denoted by "-".
    """
    if len(mintAliases) <= 1:
        return None
    firstAlias = mintAliases.split("|")[0]
    nameWithType = firstAlias.split(":")[1]
    return nameWithType.split("(")[0].upper()

In [223]:
def checkUniprot(alias):
    """
    Tell whether a participant id is acceptable for import.

    True when the value is a single character (the "-" placeholder) or when
    the text before its first ":" is "uniprotkb"; False otherwise.
    """
    if len(alias) == 1:
        return True
    return alias.partition(":")[0] == "uniprotkb"

In [224]:
def checkDcid(alias):
    """
    Classify the aliases string of one interaction participant.

    Args:
        alias: "|"-separated entries shaped like "db:value(type)", or a
            single character ("-") for an empty participant.

    Returns:
        1 if alias is a single character, or if its "display_long" value is
          well formed: only word characters (letters, digits, "_") and
          exactly two parts separated by "_".
        0 if a "display_long" value is present but malformed.
        2 if there is no "display_long" entry at all.

    Raises:
        IndexError: if an entry lacks "(" or ":".
    """
    if len(alias) == 1:
        return 1
    aliasDic = {}
    for ali in alias.split("|"):
        # "db:value(type)" -> key "type", value "value"
        key = ali.split("(")[1][:-1]
        value = ali.split("(")[0].split(":")[1]
        aliasDic[key] = value
    if "display_long" not in aliasDic:
        return 2
    dcid = aliasDic["display_long"]
    # Raw string: "\W" in a normal literal is an invalid escape sequence.
    if re.search(r"\W", dcid) is not None or len(dcid.split("_")) != 2:
        return 0
    # Previously this path fell off the end and returned None; callers only
    # compare against 0, so an explicit 1 is compatible and matches the docs.
    return 1
        

In [225]:
# Scratch accumulators for surveying confidence sources:
# source -> list of collected values, and source -> occurrence count.
confidenceSet, confidenceSetCount = (
    collections.defaultdict(list),
    collections.defaultdict(int),
)

In [226]:
def _psimiFieldToDcid(field):
    """Look up the dcid for a PSI-MI column value shaped like psi-mi:"MI:0676"(name)."""
    # The text between the opening quote and "(" is the id plus its closing
    # quote; [:-1] drops that quote before the psimi2dcid lookup.
    return psimi2dcid[field.split(":\"")[1].split("(")[0][:-1]]


def _convertItems(items, converter):
    """Apply converter to every non-empty item, dropping results that are None or empty."""
    converted = (converter(item) for item in items if item != "")
    return [piece for piece in converted if piece]


def getSchemaFromText(term):
    """
    Build the MCF node text for one interaction record.

    Args:
        term: list of column values of one tab-separated record. Columns used:
            4, 5  participant aliases (dcid source),
            6     interaction detection method,
            8     publication references ("|"-separated),
            11    interaction type,
            12    interaction source,
            13    interaction identifiers ("|"-separated),
            14    confidence scores ("|"-separated, "-" when absent).

    Returns:
        The node as newline-joined MCF lines, for example:
            Node: dcid:bio/RPN1_YEAST_RPN3_YEAST
            typeOf: ProteinProteinInteraction
            name: "RPN1_YEAST_RPN3_YEAST"
            interactingProtein: dcs:bio/UniProt_RPN1_YEAST,dcs:bio/UniProt_RPN3_YEAST
            interactionDetectionMethod: dcs:TandemAffinityPurification
            interactionType: dcs:PhysicalAssociation
            interactionSource: dcs:Mint
            intActID: "EBI-6941860"
            confidenceScore: [0.76 dcs:IntactMiScore]
            pubMedID: "16554755"

    Raises:
        IndexError: if term has fewer than 15 columns, a PSI-MI column is not
            in the expected shape, or neither participant yields a dcid.
        KeyError: if a PSI-MI id is missing from psimi2dcid.
    """
    proteins = [
        protein
        for protein in (getProteinDcid(term[4]), getProteinDcid(term[5]))
        if protein
    ]
    detectionMethod = _psimiFieldToDcid(term[6])
    references = term[8].split("|")
    interactionType = _psimiFieldToDcid(term[11])
    interactionSource = _psimiFieldToDcid(term[12])
    identifiers = term[13].split("|")
    confidences = [] if term[14] == "-" else term[14].split("|")

    # A self-interaction has a single participant, which is then used twice.
    dcid = proteins[0] + "_" + proteins[-1]

    schemaPieceList = [
        "Node: dcid:bio/" + dcid,
        "typeOf: ProteinProteinInteraction",
        "name: " + "\"" + dcid + "\"",
        "interactingProtein: " + ",".join("dcs:bio/UniProt_" + protein for protein in proteins),
        "interactionDetectionMethod: dcs:" + detectionMethod,
        "interactionType: dcs:" + interactionType,
        "interactionSource: dcs:" + interactionSource,
    ]

    # Output order is kept from the original: identifiers, confidence, references.
    schemaPieceList.extend(_convertItems(identifiers, getIdentifier))

    # getConfidence returns None for unknown score sources. Joining that None
    # used to raise TypeError and drop the whole record; such scores are now
    # skipped, the same way unknown references and identifiers are.
    scores = _convertItems(confidences, getConfidence)
    if scores:
        schemaPieceList.append("confidenceScore: " + ",".join(scores))

    schemaPieceList.extend(_convertItems(references, getReferences))

    return "\n".join(schemaPieceList)

In [227]:
# Display the unknown-source collectors (reference, identifier, score sources).
newReference,newIdentifier,newScore

({}, {}, {})

In [228]:
# Spot-check: render the MCF node for the record at index 6 of `lines`.
# NOTE(review): `lines` and `psimi2dcid` are assigned in cells further down, so
# this cell fails on a fresh kernel run from top to bottom.
print(getSchemaFromText(lines[6].split('\t')))

Node: dcid:bio/PRS10_YEAST_RPN3_YEAST
typeOf: ProteinProteinInteraction
name: "PRS10_YEAST_RPN3_YEAST"
interactingProtein: dcs:bio/UniProt_PRS10_YEAST,dcs:bio/UniProt_RPN3_YEAST
interactionDetectionMethod: dcs:TandemAffinityPurification
interactionType: dcs:PhysicalAssociation
interactionSource: dcs:Mint
intActID: "EBI-6941956"
mintID: "MINT-1984479"
imexID: "IM-15332-4121"
confidenceScore: [0.55 dcs:IntactMiScore]
pubMedID: "16554755"
imexID: "IM-15332"
mintID: "MINT-5218454"


In [229]:
# Raw MINT export, read whole into `file`.
with open('./mint_database', 'r') as mintHandle:
    file = mintHandle.read()

# Paired PSI-MI id / dcid text, generated from the EBI MI Ontology.
with open('../proteinInteractionEBI/psimi2dcid.txt', 'r') as mappingHandle:
    p2d = mappingHandle.read()

In [230]:
# One record per list item; later loops split each on tabs and skip empty strings.
lines = file.split('\n')

In [231]:
# Build the PSI-MI id -> dcid lookup from the "<psi-mi id>: <dcid>" lines in p2d.
# p2d is left untouched so the cell can be re-run: the previous version rebound
# it to a list, which made a second run fail on p2d.split.
psimi2dcid = {}
for mappingLine in p2d.split("\n"):
    if not mappingLine.strip():
        # a blank line (e.g. after a trailing newline) has no ": " part
        continue
    mappingParts = mappingLine.split(": ")
    psimi2dcid[mappingParts[0]] = mappingParts[1]

In [232]:
# Load the hand-written schema header and replace curly double quotes with
# plain ASCII ones.
with open('schemaMCF.mcf', 'r') as schemaHandle:
    schema = schemaHandle.read().replace("“", '"').replace("”", '"')

In [233]:
# Convert every record into an MCF node. schemaList starts with the schema
# header text; records that cannot be imported go into one of three lists.
schemaList = [schema]
wrongDcid = []   # a participant has a malformed display_long name
failed = []      # getSchemaFromText raised for the record
noUniprot = []   # a participant id is not a uniprotkb id
for line in lines:

    if len(line) == 0:
        continue

    term = line.split('\t')

    # check if record has correct UniProt Protein Name
    c1, c2 = checkDcid(term[4]), checkDcid(term[5])
    if c1 == 0 or c2 == 0:
        wrongDcid.append(line)
        continue

    # check if record has Uniprot Identifier
    u1, u2 = checkUniprot(term[0]), checkUniprot(term[1])
    if not u1 or not u2:
        noUniprot.append(line)
        continue

    try:
        # A separate name keeps `schema` (the header) intact, so re-running
        # this cell still seeds schemaList with the header and not a record.
        recordSchema = getSchemaFromText(term)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still stop a long run.
        failed.append(line)
        continue

    if recordSchema:
        schemaList.append(recordSchema)



In [234]:
# Total entries: the schema header plus one entry per imported record.
len(schemaList)

129586

In [235]:
# Write only the first 20 entries of schemaList (header included), separated by
# blank lines. NOTE(review): presumably a small preview file -- confirm the 20.
schemaEnumText = "\n\n".join(schemaList[0:20])
with open('BioMINTSchema.mcf', 'w') as previewHandle:
    previewHandle.write(schemaEnumText)

In [236]:
# Number of imported records: schemaList minus its first entry, the schema header.
len(schemaList)-1

129585

In [237]:
# Records that were not imported, printed per reason in the order
# wrongDcid, noUniprot, failed; fCount holds their total.
fCount = 0
for skippedGroup in (wrongDcid, noUniprot, failed):
    groupSize = len(skippedGroup)
    print(groupSize)
    fCount = fCount + groupSize

3540
42
0


In [238]:
# Spot-check one generated entry (index 20 of schemaList).
print(schemaList[20])

Node: dcid:bio/RPN8_YEAST_RPN3_YEAST
typeOf: ProteinProteinInteraction
name: "RPN8_YEAST_RPN3_YEAST"
interactingProtein: dcs:bio/UniProt_RPN8_YEAST,dcs:bio/UniProt_RPN3_YEAST
interactionDetectionMethod: dcs:TandemAffinityPurification
interactionType: dcs:PhysicalAssociation
interactionSource: dcs:Mint
intActID: "EBI-6942560"
mintID: "MINT-1984713"
imexID: "IM-15332-3714"
confidenceScore: [0.70 dcs:IntactMiScore]
pubMedID: "16554755"
imexID: "IM-15332"
mintID: "MINT-5218454"


In [239]:
# The whole schema is too large to upload to the dev browser at once, so it is
# written as consecutive files BioMINTSchema_part<N>.mcf of at most
# partSize entries each (3 parts for the run shown here).
partSize = 44375
count = 1
for start in range(0, len(schemaList), partSize):
    partName = 'BioMINTSchema_part' + str(count) + '.mcf'
    schemaEnumText = "\n\n".join(schemaList[start:start + partSize])
    with open(partName, 'w') as partHandle:
        partHandle.write(schemaEnumText)
    count += 1

In [240]:
# Show all the publications and identifier source examples.
# publications / identifier keep the first id seen per source;
# pCount / iCount count how often each source occurs.

publications = {}
identifier = {}
pCount = collections.defaultdict(int)
iCount = collections.defaultdict(int)
for line in lines:
    if len(line) == 0:
        continue
    term = line.split('\t')
    # The former try/bare-except around this split is gone: its handler
    # re-evaluated term[8], so it could only re-raise and leave `ps` stale.
    for p in term[8].split("|"):
        pParts = p.partition(":")
        # pParts[2] keeps any further ":" inside the id; the previous "".join
        # dropped them (e.g. "MI:0471" was shown as "MI0471").
        publications.setdefault(pParts[0], pParts[2])
        pCount[pParts[0]] += 1
    for i in term[13].split("|"):
        iParts = i.partition(":")
        identifier.setdefault(iParts[0], iParts[2])
        iCount[iParts[0]] += 1


In [16]:
# Display the first example id recorded for each publication / identifier source.
# NOTE(review): this cell's execution count is lower than that of the cell
# building these dicts, so the output below may be stale -- re-run to refresh.
publications, identifier

({'pubmed': '16554755',
  'imex': 'IM-15332',
  'mint': 'MINT-5218454',
  'doi': '10.1046/j.1365-2443.2002.00589',
  'rcsb pdb': '4lep'},
 {'intact': 'EBI-6941860',
  'mint': 'MINT-1984371',
  'imex': 'IM-15332-8532',
  'emdb': 'EMD-1191',
  'wwpdb': '3blr',
  'psi-mi': '"MI0471"',
  'rcsb pdb': '1JL4',
  'reactome': 'REACT_3482.1',
  'pdbe': '4bht'})

In [18]:
# Publication sources encountered in the data.
publications.keys()

dict_keys(['pubmed', 'imex', 'mint', 'doi', 'rcsb pdb'])

In [19]:
# Identifier sources encountered in the data.
identifier.keys()

dict_keys(['intact', 'mint', 'imex', 'emdb', 'wwpdb', 'psi-mi', 'rcsb pdb', 'reactome', 'pdbe'])

In [20]:
# Occurrences of each publication source across all records.
pCount

defaultdict(int,
            {'pubmed': 133167,
             'imex': 90323,
             'mint': 109775,
             'doi': 33610,
             'rcsb pdb': 9})

In [21]:
# Occurrences of each identifier source across all records.
iCount

defaultdict(int,
            {'intact': 133167,
             'mint': 96364,
             'imex': 90318,
             'emdb': 3,
             'wwpdb': 33,
             'psi-mi': 4363,
             'rcsb pdb': 71,
             'reactome': 27,
             'pdbe': 1})

In [49]:
# Inspect the values collected under 'author score'.
# NOTE(review): no active code visible here appends to confidenceSet, so on a
# fresh run this shows an empty list; the output below presumably comes from
# an earlier kernel state.
confidenceSet['author score']

[['99.00|intact-miscore', '0.37'],
 ['94.37|intact-miscore', '0.83'],
 ['99.00|intact-miscore', '0.37'],
 ['99.00|intact-miscore', '0.82'],
 ['99.00|intact-miscore', '0.55'],
 ['97.54|intact-miscore', '0.37'],
 ['99.00|intact-miscore', '0.57'],
 ['99.00|intact-miscore', '0.60'],
 ['99.00|intact-miscore', '0.88'],
 ['99.00|intact-miscore', '0.75'],
 ['99.00|intact-miscore', '0.55'],
 ['99.00|intact-miscore', '0.55'],
 ['98.73|intact-miscore', '0.87'],
 ['98.75|intact-miscore', '0.37'],
 ['99.00|intact-miscore', '0.57'],
 ['99.00|intact-miscore', '0.67'],
 ['99.00|intact-miscore', '0.55'],
 ['99.35|intact-miscore', '0.37'],
 ['99.00|intact-miscore', '0.37'],
 ['99.00|intact-miscore', '0.37'],
 ['99.00|intact-miscore', '0.76'],
 ['99.00|intact-miscore', '0.37'],
 ['99.00|intact-miscore', '0.37'],
 ['99.25|intact-miscore', '0.37'],
 ['99.00|intact-miscore', '0.37'],
 ['99.25|intact-miscore', '0.37'],
 ['99.00|intact-miscore', '0.37'],
 ['99.34|intact-miscore', '0.37'],
 ['99.34|intact-misc

In [37]:
# Per-source counts of confidence entries.
# NOTE(review): no active code visible here increments confidenceSetCount, so
# on a fresh run it is empty -- confirm before relying on the output below.
confidenceSetCount

defaultdict(int, {'intact-miscore': 125686, 'author score': 3872})