In [56]:
from xml.dom import minidom
from microbio.formats.fasta import FastaWriter

In [3]:
# Parse the DisProt XML file (file name indicates release 6.02) into a DOM document.
xmldoc = minidom.parse("../disprot/disprot_v6.02.xml")

In [18]:
# Accumulator for parsed protein records; each entry is a dict with "name" and "regions".
prots = []

In [19]:
def get_val(parent, tagname):
    """Return the text of the first ``tagname`` element found under ``parent``.

    Args:
        parent: DOM node (document or element) whose descendants are searched.
        tagname: Tag name to look up; only the first match is used.

    Returns:
        The ``nodeValue`` of the matched element's first child node, or
        ``None`` when no element with that tag exists or the element is empty.
    """
    try:
        return parent.getElementsByTagName(tagname)[0].firstChild.nodeValue
    except (AttributeError, IndexError):
        # AttributeError: the element is empty, so ``firstChild`` is None.
        # IndexError: no descendant carries ``tagname`` at all.
        return None

In [20]:
# Build one record per <protein>: its name plus a list of region dicts
# (type, sequence, detection methods, PDB ids).
# The list is recreated here so that re-running this cell does not append a
# second copy of every protein to `prots`.
prots = []
for xmlprot in xmldoc.getElementsByTagName("protein"):
    xmlgeneral = xmlprot.getElementsByTagName("general")[0]
    prot = {"name": get_val(xmlgeneral, "name"), "regions": []}
    xmlregions = xmlprot.getElementsByTagName("regions")[0]
    for xmlregion in xmlregions.getElementsByTagName("region"):
        xmlmethods = xmlregion.getElementsByTagName("detection_methods")[0]
        xmlpdbs = xmlregion.getElementsByTagName("pdbs")[0]
        region = {
            "type": get_val(xmlregion, "type"),
            "seq": get_val(xmlregion, "sequence"),
            # The container and the loop variable use distinct names so the
            # container is not shadowed while it is being iterated.
            "detection": [
                get_val(xmlmethod, "method")
                for xmlmethod in xmlmethods.getElementsByTagName("detection")
            ],
            "pdbs": [
                get_val(xmlpdb, "id")
                for xmlpdb in xmlpdbs.getElementsByTagName("pdb")
            ],
        }
        prot["regions"].append(region)
    prots.append(prot)


In [21]:
# Preview only the first few records; echoing the whole list floods the output.
prots[:3]

[{'name': '60S acidic ribosomal protein P1-B',
  'regions': [{'detection': [],
    'pdbs': [],
    'seq': 'MSTEASVSYAALILADAEQEITSEKLLAITKAAGANVDQVWADVFAKAVEGKNLKELLFSFAAAAPASGAAAGSASGAAAGGEAAAEEAAEEEAAEESDDDMGFGLFD',
    'type': 'Disordered'}]},
 {'name': '60S acidic ribosomal protein P2-beta',
  'regions': [{'detection': ['Circular dichroism (CD) spectroscopy, far-UV',
     'Nuclear magnetic resonance (NMR)',
     'Analytical ultracentrifugation',
     'Hydrogen-deuterium exchange',
     'Stability at thermal extremes'],
    'pdbs': [],
    'seq': 'MKYLAAYLLLVQGGNAAPSAADIKAVVESVGAEVDEARINELLSSLEGKGSLEEIIAEGQKKFATVPTGGASSAAAGAAGAAAGGDAAEEEKEEEAKEESDDDMGFGLFD',
    'type': 'Disordered - Molten Globule'}]},
 {'name': 'Early E2A DNA-binding protein',
  'regions': [{'detection': ['X-ray crystallography'],
    'pdbs': ['1ADT', '1ADV', '1ADV', '1ANV'],
    'seq': 'KPGHAP',
    'type': 'Disordered - Extended'},
   {'detection': [],
    'pdbs': ['1ADT', '1ADV', '1ADV', '1ANV'],
    'seq': 'SN

In [82]:
# Keep, per protein, only the regions that
#   * have a sequence of at least MIN_REGION_LEN residues,
#   * are annotated with a type containing "disordered" (case-insensitive),
#   * have no associated PDB entries.
# Proteins left without any region are dropped.
MIN_REGION_LEN = 20

disprot_selected = []
for prot in prots:
    kept_regions = [
        region
        for region in prot["regions"]
        if region["seq"] is not None
        and len(region["seq"]) >= MIN_REGION_LEN
        # `type` is None when the XML element is empty; guard before lower().
        and region["type"] is not None
        and "disordered" in region["type"].lower()
        # Disabled detection-method criteria, kept for reference:
        #   'X-ray crystallography' in region["detection"]
        #   'Nuclear magnetic resonance (NMR)' not in region["detection"]
        and not region["pdbs"]
    ]
    if kept_regions:
        # Shallow copy so the records in `prots` keep their full region lists.
        new_prot = dict(prot)
        new_prot["regions"] = kept_regions
        disprot_selected.append(new_prot)

In [88]:
# Number of proteins that still have at least one region after filtering.
len(disprot_selected)

398

In [93]:
# Total number of regions kept across all selected proteins.
sum(len(selected["regions"]) for selected in disprot_selected)

520

In [96]:
# Create the FASTA writer targeting ../disprot_selected.fa.
fw = FastaWriter("../disprot_selected.fa")

In [97]:
# Emit one FASTA entry per selected region; the header is
# "<protein name>; <region type>" and the body is the region sequence.
for selected in disprot_selected:
    protein_name = selected["name"]
    for kept_region in selected["regions"]:
        header = "; ".join((protein_name, kept_region["type"]))
        fw.write_entry(header, kept_region["seq"])
    

In [98]:
# Close the FASTA writer once all entries have been written.
fw.close()