In [1]:
import os
import xml.etree.cElementTree as ET
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

import requests

In [4]:
def download_data(url, timeout=60):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Parameters
    ----------
    url : str
        Address to download.
    timeout : float or tuple, optional
        Seconds ``requests`` may wait on the server before giving up
        (forwarded unchanged to ``requests.get``). Without a timeout a
        stalled server would block the calling worker thread indefinitely.

    Returns
    -------
    str
        The decoded response body (``Response.text``).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx or 5xx status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    r = requests.get(url, timeout=timeout)
    # raise_for_status() is a no-op for successful responses, so no explicit
    # status-code comparison is needed before calling it.
    r.raise_for_status()
    return r.text

def find_species_element(species, root):
    """Return the first direct ``species`` child of ``root`` matching ``species``.

    Parameters
    ----------
    species : str
        Name to look for; compared against each element's leading text
        after surrounding whitespace is stripped.
    root : xml.etree.ElementTree.Element
        Element whose direct ``species`` children are searched.

    Returns
    -------
    xml.etree.ElementTree.Element or None
        The matching element, or ``None`` when no element matches.
    """
    for element in root.findall('species'):
        # ``text`` is None when the tag has no text before its first child
        # (or is empty); treat that as an empty name instead of crashing.
        if (element.text or '').strip() == species:
            return element
    return None

def update_scheme(scheme, outdir, xml_element):
    """Download one scheme's files and write them into ``outdir``.

    Two files are produced:

    * ``<scheme>.fsa`` - the downloads of every ``*//locus/url`` entry
      under ``xml_element``, concatenated in document order.
    * ``<scheme>.tsv`` - the download of the ``*//profiles/url`` entry.

    All downloads are completed *before* any output file is opened, so a
    failed request leaves existing files untouched instead of truncated.

    Parameters
    ----------
    scheme : str
        Base name (without extension) of the output files.
    outdir : pathlib.Path
        Destination directory; created if it does not exist yet.
    xml_element : xml.etree.ElementTree.Element
        Element holding the ``locus`` and ``profiles`` URL entries
        (presumably the ``species`` element for this scheme).

    Raises
    ------
    ValueError
        If ``xml_element`` is ``None`` or has no ``profiles/url`` entry.
    """
    if xml_element is None:
        raise ValueError('no XML element given for scheme {!r}'.format(scheme))

    profiles_url_element = xml_element.find('*//profiles/url')
    if profiles_url_element is None:
        raise ValueError('no profiles url found for scheme {!r}'.format(scheme))

    # Fetch everything first; an exception here aborts before any file is touched.
    locus_data = [
        download_data(url_element.text)
        for url_element in xml_element.findall('*//locus/url')
    ]
    profiles_data = download_data(profiles_url_element.text)

    outdir.mkdir(parents=True, exist_ok=True)
    with open(outdir/(scheme + '.fsa'), 'w') as f:
        f.writelines(locus_data)
    with open(outdir/(scheme + '.tsv'), 'w') as f:
        f.write(profiles_data)

In [5]:
# Path of the XML index that is searched for <species> entries.
xml = '/media/GenomicResearch/Tools/CGE/dbases.xml'
# Parse the file once; `root` is the document's top-level element.
tree = ET.ElementTree(file=xml)
root = tree.getroot()

In [6]:
config = '/media/GenomicResearch/Tools/CGE/mlst_db/config'

# Each non-comment line is tab-separated; only the first two columns are kept
# (they are later unpacked as ``scheme, species``). Blank or one-column lines
# are dropped so that unpacking cannot fail on them.
with open(config) as f:
    mlst_db_config = [
        fields[:2]
        for fields in (
            line.strip().split('\t') for line in f if not line.startswith('#')
        )
        if len(fields) >= 2
    ]

In [8]:
outpath = Path('/media/GenomicResearch/Tools/CGE/mlst_db')

submitted = []   # (scheme, Future) for every download job handed to the pool
failures = []    # (scheme, exception) for every scheme that could not be updated

with ThreadPoolExecutor(16) as executor:
    for scheme, species in mlst_db_config:
        outdir = outpath/scheme
        species_element = find_species_element(species, root)
        if species_element is None:
            # Do not submit a job that is bound to fail inside the worker.
            failures.append(
                (scheme, LookupError('species {!r} not found in the XML index'.format(species)))
            )
            continue
        submitted.append(
            (scheme, executor.submit(update_scheme, scheme, outdir, species_element))
        )

# Leaving the ``with`` block waits for all jobs. An exception raised inside a
# worker is stored on its Future and stays invisible unless it is queried.
for scheme, future in submitted:
    error = future.exception()
    if error is not None:
        failures.append((scheme, error))

if failures:
    for scheme, error in failures:
        print('{}: {!r}'.format(scheme, error))
    raise RuntimeError(
        '{} of {} MLST schemes failed to update'.format(len(failures), len(mlst_db_config))
    )