In [4]:
from common_immunogit import *

In [5]:
"""
Web Scraping Approach
"""
def create_query(domain="biomodels", offset=0, num_results=100):
    """Build the BioModels search query and the matching search-page URL.

    The query is a fixed conjunction of filters (taxonomy 9606, manually
    curated, SBML format, "Immuno-oncology" submitter keyword) joined
    with " AND ".

    Args:
        domain: Value of the ``domain`` URL parameter.
        offset: Value of the ``offset`` URL parameter (pagination start).
        num_results: Value of the ``numResults`` URL parameter (page size).

    Returns:
        A ``(query, url)`` tuple: the raw query string and the URL that
        carries it percent-encoded.
    """
    # Local import so this cell does not depend on what the notebook's
    # star import happens to expose.
    from urllib.parse import quote

    query_parts_full = {
        'mode': '*:*',
        'species': 'TAXONOMY:9606',
        'curation_status': 'curationstatus:"Manually curated"',
        'formats': 'modelformat:"SBML"',
        'kw': 'submitter_keywords:"Immuno-oncology"'
    }

    # Empty filter values are dropped so they do not produce "AND AND".
    query = " AND ".join(value for value in query_parts_full.values() if value)

    # `*` is kept literal so the URL for the built-in query is unchanged
    # from the earlier hand-rolled encoding; everything else is escaped by
    # the standard library instead of three ad-hoc replacements.
    query_for_url = quote(query, safe="*")
    # `domain` comes from the caller, so it is escaped as well.
    domain_for_url = quote(str(domain), safe="")
    url = (
        "https://www.ebi.ac.uk/biomodels/search"
        f"?query={query_for_url}&domain={domain_for_url}"
        f"&offset={offset}&numResults={num_results}"
    )

    return query, url

In [6]:
def extract_ids(file_path):
    """Collect BioModels identifiers from the text nodes of a saved HTML file.

    Args:
        file_path: Path of the HTML file to parse.

    Returns:
        A sorted list of the distinct ``BIOMD``/``MODEL`` identifiers (prefix
        followed by ten digits) found in the document's text. Any failure is
        printed and an empty list is returned instead.
    """
    id_regex = r"(BIOMD\d{10}|MODEL\d{10})"

    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            document = BeautifulSoup(handle, "html.parser")

        # Only text nodes are scanned; a set removes repeated identifiers.
        found = {
            match
            for text_node in document.find_all(string=True)
            for match in re.findall(id_regex, text_node.strip())
        }
    except Exception as e:
        print(f"Error : {e}")
        return []

    return sorted(found)

# Apply the nest_asyncio patch before asyncio.run(main()) is called later in this cell.
# NOTE(review): presumably needed because the notebook kernel already runs an event loop — confirm.
nest_asyncio.apply()

async def scrape_page(offset=0, num_results=100):
    """Render one BioModels search page in headless Chromium and extract its IDs.

    The rendered HTML is written to ``webpage_source.html`` in the current
    working directory, parsed with ``extract_ids`` and then deleted.

    Args:
        offset: Pagination offset forwarded to ``create_query``.
        num_results: Page size forwarded to ``create_query``.

    Returns:
        The list of identifiers returned by ``extract_ids`` for this page.
    """
    html_dump = "webpage_source.html"
    _, search_url = create_query(domain="biomodels", offset=offset, num_results=num_results)

    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto(search_url)
        # Wait for network activity to settle before taking the page snapshot.
        await tab.wait_for_load_state("networkidle")
        rendered_html = await tab.content()

        with open(html_dump, "w", encoding="utf-8") as handle:
            handle.write(rendered_html)

        await chromium.close()

    page_ids = extract_ids(html_dump)

    # The HTML dump is only a hand-off to extract_ids; remove it afterwards.
    if os.path.exists(html_dump):
        os.remove(html_dump)

    return page_ids

async def main():
    """Scrape every search-result page and persist the collected model IDs.

    Pages are requested 100 results at a time through ``scrape_page``.
    Scraping stops when a page contributes no identifier that has not been
    seen before (this also covers an empty page and guards against the site
    serving the same page repeatedly), or when a page yields fewer
    identifiers than the page size.

    The de-duplicated identifiers, in first-seen order, are written to
    ``<root_path>/tmp/ids_agreg.json``; the ``tmp`` directory is created when
    it does not exist yet.

    Returns:
        The list of unique identifiers collected across all pages.
    """
    offset = 0
    num_results = 100
    ids_agreg = []
    seen = set()

    while True:
        ids = await scrape_page(offset, num_results)

        # Keep only identifiers not already collected from earlier pages.
        new_ids = [model_id for model_id in ids if model_id not in seen]
        if not new_ids:
            break

        seen.update(new_ids)
        ids_agreg.extend(new_ids)

        if len(ids) < num_results:
            break

        offset += num_results

    ids_agreg_fp = os.path.join(root_path, "tmp", "ids_agreg.json")
    # Make sure the target directory exists so the dump cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(ids_agreg_fp), exist_ok=True)

    with open(ids_agreg_fp, "w") as f:
        json.dump(ids_agreg, f)

    print(f"Collected IDs : {ids_agreg}")
    print(f"Number of Collected IDs : {len(ids_agreg)}")

    return ids_agreg

# Run the scraping coroutine to completion; the IDs it returns are kept in `collected_ids`.
collected_ids = asyncio.run(main())

Collected IDs : ['BIOMD0000000741', 'BIOMD0000000742', 'BIOMD0000000743', 'BIOMD0000000744', 'BIOMD0000000746', 'BIOMD0000000748', 'BIOMD0000000749', 'BIOMD0000000751', 'BIOMD0000000752', 'BIOMD0000000753', 'BIOMD0000000754', 'BIOMD0000000756', 'BIOMD0000000757', 'BIOMD0000000758', 'BIOMD0000000759', 'BIOMD0000000760', 'BIOMD0000000764', 'BIOMD0000000766', 'BIOMD0000000767', 'BIOMD0000000768', 'BIOMD0000000769', 'BIOMD0000000770', 'BIOMD0000000778', 'BIOMD0000000780', 'BIOMD0000000781', 'BIOMD0000000782', 'BIOMD0000000791', 'BIOMD0000000798', 'BIOMD0000000801', 'BIOMD0000000802', 'BIOMD0000000812', 'BIOMD0000000813', 'BIOMD0000000877', 'BIOMD0000000879', 'BIOMD0000000880', 'BIOMD0000000885', 'BIOMD0000000886', 'BIOMD0000000888', 'BIOMD0000000891', 'BIOMD0000000894', 'BIOMD0000000900', 'BIOMD0000000904', 'BIOMD0000000908', 'BIOMD0000000909', 'BIOMD0000000910', 'BIOMD0000000911', 'BIOMD0000000912', 'BIOMD0000000913', 'BIOMD0000000919', 'BIOMD0000000921', 'BIOMD0000000926', 'BIOMD00000010

In [7]:
"""
BioServices Approach
"""

# Module-level BioModels service object; the functions below use it as the global `s`.
s = BioModels()

def get_filtered_models(query: str) -> list:
    """Page through the BioModels search results for ``query`` and return model IDs.

    Results are requested ten at a time via the global service ``s`` until a
    response carries no ``"models"`` entries. A summary line is printed.

    Args:
        query: Search query string passed to ``s.search``.

    Returns:
        The ``'id'`` value of every model returned, in response order.

    Raises:
        RuntimeError: If searching or summarising raises any exception.
    """
    page_size = 10
    collected = []

    try:
        cursor = 0
        while True:
            response = s.search(query, numResults=page_size, offset=cursor)
            page = response.get("models")

            # A missing or empty "models" entry marks the end of the results.
            if not page:
                break

            collected.extend(page)
            cursor += page_size

        summary = f"\nTotal models : {len(collected)}" if collected else "No matching models."
        print(summary)

    except Exception as e:
        raise RuntimeError(f"Error : {str(e)}")

    return [entry['id'] for entry in collected]

def get_model_metadata(model_ids: list) -> dict:
    """Fetch the metadata of each model through the global service ``s``.

    Args:
        model_ids: Identifiers to look up with ``s.get_model``.

    Returns:
        A dict mapping each identifier to whatever ``s.get_model`` returned.
        Identifiers whose lookup raised are reported on stdout and left out.
    """
    records = {}

    for identifier in model_ids:
        try:
            records[identifier] = s.get_model(identifier)
        except Exception as e:
            # Best effort: one failing model must not abort the whole batch.
            print(f"Error on {identifier}: {e}")

    return records

def save_metadata_to_json(metadata: dict, filename: str):
    """Write ``metadata`` to ``filename`` as UTF-8 JSON indented by four spaces.

    Non-ASCII characters are written as-is rather than escaped. Failures are
    not raised: the error is printed and the function returns normally.

    Args:
        metadata: JSON-serialisable mapping to store.
        filename: Destination path handed to ``open``.
    """
    try:
        with open(filename, mode='w', encoding='utf-8') as sink:
            json.dump(metadata, sink, indent=4, ensure_ascii=False)
        print(f"Metadata saved to {filename}")
    except Exception as e:
        print(f"Error : {e}")

def download_biomodels(directory: str, model_ids: list, num_per_download=100):
    """Download models in batches and consolidate them into a single zip archive.

    Each batch is fetched with ``s.search_download`` into
    ``Biomodels_<start>_to_<end>.zip`` inside ``directory``. The batch
    archives that were actually written are then merged into the first one,
    which is moved to ``biomodels_filtered.zip``; any archive of that name
    left over from an earlier run is replaced.

    Args:
        directory: Folder receiving the batch archives and the final archive.
        model_ids: Identifiers of the models to download.
        num_per_download: Batch size, between 1 and 100.

    Returns:
        Path of the consolidated ``biomodels_filtered.zip``.

    Raises:
        ValueError: If ``num_per_download`` is outside 1..100 or
            ``model_ids`` is empty.
        RuntimeError: If no batch archive could be downloaded.
    """
    if num_per_download > 100:
        raise ValueError("Maximum number of models that can be downloaded at a time is 100.")
    if num_per_download < 1:
        # Guards the batching below against a zero or negative step.
        raise ValueError("num_per_download must be at least 1.")

    total_models = len(model_ids)
    if total_models == 0:
        raise ValueError("Error : model_ids list empty.")

    downloaded = []

    batch_starts = range(0, total_models, num_per_download)
    for download_number, start in enumerate(batch_starts, start=1):
        end = min(start + num_per_download, total_models)
        batch = model_ids[start:end]

        print(f"Downloading batch {download_number}: Models {start + 1} to {end}")

        fname = os.path.join(directory, f"Biomodels_{start + 1}_to_{end}.zip")

        # Remove a leftover from a previous run so a failed download cannot
        # be mistaken for a fresh archive.
        if os.path.isfile(fname):
            os.remove(fname)

        try:
            s.search_download(batch, output_filename=fname)
        except Exception as e:
            print(f"Error downloading batch {download_number}: {str(e)}")
            continue

        # Only archives that exist on disk take part in the merge.
        if os.path.isfile(fname):
            downloaded.append(fname)
            print(f"Downloaded models {start + 1} to {end} into {fname}")
        else:
            print(f"Error downloading batch {download_number}: no archive written to {fname}")

    if not downloaded:
        raise RuntimeError("No batch could be downloaded; nothing to consolidate.")

    final_zip = os.path.join(directory, "biomodels_filtered.zip")
    merged, *others = downloaded

    # Append the members of every other batch archive to the first one.
    with z.ZipFile(merged, 'a') as z1:
        for fname in others:
            with z.ZipFile(fname, 'r') as zf:
                for n in zf.namelist():
                    z1.writestr(n, zf.read(n))

    # os.replace overwrites an existing final archive, so a re-run never
    # returns stale content or leaves the merged batch file behind.
    os.replace(merged, final_zip)

    for fname in others:
        try:
            os.remove(fname)
        except Exception:
            print(f"Could not delete temporary file: {fname}")

    print(f"All models consolidated into {final_zip}")
    return final_zip

def bioservices_get_models():
    """Run the BioServices pipeline: search, save metadata, download archives.

    The query from ``create_query`` is searched with ``get_filtered_models``,
    the metadata of the matching models is written to
    ``md_path / "model_metadata.json"``, and the models are downloaded into
    ``bm_sbml_path``. Any exception raised by those steps is printed rather
    than propagated.
    """
    search_query, _unused_url = create_query()

    try:
        matching_ids = get_filtered_models(search_query)
        save_metadata_to_json(
            get_model_metadata(matching_ids),
            md_path / "model_metadata.json",
        )
        archive_path = download_biomodels(
            directory=bm_sbml_path,
            model_ids=matching_ids,
            num_per_download=100,
        )
        print(f"Models downloaded and saved in {archive_path}")
    except Exception as e:
        print(f"Error : {e}")


[32mINFO    [bioservices.BioModels:363]: [0m [32mInitialising BioModels service (REST)[0m


In [8]:
# Run the BioServices pipeline defined above: search, save metadata, download the archive.
bioservices_get_models()


Total models : 68
Metadata saved to /Users/guillaume.souede/PycharmProjects/immunogit/metadata/model_metadata.json
Downloading batch 1: Models 1 to 68


[32mINFO    [bioservices.BioModels:240]: [0m [32m/Users/guillaume.souede/PycharmProjects/immunogit/models/BioModels/SBML/Biomodels_1_to_68.zip[0m


Downloaded models 1 to 68 into /Users/guillaume.souede/PycharmProjects/immunogit/models/BioModels/SBML/Biomodels_1_to_68.zip
All models consolidated into /Users/guillaume.souede/PycharmProjects/immunogit/models/BioModels/SBML/biomodels_filtered.zip
Models downloaded and saved in /Users/guillaume.souede/PycharmProjects/immunogit/models/BioModels/SBML/biomodels_filtered.zip


In [9]:
# Show the metadata directory that model_metadata.json is written to.
print(md_path)

/Users/guillaume.souede/PycharmProjects/immunogit/metadata
