File for testing purposes only.

In [1]:
import os
import subprocess
import sys
import zipfile as z
import requests
import json

In [2]:
try:
    from bioservices import BioModels
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "bioservices"])
    from bioservices import BioModels

s = BioModels()

[32mINFO    [bioservices.BioModels:363]: [0m [32mInitialising BioModels service (REST)[0m


Creating directory /Users/guillaume.souede/Library/Caches/bioservices 


In [7]:
current_path = os.getcwd()
root_path = os.path.abspath(os.path.join(current_path, ".."))

os.makedirs(os.path.join(root_path, "models/BioModels/SBML"), exist_ok=True)
os.makedirs(os.path.join(root_path, "models/Reactome/SBML"), exist_ok=True)
os.makedirs(os.path.join(root_path, "models/Reactome/SBGN"), exist_ok=True)
os.makedirs(os.path.join(root_path, "metadata"), exist_ok=True)
print("Structure : Done.")

Structure : Done.


In [2]:
def create_query() -> str:
    query_parts_full = {
        'mode': '*:*',
        'species': 'TAXONOMY:9606',
        'curation_status': 'curationstatus:"Manually curated"',
        'formats': 'modelformat:"SBML"',
        'kw': 'submitter_keywords:"Immuno-oncology"'
    }

    query_parts = [value for value in query_parts_full.values() if value]
    query = " AND ".join(query_parts)

    query_parts_simple = {
        key: value.split(":")[1] if ":" in value else value
        for key, value in query_parts_full.items()
    }

    query_simple = " AND ".join(query_parts_simple.values())

    return query, query_simple

q, sq = create_query()
print(q)
print(sq)

*:* AND TAXONOMY:9606 AND curationstatus:"Manually curated" AND modelformat:"SBML" AND submitter_keywords:"Immuno-oncology"
* AND 9606 AND "Manually curated" AND "SBML" AND "Immuno-oncology"


Final :

In [3]:
def get_ids() -> list:
    query, sq = create_query()

    print(f"Generated query: {query}")

    try:
        search_results = s.search(query)

        if not search_results or 'models' not in search_results or not search_results['models']:
            print("No models found.")
            return []

        model_ids = [model['id'] for model in search_results['models']]
        return model_ids

    except Exception as e:
        print(f"Error during search: {e}")
        return []

In [4]:
def download_biomodels(directory: str, model_ids: list):
    filenames = []
    num_per_download = 100

    if not os.path.exists(directory):
        os.makedirs(directory)

    num_downloads = (len(model_ids) + num_per_download - 1) // num_per_download

    start = 0
    for download_number in range(1, num_downloads + 1):
        end = min(start + num_per_download, len(model_ids))
        fname = os.path.join(directory, f"Biomodels_{start+1}-{end}.zip")
        filenames.append(fname)

        if os.path.isfile(fname):
            os.remove(fname)

        current_batch_ids = model_ids[start:end]

        try:
            s.search_download(current_batch_ids, output_filename=fname)
            print(f"Biomodels models from ID {start+1} to {end} saved to {fname}")
        except Exception as e:
            print(f"Error downloading batch {start+1}-{end}: {e}")
            continue

        start = end

    # Consolidation
    consolidated_zip = os.path.join(directory, "biomodels.zip")
    with z.ZipFile(consolidated_zip, 'w') as zf_out:
        for fname in filenames:
            try:
                with z.ZipFile(fname, 'r') as zf_in:
                    for file_name in zf_in.namelist():
                        zf_out.writestr(file_name, zf_in.read(file_name))
            except Exception as e:
                print(f"Error consolidating {fname}: {e}")

    for fname in filenames:
        if os.path.isfile(fname):
            os.remove(fname)

    print(f"All models consolidated into {consolidated_zip}")
    return consolidated_zip

Using the code to answer :

In [20]:
ACTIVATE_DOWNLOAD = True
query, sq = create_query()
model_ids = get_ids()
print(f"Model IDs: {model_ids}")
print(f"Found {len(model_ids)} models matching the query.")
if ACTIVATE_DOWNLOAD and model_ids:
    download_biomodels(modelsDir, model_ids)

NameError: name 'create_query' is not defined

Get metadata............ From .json file from BioModels

In [21]:
import json

fic = "/Users/guillaume.souede/PycharmProjects/immunogit/models/BioModels/SBML/BIOMD0000000955.json"
with open(fic, "r") as file:
    data = json.load(file)

info = {
    "Model Name": data.get("name"),
    "Description": data.get("description"),
    "Format": data.get("format", {}).get("name"),
    "Publication": {
        "Title": data.get("publication", {}).get("title"),
        "Journal": data.get("publication", {}).get("journal"),
        "Year": data.get("publication", {}).get("year"),
        "Authors": [
            {
                "Name": author.get("name"),
                "Institution": author.get("institution"),
                "ORCID": author.get("orcid")
            }
            for author in data.get("publication", {}).get("authors", [])
        ],
        "Link": data.get("publication", {}).get("link")
    },
    "Files": [
        {
            "File Name": file_data.get("name"),
            "Description": file_data.get("description"),
            "File Size (Bytes)": file_data.get("fileSize"),
            "MIME Type": file_data.get("mimeType"),
        }
        for file_data in data.get("files", {}).get("main", []) + data.get("files", {}).get("additional", [])
    ],
    "Curation Status": data.get("curationStatus"),
    "Contributors": {
        "Curators": [
            {
                "Name": curator.get("name"),
                "Email": curator.get("email"),
                "ORCID": curator.get("orcid")
            }
            for curator in data.get("contributors", {}).get("Curator", [])
        ],
        "Modellers": [
            {
                "Name": modeller.get("name"),
                "Email": modeller.get("email"),
                "ORCID": modeller.get("orcid")
            }
            for modeller in data.get("contributors", {}).get("Modeller", [])
        ]
    },
    "Model Tags": data.get("modelTags"),
    "Annotations": [
        {
            "Qualifier": annotation.get("qualifier"),
            "Accession": annotation.get("accession"),
            "Name": annotation.get("name"),
            "Resource": annotation.get("resource"),
            "URI": annotation.get("uri")
        }
        for annotation in data.get("modelLevelAnnotations", [])
    ]
}

print(json.dumps(info, indent=4))

{
    "Model Name": "Giordano2020 - SIDARTHE model of COVID-19 spread in Italy",
    "Description": "In Italy, 128,948 confirmed cases and 15,887 deaths of people who tested positive for SARS-CoV-2 were registered as of 5 April 2020. Ending the global SARS-CoV-2 pandemic requires implementation of multiple population-wide strategies, including social distancing, testing and contact tracing. We propose a new model that predicts the course of the epidemic to help plan an effective control strategy. The model considers eight stages of infection: susceptible (S), infected (I), diagnosed (D), ailing (A), recognized (R), threatened (T), healed (H) and extinct (E), collectively termed SIDARTHE. Our SIDARTHE model discriminates between infected individuals depending on whether they have been diagnosed and on the severity of their symptoms. The distinction between diagnosed and non-diagnosed individuals is important because the former are typically isolated and hence less likely to spread the i

In [34]:
def json_metadata_extractor(json_data):
    """ Get relevant metadata(s) from a given JSON file. """
    return {
        "Model Name": json_data.get("name"),
        "Description": json_data.get("description"),
        "Format": json_data.get("format", {}).get("name"),
        "Publication": {
            "Title": json_data.get("publication", {}).get("title"),
            "Journal": json_data.get("publication", {}).get("journal"),
            "Year": json_data.get("publication", {}).get("year"),
            "Authors": [
                {
                    "Name": author.get("name"),
                    "Institution": author.get("institution"),
                    "ORCID": author.get("orcid")
                }
                for author in json_data.get("publication", {}).get("authors", [])
            ],
            "Link": json_data.get("publication", {}).get("link")
        },
        "Files": [
            {
                "File Name": file_data.get("name"),
                "Description": file_data.get("description"),
                "File Size (Bytes)": file_data.get("fileSize"),
                "MIME Type": file_data.get("mimeType"),
            }
            for file_data in json_data.get("files", {}).get("main", []) + json_data.get("files", {}).get("additional", [])
        ],
        "Curation Status": json_data.get("curationStatus"),
        "Contributors": {
            "Curators": [
                {
                    "Name": curator.get("name"),
                    "Email": curator.get("email"),
                    "ORCID": curator.get("orcid")
                }
                for curator in json_data.get("contributors", {}).get("Curator", [])
            ],
            "Modellers": [
                {
                    "Name": modeller.get("name"),
                    "Email": modeller.get("email"),
                    "ORCID": modeller.get("orcid")
                }
                for modeller in json_data.get("contributors", {}).get("Modeller", [])
            ]
        },
        "Model Tags": json_data.get("modelTags"),
        "Annotations": [
            {
                "Qualifier": annotation.get("qualifier"),
                "Accession": annotation.get("accession"),
                "Name": annotation.get("name"),
                "Resource": annotation.get("resource"),
                "URI": annotation.get("uri")
            }
            for annotation in json_data.get("modelLevelAnnotations", [])
        ]
    }

def process_multiple_json_files(input_folder, output_file):
    """ Read the processed files and merge them ! """
    merged_data = []

    for filename in os.listdir(input_folder):
        if filename.endswith(".json"):
            file_path = os.path.join(input_folder, filename)
            with open(file_path, "r") as file:
                json_data = json.load(file)
                metadata = json_metadata_extractor(json_data)
                merged_data.append(metadata)

    with open(output_file, "w") as outfile:
        json.dump(merged_data, outfile, indent=4)


AttributeError: 'list' object has no attribute 'get'

In [None]:
"""
NEED :
1) Get rid of parasite [ ] on start/end of generated .json
2) Fix to work with a list of files !
3) After, delete previous unmerged metadata.
4) A way to split per source of metadata i.e. BioModels or others.......
"""
input_folder = "/Users/guillaume.souede/PycharmProjects/immunogit/models/BioModels/SBML/"
output_file = "/Users/guillaume.souede/PycharmProjects/immunogit/models/BioModels/SBML/merged_metadata.json"
process_multiple_json_files(input_folder, output_file)