In [None]:
"""
MODULES
"""
import os
import subprocess
import sys
import zipfile as z
import requests
import json
import re
from bs4 import BeautifulSoup
"""
PATHS
"""
current_path = os.getcwd()
# BELOW PATH IS IMPORTANT AND TO RE-USE !
# Project root = parent directory of the notebook's working directory.
root_path = os.path.abspath(os.path.join(current_path, ".."))

# Create the expected output tree (no error if a directory already exists).
for relative_directory in (
    "models/BioModels/SBML",
    "models/Reactome/SBML",
    "models/Reactome/SBGN",
    "metadata",
):
    os.makedirs(os.path.join(root_path, relative_directory), exist_ok=True)
print("Structure : Done.")

"""
COMMON VARIABLES
"""
# ANSI escape sequences used to colour console messages.
BLUE = '\033[94m'
RED = '\033[91m'
RESET = '\033[0m'

In [None]:
import os
import json

def json_metadata_extractor(json_data, source=None):
    """Extract a fixed subset of metadata from one parsed model JSON document.

    Missing keys and keys explicitly set to ``null`` are treated the same way:
    scalar fields come back as ``None`` and list fields as ``[]``, so the
    returned structure always has the same shape.

    Args:
        json_data (dict): The parsed JSON document describing a model.
        source (str | None): Label stored verbatim under the ``"source"`` key.

    Returns:
        dict: Keys ``name``, ``description``, ``format``, ``publication``,
        ``files``, ``contributors``, ``annotations`` and ``source``.
    """

    def _pick(record, *keys):
        # Keep only the requested keys; absent ones become None.
        return {key: record.get(key) for key in keys}

    # `dict.get(key, {})` only helps when the key is absent; `or` also covers
    # an explicit JSON null, which would otherwise raise AttributeError.
    model_format = json_data.get("format") or {}
    publication = json_data.get("publication") or {}
    files = json_data.get("files") or {}
    contributors = json_data.get("contributors") or {}

    # Main files first, then additional ones.
    all_files = (files.get("main") or []) + (files.get("additional") or [])

    return {
        "name": json_data.get("name"),
        "description": json_data.get("description"),
        "format": model_format.get("name"),
        "publication": {
            "title": publication.get("title"),
            "journal": publication.get("journal"),
            "year": publication.get("year"),
            "authors": [
                _pick(author, "name", "institution", "orcid")
                for author in publication.get("authors") or []
            ],
            "link": publication.get("link"),
        },
        "files": [
            _pick(file_data, "name", "description", "fileSize", "mimeType")
            for file_data in all_files
        ],
        "contributors": {
            "curators": [
                _pick(curator, "name", "email", "orcid")
                for curator in contributors.get("curator") or []
            ],
            "modellers": [
                _pick(modeller, "name", "email", "orcid")
                for modeller in contributors.get("modeller") or []
            ],
        },
        "annotations": [
            _pick(annotation, "qualifier", "accession", "name", "resource", "uri")
            for annotation in json_data.get("modelLevelAnnotations") or []
        ],
        "source": source,
    }

def process_multiple_json_files(input_folder, output_file, source=None, delete=False):
    """
    Merge the metadata of every JSON file found in a folder.

    When ``source == "Reactome"`` the entries are split by format and written
    to two files named ``<output_file>_SBML.json`` and
    ``<output_file>_SBGN.json`` (each a JSON list). For any other source, a
    single JSON object is written to ``output_file``; it always contains the
    (possibly empty) ``"SBML"`` and ``"SBGN"`` keys plus one key named after
    the source (``"Unknown"`` when no source is given).

    Args:
        input_folder (str): Folder scanned (non-recursively) for ``*.json`` files.
        output_file (str): Output path, or output path prefix in Reactome mode.
        source (str | None): Source label forwarded to ``json_metadata_extractor``.
        delete (bool): When True, remove the source files that were merged.
    """
    formats = ("SBML", "SBGN")
    split_by_format = source == "Reactome"
    output_basename = os.path.basename(output_file)
    output_paths = (
        {fmt: f"{output_file}_{fmt}.json" for fmt in formats} if split_by_format else {}
    )

    # Every file this function writes. The Reactome outputs end in ".json"
    # too, so without this guard they would be re-read as inputs on the next
    # run and removed straight away when delete=True.
    reserved_paths = {os.path.abspath(output_file)}
    reserved_paths.update(os.path.abspath(path) for path in output_paths.values())

    def _is_source_file(filename):
        # A source file is any *.json in the folder that is not one of our outputs.
        if not filename.endswith(".json") or filename == output_basename:
            return False
        return os.path.abspath(os.path.join(input_folder, filename)) not in reserved_paths

    merged_data = {fmt: [] for fmt in formats}
    merged_paths = []   # files whose metadata ended up in an output
    skipped_paths = []  # Reactome files whose format is neither SBML nor SBGN

    # os.listdir() returns names in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(input_folder)):
        if not _is_source_file(filename):
            continue
        file_path = os.path.join(input_folder, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)
        metadata = json_metadata_extractor(json_data, source)

        if split_by_format:
            format_type = metadata.get("format")
            if format_type in formats:
                merged_data[format_type].append(metadata)
                merged_paths.append(file_path)
            else:
                skipped_paths.append(file_path)
        else:
            # The "source" key always exists, so a plain .get() default would
            # never apply; fall back explicitly when the value is None/empty.
            metadata_source = metadata.get("source") or "Unknown"
            merged_data.setdefault(metadata_source, []).append(metadata)
            merged_paths.append(file_path)

    if split_by_format:
        for format_type in formats:
            output_subfile = output_paths[format_type]
            with open(output_subfile, "w") as outfile:
                json.dump(merged_data[format_type], outfile, indent=4)
            print(f"{BLUE}File created for {format_type} in {output_subfile}.{RESET}")
    else:
        with open(output_file, "w") as outfile:
            json.dump(merged_data, outfile, indent=4)
        print(f"{BLUE}File created in {output_file}.{RESET}")

    if skipped_paths:
        print(f"{BLUE}{len(skipped_paths)} file(s) with an unrecognised format were skipped and kept.{RESET}")

    if delete:
        # Only remove files that were actually merged, never ones that were
        # skipped or that appeared in the folder after the scan.
        for file_path in merged_paths:
            os.remove(file_path)
        print(f"{BLUE}Source files deleted from {input_folder}, except {os.path.basename(output_file)}.{RESET}")
    else:
        print(f"{BLUE}Source files not deleted. Set delete=True to remove them.{RESET}")


In [None]:
"""
1) Appears to work properly.
2) First nested dictionary key appears to be source (BioModels). Needs to be checked because may be redundant with Source.
"""
# Merge the BioModels SBML metadata in place: the merged file is written into
# the same folder it is built from, and the per-model JSON files are removed.
input_folder = os.path.join(root_path, "models/BioModels/SBML/")
output_file = os.path.join(root_path, "models/BioModels/SBML/merged_metadata.json")
custom_source = "BioModels"
process_multiple_json_files(
    input_folder=input_folder,
    output_file=output_file,
    source=custom_source,
    delete=True,
)

Enhanced version: works with files split by source (BioModels/Reactome) and by format (SBML/SBGN):

In [10]:
# ANSI escape sequences used to colour messages printed by the cells below.
BLUE = '\033[94m'   # bright blue foreground
RED = '\033[91m'    # bright red foreground
RESET = '\033[0m'   # restore the terminal's default style

In [26]:
import os
import json

def json_metadata_extractor(json_data):
    """Extract a fixed subset of metadata from one parsed model JSON document.

    Missing keys and keys explicitly set to ``null`` are treated the same way:
    scalar fields come back as ``None`` and list fields as ``[]``, so the
    returned structure always has the same shape.

    Args:
        json_data (dict): The parsed JSON document describing a model.

    Returns:
        dict: Keys ``name``, ``description``, ``format``, ``publication``,
        ``files``, ``contributors`` and ``annotations``.
    """

    def _pick(record, *keys):
        # Keep only the requested keys; absent ones become None.
        return {key: record.get(key) for key in keys}

    # `dict.get(key, {})` only helps when the key is absent; `or` also covers
    # an explicit JSON null, which would otherwise raise AttributeError.
    model_format = json_data.get("format") or {}
    publication = json_data.get("publication") or {}
    files = json_data.get("files") or {}
    contributors = json_data.get("contributors") or {}

    # Main files first, then additional ones.
    all_files = (files.get("main") or []) + (files.get("additional") or [])

    return {
        "name": json_data.get("name"),
        "description": json_data.get("description"),
        "format": model_format.get("name"),
        "publication": {
            "title": publication.get("title"),
            "journal": publication.get("journal"),
            "year": publication.get("year"),
            "authors": [
                _pick(author, "name", "institution", "orcid")
                for author in publication.get("authors") or []
            ],
            "link": publication.get("link"),
        },
        "files": [
            _pick(file_data, "name", "description", "fileSize", "mimeType")
            for file_data in all_files
        ],
        "contributors": {
            "curators": [
                _pick(curator, "name", "email", "orcid")
                for curator in contributors.get("curator") or []
            ],
            "modellers": [
                _pick(modeller, "name", "email", "orcid")
                for modeller in contributors.get("modeller") or []
            ],
        },
        "annotations": [
            _pick(annotation, "qualifier", "accession", "name", "resource", "uri")
            for annotation in json_data.get("modelLevelAnnotations") or []
        ],
    }

def process_multiple_json_files(input_folder, delete=False, metadata_folder="../metadata"):
    """Process JSON files and create separate metadata files for models.

    Three files are written into ``metadata_folder``:

    * ``models_metadata.json``     -> ``{"models": [...]}`` (every model)
    * ``metadata_BioModels.json``  -> ``{"BioModels": [...]}``
    * ``metadata_Reactome.json``   -> ``{"Reactome": [...]}``

    A model is assigned to BioModels or Reactome when that word appears in its
    file name; files matching neither only appear in ``models_metadata.json``.

    Args:
        input_folder (str): Folder scanned (non-recursively) for ``*.json`` files.
        delete (bool): When True, remove the source files that were processed.
        metadata_folder (str): Destination folder for the three output files.
    """
    os.makedirs(metadata_folder, exist_ok=True)
    models_file = os.path.join(metadata_folder, "models_metadata.json")
    biomodels_file = os.path.join(metadata_folder, "metadata_BioModels.json")
    reactome_file = os.path.join(metadata_folder, "metadata_Reactome.json")

    # Never treat our own outputs as inputs (matters when input_folder and
    # metadata_folder are the same directory).
    reserved_paths = {
        os.path.abspath(path) for path in (models_file, biomodels_file, reactome_file)
    }

    # These must be lists: entries are appended one model at a time.
    all_models = []
    biomodels_data = []
    reactome_data = []
    processed_paths = []

    # os.listdir() returns names in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".json") or filename.startswith("."):  # Skip hidden files
            continue
        file_path = os.path.join(input_folder, filename)
        if os.path.abspath(file_path) in reserved_paths:
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)
        metadata = json_metadata_extractor(json_data)
        processed_paths.append(file_path)

        all_models.append(metadata)
        if "BioModels" in filename:
            biomodels_data.append(metadata)
        elif "Reactome" in filename:
            reactome_data.append(metadata)

    # One output file per top-level key, written in a fixed order.
    outputs = (
        (models_file, {"models": all_models}),
        (biomodels_file, {"BioModels": biomodels_data}),
        (reactome_file, {"Reactome": reactome_data}),
    )
    for output_path, payload in outputs:
        with open(output_path, "w") as outfile:
            json.dump(payload, outfile, indent=4)
        print(f"Created {output_path}")

    # Delete source files if True
    if delete:
        # Only remove what was actually read, not whatever the folder holds now.
        for file_path in processed_paths:
            os.remove(file_path)
        print(f"Source files deleted from {input_folder}.")
    else:
        print(f"{BLUE}Source files not deleted. Set delete = True to remove them.{RESET}")

In [28]:
# Build the merged metadata files from the raw per-model JSON, keeping the sources.
process_multiple_json_files(input_folder="../metadata/Raw", delete=False)

Created ../metadata/models_metadata.json
Created ../metadata/metadata_BioModels.json
Created ../metadata/metadata_Reactome.json
[94mSource files not deleted. Set delete = True to remove them.[0m
