# JSONMerger

In [1]:
from common_immunogit import *

2025-03-31 00:52:40,371 - INFO - Root path: /Users/guillaume.souede/PycharmProjects/immunogit
2025-03-31 00:52:40,372 - INFO - Directory structure set up successfully.


In [None]:
def json_metadata_extractor_full(json_data):
    """Extract metadata from a JSON structure.

    Nested sections ("format", "publication", "files") that are missing or
    explicitly null are treated as empty, so their fields come back as
    ``None`` / ``[]`` instead of raising ``AttributeError``.

    Args:
        json_data (dict): The JSON data. Maps each model id to that model's
            metadata dict.

    Returns:
        dict: A dictionary containing extracted metadata for each model,
            keyed by model id.
    """
    extracted_data = {}

    for model_id, model_data in json_data.items():
        # `.get(key, {})` only covers a missing key; `or {}` also covers a
        # key that is present but null.
        publication = model_data.get("publication") or {}
        model_format = model_data.get("format") or {}
        files = model_data.get("files") or {}

        extracted_data[model_id] = {
            "name": model_data.get("name"),
            "description": model_data.get("description"),
            "format": model_format.get("name"),
            "publication": {
                "title": publication.get("title"),
                "journal": publication.get("journal"),
                "year": publication.get("year"),
                "authors": [
                    {
                        "name": author.get("name"),
                        "institution": author.get("institution"),
                        "orcid": author.get("orcid"),
                    }
                    for author in publication.get("authors") or []
                ],
            },
            "files": files.get("main", []),
            "contributors": model_data.get("contributors", {}),
            "tags": model_data.get("modelTags", []),
        }

    return extracted_data


def process_multiple_json_files(input_folder, delete=False):
    """Process multiple JSON files and extract metadata.

    Every non-hidden ``*.json`` file in ``input_folder`` is loaded and passed
    through ``json_metadata_extractor``. The per-model results are merged
    into three files written under ``md_path``:

    * ``models_metadata.json``: all models, under the ``"models"`` key.
    * ``metadata_BioModels.json``: models from files whose name contains
      "BioModels", under the ``"BioModels"`` key.
    * ``metadata_Reactome.json``: models from files whose name contains
      "Reactome", under the ``"Reactome"`` key.

    Args:
        input_folder (str): Path to the folder containing JSON files.
        delete (bool): Whether to delete the source JSON files after processing.

    Returns:
        None
    """
    models_file = os.path.join(md_path, "models_metadata.json")
    biomodels_file = os.path.join(md_path, "metadata_BioModels.json")
    reactome_file = os.path.join(md_path, "metadata_Reactome.json")

    merged_data = {"models": {}}
    biomodels_data = {}
    reactome_data = {}
    processed_files = []

    # Sorted so that, if two files share a model id, which one wins does not
    # depend on the directory listing order.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".json") or filename.startswith("."):
            continue  # Skip non-JSON and hidden files

        file_path = os.path.join(input_folder, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)
        metadata = json_metadata_extractor(json_data)
        processed_files.append(file_path)

        # The extractor returns a dict keyed by model id, and the containers
        # are dicts as well, so merge with update() (dicts have no append()).
        merged_data["models"].update(metadata)

        if "BioModels" in filename:
            biomodels_data.update(metadata)
        elif "Reactome" in filename:
            reactome_data.update(metadata)

    outputs = (
        (models_file, merged_data),
        (biomodels_file, {"BioModels": biomodels_data}),
        (reactome_file, {"Reactome": reactome_data}),
    )
    for output_file, payload in outputs:
        with open(output_file, "w") as outfile:
            json.dump(payload, outfile, indent=4)
        print(f"Created {output_file}")

    # Delete source files if True
    if delete:
        # Remove only the files that were actually read above, rather than
        # re-listing the folder (which could now contain other JSON files).
        for file_path in processed_files:
            os.remove(file_path)
        print(f"Source files deleted from {input_folder}.")
    else:
        print(f"{BLUE}Source files not deleted. Set delete = True to remove them.{RESET}")


def shrink_json_info(input_path):
    """Filter and save metadata with tags from a JSON file.

    The input is loaded, passed through ``json_metadata_extractor``, and the
    result is written next to the input as ``<stem>_filtered.json``.

    Args:
        input_path (Path | str): Path to the input JSON file.

    Returns:
        None
    """
    from pathlib import Path

    # Accept plain strings too; `with_name` / `stem` need a Path object.
    input_path = Path(input_path)

    # Explicit UTF-8 so reading does not depend on the platform's default
    # locale encoding.
    with open(input_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    filtered_data = json_metadata_extractor(json_data)

    output_path = input_path.with_name(input_path.stem + "_filtered.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, indent=4)
    print(f"Filtered metadata saved as : {output_path}")

We manually curated a set of keywords; the resulting dictionary can be loaded from file:

In [3]:
# Load the manually curated keyword dictionary from disk, then echo it as the
# cell output.
with open("../tmp/keywords.json", mode="r", encoding="utf-8") as f:
    keywords_dict = json.loads(f.read())
keywords_dict

{'Cells': ['T helper cells',
  'B cells',
  'Cytotoxic T cells',
  'Plasma cells',
  'Tumor cells',
  'Avascular cancerous cells',
  'Cancer cells',
  'CTL',
  'Infected tumor cells',
  'Uninfected tumor cells',
  'Effector cells',
  'Immune system cells',
  'Tumor-specific CTL',
  'Virus-specific CTL',
  'Immune killer cells',
  'Tumor-infiltrating lymphocytes'],
 'Treatments': ['Immunotherapy',
  'Monoclonal antibody therapy',
  'Radiovirotherapy',
  'Virotherapy',
  'Radiotherapy',
  'Cancer therapy',
  'Chemotherapy',
  'Optimal chemotherapy',
  'Strongly targeted agent',
  'Cancer immunotherapy',
  'Immune checkpoint inhibitors',
  'Adoptive cell therapy'],
 'Interactions': ['Cancer-immune system interaction',
  'Immune cell killing',
  'Tumor remission',
  'Tumor growth',
  'Virus replication',
  'Virus infection',
  'Immune suppression',
  'Immunostimulatory signals',
  'Bistability',
  'Antibody-mediated killing',
  'Optimal control',
  'Tumor immune-system interactions',
  'Im

First attempt at generating tags from the curated keywords:

In [None]:
def json_metadata_extractor(json_data, keywords=None):
    """Extract metadata from a JSON structure and generate tags based on keywords.

    For each model, every keyword that occurs (case-insensitively) in the
    model's name or description yields a ``"<category> : kw1, kw2"`` tag.
    Existing ``modelTags`` are kept as ``"Main: <tag>"`` entries. Tags are
    de-duplicated while keeping a stable order (main tags first, then one
    entry per keyword category), so repeated runs give identical output.

    Fields or nested sections that are missing or explicitly null are
    treated as empty instead of raising.

    Args:
        json_data (dict): The JSON data. Maps each model id to that model's
            metadata dict.
        keywords (dict | None): Mapping of category name to a list of
            keywords. Defaults to the module-level ``keywords_dict``.

    Returns:
        dict: A dictionary containing extracted metadata for each model,
            keyed by model id.
    """
    if keywords is None:
        keywords = keywords_dict

    extracted_data = {}

    for model_id, model_data in json_data.items():
        # `or ...` covers both a missing key and an explicit null value.
        publication = model_data.get("publication") or {}
        model_format = model_data.get("format") or {}
        files = model_data.get("files") or {}
        tags = model_data.get("modelTags") or []

        # Lower-case the searched texts once per model, not once per keyword.
        searched_texts = (
            (model_data.get("name") or "").lower(),
            (model_data.get("description") or "").lower(),
        )

        new_tags = []
        for category, category_keywords in keywords.items():
            found_keywords = [
                keyword
                for keyword in category_keywords
                if any(keyword.lower() in text for text in searched_texts)
            ]
            if found_keywords:
                new_tags.append(f"{category} : " + ', '.join(found_keywords))

        global_tag = [f"Main: {tag}" for tag in tags]

        extracted_data[model_id] = {
            "name": model_data.get("name"),
            "description": model_data.get("description"),
            "format": model_format.get("name"),
            "publication": {
                "title": publication.get("title"),
                "journal": publication.get("journal"),
                "year": publication.get("year"),
                "authors": [
                    {
                        "name": unicode_fixer(author.get("name", "")),
                        "institution": author.get("institution"),
                        "orcid": author.get("orcid"),
                    }
                    for author in publication.get("authors") or []
                ],
            },
            "files": files.get("main", []),
            "contributors": model_data.get("contributors", {}),
            # dict.fromkeys de-duplicates while keeping insertion order; a
            # set would reorder the tags differently from run to run.
            "tags": list(dict.fromkeys(global_tag + new_tags)),
        }

    return extracted_data

def process_multiple_json_files(input_folder, delete=False):
    """Placeholder for processing multiple JSON files and extracting metadata.

    This version has no implementation yet: it ignores both arguments and
    does nothing.

    Args:
        input_folder (str): Path to the folder containing JSON files.
        delete (bool): Whether to delete the source JSON files after processing.

    Returns:
        None
    """
    return None

def shrink_json_tags(input_path):
    """Filter and save metadata with tags from a JSON file.

    The input is loaded, passed through ``json_metadata_extractor``, and the
    result is written next to the input as ``<stem>_tags.json``.

    Args:
        input_path (Path | str): Path to the input JSON file.

    Returns:
        None
    """
    from pathlib import Path

    # Accept plain strings too; `with_name` / `stem` need a Path object.
    input_path = Path(input_path)

    # Explicit UTF-8 so reading does not depend on the platform's default
    # locale encoding.
    with open(input_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    filtered_data = json_metadata_extractor(json_data)

    output_path = input_path.with_name(input_path.stem + "_tags.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, indent=4)
    print(f"Filtered metadata saved as : {output_path}")

In [None]:
def unicode_fixer(phrase):
    """Repair a string whose UTF-8 bytes were mistakenly decoded as Latin-1.

    Such text shows up as mojibake (e.g. "SchÃ¤ttler" instead of
    "Schättler"). The repair re-encodes the string as Latin-1 to recover the
    original bytes and decodes those bytes as UTF-8.

    If that round trip is not possible, the input is assumed to be correctly
    encoded already and is returned unchanged. It is never decoded with
    replacement characters, which would corrupt valid text such as "José".

    Args:
        phrase (str): The input string to fix. Non-string values are
            returned as-is.

    Returns:
        str: The repaired string, or the original value if no repair applies.
    """
    if not isinstance(phrase, str):
        return phrase

    try:
        return phrase.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # UnicodeEncodeError: the text contains characters outside Latin-1,
        # so it cannot be this kind of mojibake.
        # UnicodeDecodeError: the recovered bytes are not valid UTF-8, so the
        # text was not mis-decoded in the first place.
        return phrase

In [15]:
# Run the keyword-tagging pass on the metadata file under md_path; the result
# is written next to it with a "_tags.json" suffix.
# NOTE(review): process_multiple_json_files writes "models_metadata.json"
# (plural) — confirm that "model_metadata.json" is the intended input here.
infile = md_path / "model_metadata.json"
shrink_json_tags(infile)

In [16]:
# Sanity check: U+00C3 U+00A4 is what the UTF-8 encoding of "ä" looks like
# when its two bytes are read as Latin-1; the fixer should turn it back into "ä".
print(unicode_fixer("Heinz Sch\u00c3\u00a4ttler"))

Heinz Schättler
