# Report for primary knowledge sources in the Matrix KG

This notebook simulates the Kedro pipeline during development and should be deleted before the PR is merged.

In [33]:
import pandas as pd
import yaml
import json
from typing import Dict, Any
from pathlib import Path

# Utilities for reading and writing

In [34]:
import requests


def save_yaml_file(data: Dict, url_path: str) -> None:
    """Serialize ``data`` as block-style YAML and write it to a local file.

    Args:
        data: Mapping to serialize with ``yaml.dump``.
        url_path: Destination on the local filesystem (anything ``open()``
            accepts, e.g. a ``str`` or ``pathlib.Path``). Despite the
            parameter name, URLs are not supported for writing. An existing
            file is overwritten.
    """
    # Explicit UTF-8 keeps the file encoding independent of the platform's
    # locale default.
    with open(url_path, "w", encoding="utf-8") as f:
        yaml.dump(data, f, default_flow_style=False)

def save_markdown_file(content: str, url_path: str) -> None:
    """Write ``content`` to a local Markdown file encoded as UTF-8.

    Args:
        content: Full text of the document.
        url_path: Destination on the local filesystem (anything ``open()``
            accepts, e.g. a ``str`` or ``pathlib.Path``). Despite the
            parameter name, URLs are not supported for writing. An existing
            file is overwritten.
    """
    # Without an explicit encoding, open() uses the locale default, which can
    # fail on (or mis-encode) non-ASCII characters on non-UTF-8 platforms.
    with open(url_path, "w", encoding="utf-8") as f:
        f.write(content)

def load_yaml_file(url_path: str, timeout: float = 30.0) -> Dict[str, Any]:
    """Load a YAML document from a local path or an HTTP(S) URL.

    Args:
        url_path: Local filesystem path (``str`` or ``pathlib.Path``) or a URL
            beginning with ``http://`` or ``https://``.
        timeout: Seconds to wait for the server before giving up. Only used
            when ``url_path`` is a URL.

    Returns:
        The parsed document as produced by ``yaml.safe_load``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Match the full scheme so a local file such as "http_cache.yml" is not
    # mistaken for a URL.
    if str(url_path).startswith(("http://", "https://")):
        # requests has no default timeout; without one a stalled server would
        # block this call forever.
        response = requests.get(url_path, timeout=timeout)
        response.raise_for_status()
        # Hand the raw bytes to the parser so it applies YAML's own encoding
        # detection instead of the charset guessed from the HTTP headers.
        return yaml.safe_load(response.content)

    # Explicit UTF-8 keeps local reads independent of the platform locale.
    with open(url_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

def load_json_file(url_path: str, timeout: float = 30.0) -> Dict[str, Any]:
    """Load a JSON document from a local path or an HTTP(S) URL.

    Args:
        url_path: Local filesystem path (``str`` or ``pathlib.Path``) or a URL
            beginning with ``http://`` or ``https://``.
        timeout: Seconds to wait for the server before giving up. Only used
            when ``url_path`` is a URL.

    Returns:
        The parsed document as produced by the ``json`` module.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        json.JSONDecodeError: If the payload is not valid JSON.
    """
    # Match the full scheme so a local file such as "http_cache.json" is not
    # mistaken for a URL.
    if str(url_path).startswith(("http://", "https://")):
        # requests has no default timeout; without one a stalled server would
        # block this call forever.
        response = requests.get(url_path, timeout=timeout)
        response.raise_for_status()
        # json.loads accepts bytes and detects UTF-8/16/32 itself, which is
        # more reliable than the charset guessed from the HTTP headers.
        return json.loads(response.content)

    # Explicit UTF-8 keeps local reads independent of the platform locale.
    with open(url_path, "r", encoding="utf-8") as f:
        return json.load(f)


# Set up file paths to simulate the pipeline outside the Kedro environment

In [None]:
# --- Output files (relative to the notebook's working directory) -----------
pks_md_file = Path("primary-knowledge-sources.md")
pks_yaml_file = Path("primary-knowledge-sources.yml")

# --- Upstream registries ----------------------------------------------------
infores_url = "https://raw.githubusercontent.com/biolink/information-resource-registry/refs/heads/main/infores_catalog.yaml"
reusabledata_url = "https://raw.githubusercontent.com/reusabledata/reusabledata/refs/heads/master/data.json"
kgregistry_url = "https://raw.githubusercontent.com/Knowledge-Graph-Hub/kg-registry/refs/heads/main/registry/kgs.yml"

# --- Matrix-curated sheets, published as TSV --------------------------------
matrixcurated_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQxpQU80dpW9bo7STfrX7k9Wv70jA_2C4BN6tDceM1LEOfF9YL22OisdmaUPf7Ptw/pub?gid=135786799&single=true&output=tsv"
matrixreviews_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQxpQU80dpW9bo7STfrX7k9Wv70jA_2C4BN6tDceM1LEOfF9YL22OisdmaUPf7Ptw/pub?gid=1308154629&single=true&output=tsv"

# --- Mapping tables from each registry to infores ---------------------------
reusabledata_infores_mapping_url = "https://raw.githubusercontent.com/matentzn/pks-resource/refs/heads/main/mappings/reusabledata-infores.sssom.tsv"
kgregistry_infores_mapping_url = "https://raw.githubusercontent.com/matentzn/pks-resource/refs/heads/main/mappings/kgregistry-infores.sssom.tsv"

# Shared pandas options. comment="#" makes pandas ignore everything from a
# "#" to the end of the line (presumably to skip the "#"-prefixed header of
# the SSSOM files; confirm). dtype=str reads every column as text.
sssom_read_opts = {"sep": "\t", "comment": "#"}
curated_read_opts = {"sep": "\t", "dtype": str}

# infores catalogue (YAML)
infores_d = load_yaml_file(infores_url)

# reusabledata (JSON) and its mapping to infores
reusabledata_d = load_json_file(reusabledata_url)
reusabledata_mapping_d = pd.read_csv(reusabledata_infores_mapping_url, **sssom_read_opts)

# KG registry (YAML) and its mapping to infores
kgregistry_d = load_yaml_file(kgregistry_url)
kgregistry_mapping_d = pd.read_csv(kgregistry_infores_mapping_url, **sssom_read_opts)

# Manually curated data (TSV)
matrixcurated_d = pd.read_csv(matrixcurated_url, **curated_read_opts)
matrixreviews_d = pd.read_csv(matrixreviews_url, **curated_read_opts)

# One-column frame holding only the primary knowledge source identifiers
pks_integrated = matrixcurated_d[["primary_knowledge_source"]]


# Run the full pipeline to generate the YAML and documentation files

In [None]:
from nodes import create_pks_integrated_metadata

# Gather the node inputs by keyword first so the call itself stays short.
integration_inputs = {
    "infores": infores_d,
    "reusabledata": reusabledata_d,
    "kgregistry": kgregistry_d,
    "matrix_curated": matrixcurated_d,
    "matrix_reviews": matrixreviews_d,
    "pks_integrated": pks_integrated,
    "mapping_reusabledata_infores": reusabledata_mapping_d,
    "mapping_kgregistry_infores": kgregistry_mapping_d,
}
matrix_subset_relevant_sources = create_pks_integrated_metadata(**integration_inputs)



# Writing YAML file

This would usually be done by Kedro

In [None]:
# Persist the integrated metadata to the YAML output path.
save_yaml_file(data=matrix_subset_relevant_sources, url_path=pks_yaml_file)

# Load the integrated PKS YAML file

This would usually be done by Kedro

In [None]:
# Read the YAML back from disk so the next step works on the round-tripped data.
matrix_subset_relevant_sources = load_yaml_file(url_path=pks_yaml_file)

# Run the documentation pipeline

In [None]:
from nodes import create_pks_documentation

# Build the documentation text from the integrated metadata; it is written to
# the Markdown output file in the next cell.
pks_markdown_documentation = create_pks_documentation(
    matrix_subset_relevant_sources
)

# Writing the MD file

This would usually be done by Kedro

In [None]:
# Persist the generated documentation to the Markdown output path.
save_markdown_file(content=pks_markdown_documentation, url_path=pks_md_file)