# Report for primary knowledge sources in the Matrix KG

This notebook drafts a useful, high-level summary report for KGs, highlighting information such as the licensing of primary knowledge sources and their relevance for drug repurposing modeling.

In [70]:
!pip install pyyaml pandas requests jinja2

Collecting jinja2
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting MarkupSafe>=2.0 (from jinja2)
  Downloading MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (4.0 kB)
Using cached jinja2-3.1.6-py3-none-any.whl (134 kB)
Downloading MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl (12 kB)
Installing collected packages: MarkupSafe, jinja2
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [jinja2]
[1A[2KSuccessfully installed MarkupSafe-3.0.2 jinja2-3.1.6


In [None]:
# Catalog file (YAML) from the biolink information-resource-registry repository.
infores_url = "https://github.com/biolink/information-resource-registry/raw/refs/heads/main/infores_catalog.yaml"
# data.json from the reusabledata repository.
reusabledata_url = "https://github.com/reusabledata/reusabledata/raw/refs/heads/master/data.json"
# kgs.yml registry file from the Knowledge-Graph-Hub kg-registry repository.
kgregistry_url = "https://raw.githubusercontent.com/Knowledge-Graph-Hub/kg-registry/refs/heads/main/registry/kgs.yml"

# Two tabs (different gid values) of the same published Google Sheet, exported as TSV:
# the curated per-source information and the per-source reviews.
matrixcurated_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQxpQU80dpW9bo7STfrX7k9Wv70jA_2C4BN6tDceM1LEOfF9YL22OisdmaUPf7Ptw/pub?gid=135786799&single=true&output=tsv"
matrixreviews_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQxpQU80dpW9bo7STfrX7k9Wv70jA_2C4BN6tDceM1LEOfF9YL22OisdmaUPf7Ptw/pub?gid=1308154629&single=true&output=tsv"

import yaml
import json
import pandas as pd
import requests
from jinja2 import Template

# Upper bound (seconds) for each download so an unresponsive host cannot hang the notebook.
REQUEST_TIMEOUT_SECONDS = 60


def _fetch_text(url, timeout=REQUEST_TIMEOUT_SECONDS):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the resource to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so that
            an error page is never handed to the YAML/JSON parsers as data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


# Load infores_catalog.yaml
infores_d = yaml.safe_load(_fetch_text(infores_url))

# Load reusabledata.json
reusabledata_d = json.loads(_fetch_text(reusabledata_url))

# Load matrixcurated.tsv
matrixcurated_d = pd.read_csv(matrixcurated_url, sep='\t')

# Load matrixreviews.tsv
matrixreviews_d = pd.read_csv(matrixreviews_url, sep='\t')

# Load kgregistry.yaml
kgregistry_d = yaml.safe_load(_fetch_text(kgregistry_url))

In [64]:
# Accumulator filled by the _parse_* functions below: keyed by source id with the
# "infores:" text removed; each value maps a provider name (e.g. 'infores',
# 'reusabledata') to the dict of fields extracted from that provider.
primary_knowledge_sources: dict = {}

def _parse_source(source_data, source_id, id_column, extracted_metadata,ignored_metadata,primary_knowledge_sources):
    for record in source_data:
        raw_id = record[id_column]
        id = raw_id.replace("infores:", "")
        if id not in primary_knowledge_sources:
            
            primary_knowledge_sources[id] = {}

        for key in record:
            if key not in extracted_metadata + ignored_metadata + [id_column]:
                print(f"WARNING: Some potentially useful information from {source_id} is not included in the report: {key}")

        data_extract = {}
        data_extract[id_column] = raw_id
        for element in extracted_metadata:
            if element in record:
                data_extract[element] = record[element]

        primary_knowledge_sources[id][source_id] = data_extract

def _parse_infores(infores_d, primary_knowledge_sources):
    """Store the records under ``infores_d['information_resources']`` as 'infores' metadata."""
    kept_fields = [
        'id', 'status', 'name', 'description', 'knowledge_level', 'agent_type',
        'url', 'xref', 'synonym', 'consumed_by', 'consumes',
    ]
    _parse_source(
        source_data=infores_d['information_resources'],
        source_id='infores',
        id_column='id',
        extracted_metadata=kept_fields,
        ignored_metadata=[],
        primary_knowledge_sources=primary_knowledge_sources,
    )

def _parse_reusabledata(reusabledata_d, primary_knowledge_sources):
    """Store the records in ``reusabledata_d`` as 'reusabledata' metadata."""
    kept_fields = [
        'description', 'source', 'data-tags', 'grade-automatic', 'source-link',
        'source-type', 'status', 'data-field', 'data-type', 'data-categories',
        'data-access', 'license', 'license-type', 'license-link', 'license-hat-used',
        'license-issues', 'license-commentary', 'license-commentary-embeddable',
        'was-controversial', 'provisional', 'contacts',
    ]
    # Keys that are deliberately left out of the report (no warning is printed for them).
    skipped_fields = ['last-curated', 'grants']
    _parse_source(
        source_data=reusabledata_d,
        source_id='reusabledata',
        id_column='id',
        extracted_metadata=kept_fields,
        ignored_metadata=skipped_fields,
        primary_knowledge_sources=primary_knowledge_sources,
    )

def _parse_kgregistry(kgregistry_d, primary_knowledge_sources):
    """Store the records under ``kgregistry_d['resources']`` as 'kgregistry' metadata.

    Args:
        kgregistry_d: Parsed registry document; its 'resources' entry is iterated.
        primary_knowledge_sources: Dict updated in place by ``_parse_source``.
    """
    # NOTE(review): records are keyed by the registry's own 'id'; 'infores_id' is only
    # copied as metadata. Confirm that registry ids line up with the infores ids.
    source = 'kgregistry'
    id_column = 'id'
    ignored_metadata = ['products']
    # The stray empty-string entry was dropped: '' is not a meaningful record key.
    extracted_metadata = [
        'activity_status', 'category', 'collection', 'contacts', 'creation_date',
        'curators', 'description', 'domains', 'evaluation_page', 'fairsharing_id',
        'homepage_url', 'infores_id', 'language', 'last_modified_date', 'layout',
        'license', 'name', 'publications', 'repository', 'tags', 'usages', 'version',
        'warnings',
    ]
    _parse_source(kgregistry_d['resources'], source, id_column, extracted_metadata, ignored_metadata, primary_knowledge_sources)

def _parse_matrixcurated(matrixcurated_d, primary_knowledge_sources):
    """Store the manually curated, source-specific rows (mostly licensing) as 'matrixcurated' metadata."""
    curated_rows = matrixcurated_d.to_dict(orient="records")
    # Columns that are deliberately left out of the report (no warning is printed for them).
    skipped_fields = ['aggregator_knowledge_source', 'number_of_edges', 'infores_name', 'xref']
    _parse_source(
        source_data=curated_rows,
        source_id='matrixcurated',
        id_column='primary_knowledge_source',
        extracted_metadata=['license_name', 'license_source_link'],
        ignored_metadata=skipped_fields,
        primary_knowledge_sources=primary_knowledge_sources,
    )

def _parse_matrixreviews(matrixreviews_d, primary_knowledge_sources):
    """Store the manually curated rubric reviews as 'matrixreviews' metadata."""
    review_rows = matrixreviews_d.to_dict(orient="records")
    kept_fields = [
        'domain_coverage_score', 'domain_coverage_comments',
        'source_scope_score', 'source_scope_score_comment',
        'utility_drugrepurposing_score', 'utility_drugrepurposing_comment',
        'label_rubric', 'label_rubric_rationale',
        'label_manual', 'label_manual_comment',
        'reviewer',
    ]
    _parse_source(
        source_data=review_rows,
        source_id='matrixreviews',
        id_column='primary_knowledge_source',
        extracted_metadata=kept_fields,
        ignored_metadata=['infores_name'],
        primary_knowledge_sources=primary_knowledge_sources,
    )

# Feed every loaded provider into the shared accumulator, in the same order as before.
for _parser, _payload in (
    (_parse_infores, infores_d),
    (_parse_reusabledata, reusabledata_d),
    (_parse_kgregistry, kgregistry_d),
    (_parse_matrixcurated, matrixcurated_d),
    (_parse_matrixreviews, matrixreviews_d),
):
    _parser(_payload, primary_knowledge_sources)



In [65]:
def _create_subset_relevant_to_matrix(primary_knowledge_sources, relevant_sources):
    subset = {}
    for source in relevant_sources:
        if source in primary_knowledge_sources:
            subset[source] = primary_knowledge_sources[source]
    return subset

# Remove "infores:" prefix from each source in relevant_sources.
# Blank spreadsheet rows are read by pandas as NaN (a float without .replace()), so
# only string identifiers are kept.
relevant_sources = [
    src.replace("infores:", "")
    for src in matrixcurated_d['primary_knowledge_source'].unique().tolist()
    if isinstance(src, str)
]
matrix_subset_relevant_sources = _create_subset_relevant_to_matrix(primary_knowledge_sources, relevant_sources)

In [71]:
from jinja2 import Template

In [82]:
def _get_property( source_info, property, default_value = "Unknown"):
    property_value = default_value
    if 'infores' in source_info and property in source_info['infores']:
        property_value = source_info['infores'][property]
    elif 'kgregistry' in source_info and property in source_info['kgregistry']:
        property_value = source_info['kgregistry'][property]
    elif 'reusabledata' in source_info and property in source_info['reusabledata']:
        property_value = source_info['reusabledata'][property]
    return property_value
    
def _generate_list_of_pks_docs(source_data):
    """Render one Markdown section per primary knowledge source.

    Args:
        source_data: Dict mapping a source id to its per-provider metadata, as
            built by the ``_parse_*`` functions.

    Returns:
        A list with one rendered Markdown string per source, in iteration order.

    The name and description come from ``_get_property``. The license is the
    curated 'license_name' stored under the 'matrixcurated' provider; "Unknown"
    is used when that value is missing or not a non-blank string.
    """
    pks_jinja2_template = Template("""### Source: {{ title }} ({{ id }})

_{{ description }}_

- **License**: {{ license }}""")

    pks_documentation_texts = []
    for source_id, source_info in source_data.items():
        name = _get_property(source_info, 'name', default_value="No name")
        description = _get_property(source_info, 'description', default_value="No description.")

        # Blank spreadsheet cells arrive as NaN (a float), hence the isinstance check.
        curated_license = source_info.get('matrixcurated', {}).get('license_name')
        if isinstance(curated_license, str) and curated_license.strip():
            license_name = curated_license.strip()
        else:
            license_name = "Unknown"

        pks_docstring = pks_jinja2_template.render(
            id = source_id,
            title=name,
            description=description,
            license=license_name
        )
        pks_documentation_texts.append(pks_docstring)
    return pks_documentation_texts

# One rendered Markdown section per source in the Matrix-relevant subset.
pks_documentation_texts = _generate_list_of_pks_docs(matrix_subset_relevant_sources)

def _generate_pks_docs(pks_documentation_texts):
    """Assemble the full Markdown page from the per-source sections.

    Args:
        pks_documentation_texts: Iterable of already rendered Markdown sections,
            inserted one after another below the page introduction.

    Returns:
        The complete page as a single string.
    """
    # The template no longer carries stray indentation/trailing whitespace, which
    # would otherwise end up verbatim in the generated Markdown file.
    pks_jinja2_template = Template("""## {{ title }}

This page is automatically generated with curated information about primary knowledge sources
leveraged in the MATRIX Knowledge Graph, mainly regarding licensing information and
potential relevancy assessments for drug repurposing.

This internally curated information is augmented with information from three external resources:

1. [Information Resource Registry](https://biolink.github.io/information-resource-registry/)
2. [reusabledata.org](https://reusabledata.org/)
3. [KG Registry](https://kghub.org/kg-registry/)

{% for doc in pks_documentation_texts %}
{{ doc }}
{% endfor %}
""")

    pks_docs = pks_jinja2_template.render(
        title="Overview of Matrix KG Primary Knowledge Sources",
        pks_documentation_texts=pks_documentation_texts
    )
    return pks_docs

document_string = _generate_pks_docs(pks_documentation_texts)
# Write document_string to primary-knowledge-sources.md. The encoding is pinned because
# the rendered names/descriptions may contain non-ASCII characters and the platform
# default encoding is not guaranteed to be UTF-8.
with open("primary-knowledge-sources.md", "w", encoding="utf-8") as f:
    f.write(document_string)

In [None]:
# Save matrix_subset_relevant_sources as a YAML file
with open("matrix_subset_relevant_sources.yaml", "w") as yaml_out:
    yaml.dump(matrix_subset_relevant_sources, stream=yaml_out)
