In [1]:
# notebook setup borrowed from 2024/merging_rtx_robokop.ipynb 
import os
from pathlib import Path
import subprocess

from pyspark.sql.functions import split

%load_ext autoreload
%autoreload 2
from rich.console import Console
from rich.logging import RichHandler
from rich.panel import Panel
from rich.rule import Rule
console = Console()



# hack that moves this notebook context into the kedro path
root_path = subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).decode().strip()
os.chdir(Path(root_path) / 'pipelines' / 'matrix')

# this loads various objects into the context, see 
# https://docs.kedro.org/en/stable/notebooks_and_ipython/kedro_and_notebooks.html#kedro-line-magics
%load_ext kedro.ipython
# %reload_kedro  --env cloud
# %reload_kedro  --env test
%reload_kedro


In [2]:
# Load the unified node and edge datasets from the kedro catalog.
unified_nodes, unified_edges = (
    catalog.load("integration.prm.unified_nodes"),
    catalog.load("integration.prm.unified_edges"),
)

:: loading settings :: url = jar:file:/Users/kschaper/Monarch/everycure/matrix/pipelines/matrix/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/kschaper/.ivy2/cache
The jars for the packages stored in: /Users/kschaper/.ivy2/jars
com.google.cloud.spark#spark-3.5-bigquery added as a dependency
org.neo4j#neo4j-connector-apache-spark_2.12 added as a dependency
org.xerial#sqlite-jdbc added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-89ab3a22-c2e1-423f-88dd-8c7edf4c3625;1.0
	confs: [default]
	found com.google.cloud.spark#spark-3.5-bigquery;0.39.0 in central
	found com.google.cloud.spark#spark-bigquery-dsv2-common;0.39.0 in central
	found com.google.cloud.spark#spark-bigquery-connector-common;0.39.0 in central
	found com.google.cloud.spark#bigquery-connector-common;0.39.0 in central
	found com.google.api.grpc#grpc-google-cloud-bigquerystorage-v1;3.5.1 in central
	found io.grpc#grpc-api;1.64.0 in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found com.google.errorprone#error_prone_annotations;2.23.0 in central
	found io.grpc#grpc-stub;1.64.0 in c

In [11]:
# Row counts first (nodes, then edges), followed by a three-row preview of each frame.
node_total = unified_nodes.count()
edge_total = unified_edges.count()
print(node_total, edge_total)

for frame in (unified_nodes, unified_edges):
    frame.show(3)

14909314 154389126
+-----------+--------------------+--------------------+--------------------+----------------------+--------------------+------------+--------------------+---------------------------------+--------------------+------+
|         id|                name|            category|         description|equivalent_identifiers|      all_categories|publications|              labels|international_resource_identifier|upstream_data_source|prefix|
+-----------+--------------------+--------------------+--------------------+----------------------+--------------------+------------+--------------------+---------------------------------+--------------------+------+
|AEO:0000127|apoptosing develo...|biolink:Anatomica...|An embryonic anat...|         [AEO:0000127]|[biolink:Anatomic...|          []|[biolink:Anatomic...|             http://purl.oboli...|            [rtxkg2]|   AEO|
|ARO:3004420|chloramphenicol a...|  biolink:NamedThing|Commercial colori...|         [ARO:3004420]|[biolink:Named

In [4]:
console.rule("Unified Nodes Aggregate and Count")

# The prefix is the first piece of the node id when split on ":".
curie_prefix = split("id", ":").getItem(0)
unified_nodes = unified_nodes.withColumn("prefix", curie_prefix)

# Count nodes for every (category, prefix, upstream_data_source) combination.
columns = ["category", "prefix", "upstream_data_source"]
unified_nodes_agg_count = (
    unified_nodes
    .select(*columns)
    .groupBy(*columns)
    .count()
)

unified_nodes_agg_count.show()



                                                                                

+--------------------+----------------+--------------------+-------+
|            category|          prefix|upstream_data_source|  count|
+--------------------+----------------+--------------------+-------+
|biolink:ChemicalE...|        DRUGBANK|           [robokop]|    166|
|        biolink:Drug|           CHEBI|           [robokop]|    413|
|biolink:Phenotypi...|             EFO|            [rtxkg2]|     32|
|  biolink:NamedThing|             IDO|            [rtxkg2]|     13|
|     biolink:Pathway|           MONDO|   [rtxkg2, robokop]|    149|
|        biolink:Cell|   CHEMBL.TARGET|            [rtxkg2]|     15|
|        biolink:Food|        DRUGBANK|            [rtxkg2]|    105|
|biolink:Molecular...|           REACT|           [robokop]|  73116|
|biolink:OrganismT...|       NCBITaxon|           [robokop]|2466405|
|     biolink:Disease|            OMIM|           [robokop]|     47|
|biolink:ChemicalE...|         biolink|            [rtxkg2]|      1|
|biolink:Physiolog...|            

In [12]:
console.rule("Unified Edges Aggregate and Count")

# Category lookups for the two ends of an edge. Both are derived from the same
# `unified_nodes` frame, so each one renames `id` to the edge column it attaches
# to ("subject" / "object"). Joining by column name then avoids carrying two
# `id` columns through the plan and avoids relying on how Spark disambiguates
# column references that point at the same underlying DataFrame.
subject_categories = (
    unified_nodes
    .select("id", "category")
    .withColumnRenamed("id", "subject")
    .withColumnRenamed("category", "subject_category")
)
object_categories = (
    unified_nodes
    .select("id", "category")
    .withColumnRenamed("id", "object")
    .withColumnRenamed("category", "object_category")
)

# Grouping key for the edge report.
columns = ["subject_prefix",
           "subject_category",
           "predicate",
           "object_prefix",
           "object_category",
           "primary_knowledge_source",
           "aggregator_knowledge_source",
           "upstream_data_source"]

unified_edges_agg_count = (
    unified_edges
    # The prefix is the first piece of the identifier when split on ":".
    .withColumn("subject_prefix", split("subject", ":")[0])
    .withColumn("object_prefix", split("object", ":")[0])
    # Left joins keep edges whose subject/object has no matching node;
    # their category columns are null in that case.
    .join(subject_categories, on="subject", how="left")
    .join(object_categories, on="object", how="left")
    .select(*columns)
    .groupBy(*columns)
    .count()
)

unified_edges_agg_count.show()



+--------------+--------------------+--------------------+-------------+--------------------+------------------------+---------------------------+--------------------+-----+
|subject_prefix|    subject_category|           predicate|object_prefix|     object_category|primary_knowledge_source|aggregator_knowledge_source|upstream_data_source|count|
+--------------+--------------------+--------------------+-------------+--------------------+------------------------+---------------------------+--------------------+-----+
|      NCBIGene|biolink:NucleicAc...|biolink:physicall...|        CHEBI|        biolink:Drug|        infores:drugbank|         [infores:drugbank]|            [rtxkg2]|  425|
|         REACT|biolink:Molecular...|  biolink:has_output|        CHEBI|        biolink:Drug|        infores:reactome|         [infores:reactome]|   [rtxkg2, robokop]| 1251|
|         CHEBI|biolink:SmallMole...|biolink:entity_po...|        CHEBI|        biolink:Drug|        infores:semmeddb|         [in

                                                                                

The steps below assume that you've already run:
```
kedro run --nodes='generate_filtered_nodes_agg_count,generate_filtered_edges_agg_count'
```

or, if the upstream nodes they depend on have not been run yet:
```
kedro run --to-nodes='generate_filtered_nodes_agg_count,generate_filtered_edges_agg_count'
```

The steps below go through the process of providing a duckdb data source that evidence.dev can read from.

In [31]:
# Installs duckdb with uv via a shell command; no version is pinned here.
# NOTE(review): presumably this targets the same virtualenv the kernel runs in — confirm,
# and consider declaring duckdb in the project's dependencies instead of installing ad hoc.
!uv pip install duckdb

[2K[2mResolved [1m1 package[0m [2min 315ms[0m[0m                                          [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m     0 B/14.75 MiB                     [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 8.00 KiB/14.75 MiB                    [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 16.00 KiB/14.75 MiB                   [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 32.00 KiB/14.75 MiB                   [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 48.00 KiB/14.75 MiB                   [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 64.00 KiB/14.75 MiB                   [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 80.00 KiB/14.75 MiB                   [1A
[2K[1A[37m⠙[0m [2mPre

In [4]:
# Load the aggregate-count reports from the kedro catalog.
filtered_nodes_agg_count = catalog.load("integration.prm.filtered_nodes_agg_count")
filtered_edges_agg_count = catalog.load("integration.prm.filtered_edges_agg_count")

# Number of rows in each report (nodes, then edges).
node_report_rows = filtered_nodes_agg_count.count()
edge_report_rows = filtered_edges_agg_count.count()
print(node_report_rows, edge_report_rows)

# Preview each report with Spark's default show().
for report in (filtered_nodes_agg_count, filtered_edges_agg_count):
    report.show()

:: loading settings :: url = jar:file:/Users/kschaper/Monarch/everycure/matrix/pipelines/matrix/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/kschaper/.ivy2/cache
The jars for the packages stored in: /Users/kschaper/.ivy2/jars
com.google.cloud.spark#spark-3.5-bigquery added as a dependency
org.neo4j#neo4j-connector-apache-spark_2.12 added as a dependency
org.xerial#sqlite-jdbc added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-81134794-9c2b-47e2-9176-a86a988885f9;1.0
	confs: [default]
	found com.google.cloud.spark#spark-3.5-bigquery;0.39.0 in central
	found com.google.cloud.spark#spark-bigquery-dsv2-common;0.39.0 in central
	found com.google.cloud.spark#spark-bigquery-connector-common;0.39.0 in central
	found com.google.cloud.spark#bigquery-connector-common;0.39.0 in central
	found com.google.api.grpc#grpc-google-cloud-bigquerystorage-v1;3.5.1 in central
	found io.grpc#grpc-api;1.64.0 in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found com.google.errorprone#error_prone_annotations;2.23.0 in central
	found io.grpc#grpc-stub;1.64.0 in c

755 88019
+--------------------+----------------+--------------------+-------+
|            category|          prefix|upstream_data_source|  count|
+--------------------+----------------+--------------------+-------+
|biolink:ChemicalE...|        DRUGBANK|           [robokop]|    166|
|        biolink:Drug|           CHEBI|           [robokop]|    413|
|     biolink:Pathway|           MONDO|   [rtxkg2, robokop]|    149|
|        biolink:Cell|   CHEMBL.TARGET|            [rtxkg2]|     15|
|biolink:Phenotypi...|             EFO|            [rtxkg2]|     32|
|        biolink:Food|        DRUGBANK|            [rtxkg2]|    105|
|biolink:Molecular...|           REACT|           [robokop]|  73116|
|     biolink:Disease|            MESH|   [rtxkg2, robokop]|     10|
|biolink:OrganismT...|       NCBITaxon|           [robokop]|2466405|
|     biolink:Disease|            OMIM|           [robokop]|     47|
|biolink:Biologica...|            NCIT|            [rtxkg2]|    155|
|biolink:ChemicalE...|PU

In [14]:
import os
from pathlib import Path
import subprocess

# hack that moves this notebook context into the kedro path
root_path = subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).decode().strip()
os.chdir(Path(root_path) / 'pipelines' / 'matrix')

%load_ext kedro.ipython
%reload_kedro


import duckdb 
import tempfile
from pathlib import Path

con = duckdb.connect("matrix-kg.duckdb")

# define path to ../../kg_dashboard/sources/reports
dashboard_data_path = Path(root_path) / "services" / "kg_dashboard" / "sources" / "reports"

# Write Spark DataFrames to Parquet files
catalog.load("integration.prm.filtered_nodes_agg_count").coalesce(1).write.format("parquet").mode('overwrite').save(str(dashboard_data_path / "nodes"))
catalog.load("integration.prm.filtered_edges_agg_count").coalesce(1).write.format("parquet").mode('overwrite').save(str(dashboard_data_path / "edges"))

# Define Parquet files
nodes_parquet_file = dashboard_data_path / "merged_kg_nodes_report.parquet"
edges_parquet_file = dashboard_data_path / "merged_kg_edges_report.parquet"

# move the uuid named node & edge files to the names expected by the dashboard
!mv {dashboard_data_path / "nodes/*.parquet"} {nodes_parquet_file} 
!mv {dashboard_data_path / "edges/*.parquet"} {edges_parquet_file}

# clean up the directories with uuid named parquet files
!rm -rf {dashboard_data_path / "nodes"}
!rm -rf {dashboard_data_path / "edges"}


The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython
