In [2]:
# notebook setup borrowed from 2024/merging_rtx_robokop.ipynb 
import os
from pathlib import Path
import subprocess

from pyspark.sql.functions import split

%load_ext autoreload
%autoreload 2
from rich.console import Console
from rich.logging import RichHandler
from rich.panel import Panel
from rich.rule import Rule
# Shared rich console; later cells call console.rule(...) to print section headers.
console = Console()



# Hack: switch the notebook's working directory to the kedro project folder.
# The repository root is asked from git, so this works from any sub-directory
# of the checkout and is safe to re-run.
root_path = (
    subprocess.check_output(['git', 'rev-parse', '--show-toplevel'])
    .decode()
    .strip()
)
os.chdir(Path(root_path).joinpath('pipelines', 'matrix'))

# Loading the kedro IPython extension injects kedro objects (e.g. the `catalog`
# used in the cells below) into the notebook namespace; see
# https://docs.kedro.org/en/stable/notebooks_and_ipython/kedro_and_notebooks.html#kedro-line-magics
%load_ext kedro.ipython
# %reload_kedro  --env cloud
# %reload_kedro  --env test
%reload_kedro


In [3]:
# Load the unified node and edge tables from the kedro data catalog
# (nodes first, then edges).
unified_nodes, unified_edges = (
    catalog.load(dataset_name)
    for dataset_name in (
        "integration.prm.unified_nodes",
        "integration.prm.unified_edges",
    )
)

:: loading settings :: url = jar:file:/Users/kschaper/Monarch/everycure/matrix/pipelines/matrix/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/kschaper/.ivy2/cache
The jars for the packages stored in: /Users/kschaper/.ivy2/jars
com.google.cloud.spark#spark-3.5-bigquery added as a dependency
org.neo4j#neo4j-connector-apache-spark_2.12 added as a dependency
org.xerial#sqlite-jdbc added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ac89ee40-ff11-452e-8865-dcbe5544972b;1.0
	confs: [default]
	found com.google.cloud.spark#spark-3.5-bigquery;0.39.0 in central
	found com.google.cloud.spark#spark-bigquery-dsv2-common;0.39.0 in central
	found com.google.cloud.spark#spark-bigquery-connector-common;0.39.0 in central
	found com.google.cloud.spark#bigquery-connector-common;0.39.0 in central
	found com.google.api.grpc#grpc-google-cloud-bigquerystorage-v1;3.5.1 in central
	found io.grpc#grpc-api;1.64.0 in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found com.google.errorprone#error_prone_annotations;2.23.0 in central
	found io.grpc#grpc-stub;1.64.0 in c

In [5]:
# Row counts of the two loaded tables, displayed as (nodes, edges).
node_count = unified_nodes.count()
edge_count = unified_edges.count()
node_count, edge_count

[1m([0m[1;36m6385010[0m, [1;36m22044943[0m[1m)[0m

In [29]:
console.rule("Unified Nodes Aggregate and Count")

# Add a "prefix" column holding the part of each node id before the first ":".
unified_nodes = unified_nodes.withColumn("prefix", split("id", ":").getItem(0))

# Count nodes per (category, prefix, upstream_data_source) combination.
columns = ["category", "prefix", "upstream_data_source"]
unified_nodes_agg_count = unified_nodes.groupBy(*columns).count()

unified_nodes_agg_count.show()



[Stage 44:>                                                       (0 + 10) / 11]

+--------------------+----------------+--------------------+-----+
|            category|          prefix|upstream_data_source|count|
+--------------------+----------------+--------------------+-----+
|  biolink:NamedThing|             IDO|            [rtxkg2]|   13|
|        biolink:Cell|   CHEMBL.TARGET|            [rtxkg2]|   15|
|biolink:Phenotypi...|             EFO|            [rtxkg2]|   63|
|        biolink:Food|        DRUGBANK|            [rtxkg2]|  106|
|        biolink:Cell|        DRUGBANK|            [rtxkg2]|    4|
|biolink:Physiolog...|            NCIT|            [rtxkg2]|   10|
|biolink:Biologica...|            NCIT|            [rtxkg2]|  158|
|biolink:ChemicalE...|PUBCHEM.COMPOUND|            [rtxkg2]|29478|
|        biolink:Drug|            UMLS|            [rtxkg2]|19377|
|biolink:Biologica...|           MPATH|            [rtxkg2]|   38|
|biolink:Phenotypi...|         biolink|            [rtxkg2]|   14|
|        biolink:Drug|             PDQ|            [rtxkg2]|  

                                                                                

In [30]:
console.rule("Unified Edges Aggregate and Count")

# Aliased node frames, kept under their original names for any later cell
# that refers to them.
subject_nodes = unified_nodes.alias("subject_nodes")
object_nodes = unified_nodes.alias("object_nodes")

# Both aliases are derived from the same DataFrame, so `subject_nodes.id` and
# `object_nodes.id` refer to the same underlying column. Project each side into
# a two-column lookup with uniquely named key/category columns so that neither
# join condition is ambiguous and no duplicate `id` column is produced.
subject_lookup = subject_nodes.select("id", "category").toDF("subject_id", "subject_category")
object_lookup = object_nodes.select("id", "category").toDF("object_id", "object_category")

# Prefix = the part of the subject/object id before the first ":".
edges_with_prefixes = (
    unified_edges
    .withColumn("subject_prefix", split("subject", ":")[0])
    .withColumn("object_prefix", split("object", ":")[0])
)

edge_group_columns = [
    "subject_prefix",
    "object_prefix",
    "predicate",
    "subject_category",
    "object_category",
]

unified_edges_agg_count = (
    edges_with_prefixes
    # Left joins keep edges whose endpoint has no matching node; the
    # corresponding category is then null in the aggregate.
    .join(subject_lookup, edges_with_prefixes["subject"] == subject_lookup["subject_id"], "left")
    .join(object_lookup, edges_with_prefixes["object"] == object_lookup["object_id"], "left")
    .groupBy(*edge_group_columns)
    .count()
)

unified_edges_agg_count.show()



+--------------+---------------+--------------------+--------------------+--------------------+-----+
|subject_prefix|  object_prefix|           predicate|    subject_category|     object_category|count|
+--------------+---------------+--------------------+--------------------+--------------------+-----+
|         CHEBI|            ATC| biolink:subclass_of|        biolink:Drug|biolink:ChemicalE...|  110|
|        UBERON|           CARO| biolink:close_match|biolink:Anatomica...|biolink:Anatomica...|    3|
|         CHEBI|          CHEBI|biolink:physicall...|biolink:NucleicAc...|        biolink:Drug|  128|
|      DRUGBANK|          CHEBI|biolink:physicall...|        biolink:Drug|biolink:SmallMole...|  142|
|          NCIT|          CHEBI|    biolink:has_part|        biolink:Drug|        biolink:Drug|    5|
|           FMA|          CHEBI|biolink:entity_po...|        biolink:Drug|biolink:SmallMole...|   63|
|         CHEBI|CHEMBL.COMPOUND|  biolink:located_in|        biolink:Drug|        

                                                                                