# Merging RTX and Robokop

This notebook summarizes our results when merging RTX and Robokop using Translator's Node Normalization.
For this, we downloaded RTX KG2 v2.7.3 and Robokop `c5ec1f282158182f`.


## Summary

TODO

## Questions

- How many nodes are merged? %
- How many edges are merged? %
- What are example edges that are not merged, why not?
- How do the triplets differ across the KGs, and how do they differ for the part that is merged across KGs?



In [1]:
%%capture
# Import dependencies
import pyspark as ps
import os
from pathlib import Path
import subprocess
import pyspark.sql.functions as f

import pandas as pd

# import spark 
%load_ext autoreload
%autoreload 2
from rich.console import Console
from rich.logging import RichHandler
from rich.panel import Panel
from rich.rule import Rule
console = Console()

# Hack: change the working directory to the kedro project, located at
# <git repository root>/pipelines/matrix, so the kedro magics below can find it.
root_path = subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).decode().strip()
os.chdir(Path(root_path) / 'pipelines' / 'matrix')

# Loading the kedro IPython extension puts various objects into the notebook context, see
# https://docs.kedro.org/en/stable/notebooks_and_ipython/kedro_and_notebooks.html#kedro-line-magics
# NOTE(review): later cells use `catalog` without defining it - presumably it is one of
# the objects injected here; confirm against the kedro docs linked above.
%load_ext kedro.ipython
# Alternative environments; uncomment one of these instead of the plain reload below.
# %reload_kedro  --env cloud
# %reload_kedro  --env test
%reload_kedro


In [2]:
def top_n(df, n: int = 20):
    """Return the text table for the first `n` rows of `df` instead of printing it.

    Calls `showString` on the frame's underlying `_jdf` handle with a column
    width limit of 50 and vertical layout switched off.
    """
    max_col_width = 50
    vertical = False
    jvm_frame = df._jdf
    return jvm_frame.showString(n, max_col_width, vertical)

def wrap_show(df: ps.sql.DataFrame, title, n=20):
    """Print the first `n` rows of `df` inside a rich panel headed by `title`."""
    table_text = top_n(df, n=n)
    panel = Panel.fit(table_text, title=title)
    console.print(panel)

In [4]:
%%capture
# Merged knowledge graph.
unified_nodes, unified_edges = (
    catalog.load("integration.prm.unified_nodes"),
    catalog.load("integration.prm.unified_edges"),
)
# Robokop nodes and edges.
robo_nodes, robo_edges = (
    catalog.load("integration.int.robokop.nodes"),
    catalog.load("integration.int.robokop.edges"),
)
# RTX nodes and edges.
rtx_nodes, rtx_edges = (
    catalog.load("integration.int.rtx.nodes"),
    catalog.load("integration.int.rtx.edges"),
)
# Per-KG node tables from the `.norm` catalog entries.
robo_nodes_norm, rtx_nodes_norm = (
    catalog.load("integration.int.robokop.nodes.norm"),
    catalog.load("integration.int.rtx.nodes.norm"),
)


In [7]:
# %%
# Robokop: how many nodes did / did not normalize?
robo_norm = catalog.load("integration.int.robokop.nodes.norm")
robo_success_counts = robo_norm.groupBy("normalization_success").count()
robo_success_counts.show()
# %%
# RTX: same breakdown.
rtx_norm = catalog.load("integration.int.rtx.nodes.norm")
rtx_success_counts = rtx_norm.groupBy("normalization_success").count()
rtx_success_counts.show()
# %%

# get top 10 non successful ones for each

def show_norm_sample(df, norm_bool, title, n=20, seed=None):
    """Print `title`, then show a random sample of nodes with one normalization outcome.

    Args:
        df: Spark DataFrame providing the `id`, `original_id`,
            `normalization_success` and `name` columns.
        norm_bool: keep only rows whose `normalization_success` equals this value.
        title: heading printed before the table.
        n: maximum number of rows to display (previously ignored: 20 was hard-coded).
        seed: optional seed for the 10% row sample; pass an int to get the same
            sample on every run, leave as None for a fresh random sample.
    """
    print(title)
    (df
     .filter(f.col("normalization_success") == norm_bool)
     .select("id", "original_id", "normalization_success", "name")
     # Spark's substring is 1-based: keep the first 50 characters of the name.
     .withColumn("name", f.substring("name", 1, 50))
     .sample(fraction=0.1, seed=seed)
     .show(n, False)
    )



# One sampled table per (KG, normalization outcome) combination.
for norm_frame, succeeded, heading in (
    (robo_norm, True, "Robo normed success"),
    (robo_norm, False, "Robo normed failures"),
    (rtx_norm, True, "RTX normed success"),
    (rtx_norm, False, "RTX normed failures"),
):
    show_norm_sample(norm_frame, succeeded, heading)

+---------------------+-------+
|normalization_success|  count|
+---------------------+-------+
|                 true|4630969|
|                false|5117271|
+---------------------+-------+



+---------------------+-------+
|normalization_success|  count|
+---------------------+-------+
|                 true|1531882|
|                false|2127804|
+---------------------+-------+

Robo normed success
+------------+------------+---------------------+--------------------------------------------------+
|id          |original_id |normalization_success|name                                              |
+------------+------------+---------------------+--------------------------------------------------+
|CHEBI:112336|CHEBI:112336|true                 |2-[[5-[(2,3-dimethylphenoxy)methyl]-4-methyl-1,2,4|
|CHEBI:114153|CHEBI:114153|true                 |N-[[(2R,3S)-5-[(2R)-1-hydroxypropan-2-yl]-8-(3-met|
|CHEBI:114433|CHEBI:114433|true                 |5-[3-[(4-methylphenyl)methoxy]-2-thiophenyl]-3-(me|
|CHEBI:116721|CHEBI:116721|true                 |3-(4-ethoxyphenyl)-4-(phenylmethyl)-1H-1,2,4-triaz|
|CHEBI:120012|CHEBI:120012|true                 |5-bromo-N-[1-(2,5-dimethylpheny

In [49]:
def _with_id_class(norm_nodes):
    """Add an `id_class` column: the part of `original_id` before the first ':'."""
    return norm_nodes.withColumn("id_class", f.split("original_id", ":").getItem(0))


def _norm_outcome_by_id_class(norm_nodes):
    """Per ID class, count normalization failures and successes (most failures first)."""
    return (
        _with_id_class(norm_nodes.select("original_id", "normalization_success"))
        .groupBy("normalization_success", "id_class").count()
        # One column per normalization outcome; the pivot names them "false"/"true".
        .groupBy("id_class").pivot("normalization_success").sum("count")
        .withColumnsRenamed({"false": "norm_failure", "true": "norm_success"})
        .orderBy(f.desc("norm_failure"))
    )


def _node_count_by_id_class(norm_nodes):
    """Count nodes per ID class, largest class first."""
    return (
        _with_id_class(norm_nodes.select("original_id"))
        .groupBy("id_class").count().orderBy(f.desc("count"))
    )


# The same two breakdowns are produced for both knowledge graphs.
norm_frames = (("Robo", robo_norm), ("RTX", rtx_norm))

for kg_name, norm_nodes in norm_frames:
    console.rule(f"{kg_name} Top Norm or Not")
    _norm_outcome_by_id_class(norm_nodes).show(25, False)

for kg_name, norm_nodes in norm_frames:
    console.rule(f"{kg_name} Top ID Group")
    _node_count_by_id_class(norm_nodes).show(20, False)

+----------------+------------+------------+
|id_class        |norm_failure|norm_success|
+----------------+------------+------------+
|CAID            |5112649     |NULL        |
|NCBITaxon       |1891        |2541607     |
|PANTHER.FAMILY  |1338        |24800       |
|CHEBI           |375         |192797      |
|GO              |362         |42080       |
|NCBIGene        |170         |183599      |
|UniProtKB       |126         |114377      |
|REACT           |86          |106009      |
|ENSEMBL         |81          |24288       |
|PUBCHEM.COMPOUND|79          |1138366     |
|MONDO           |56          |23956       |
|UMLS            |38          |28375       |
|DRUGBANK        |9           |7           |
|MESH            |8           |2589        |
|EFO             |1           |6586        |
|CL              |1           |2731        |
|NCIT            |1           |21786       |
|SGD             |NULL        |9           |
|UBERON          |NULL        |14407       |
|orphanet 

+-----------------+------------+------------+
|id_class         |norm_failure|norm_success|
+-----------------+------------+------------+
|UMLS             |1240399     |810709      |
|ENSEMBL          |257494      |19303       |
|PathWhiz.Reaction|175039      |NULL        |
|FMA              |99889       |NULL        |
|RXNORM           |98943       |NULL        |
|SMPDB            |38100       |30029       |
|PathWhiz.Compound|32401       |NULL        |
|REACT            |20558       |13370       |
|FOODON           |19001       |NULL        |
|AraPort          |16021       |NULL        |
|VANDF            |13055       |NULL        |
|NCBIGene         |11859       |36376       |
|NDDF             |10962       |NULL        |
|KEGG.REACTION    |10671       |1           |
|ttd.target       |6194        |NULL        |
|ICD9             |6125        |1           |
|PomBase          |5111        |NULL        |
|CHEMBL.TARGET    |5093        |NULL        |
|wb               |4594        |NU

+----------------+-------+
|id_class        |count  |
+----------------+-------+
|CAID            |5112649|
|NCBITaxon       |2543498|
|PUBCHEM.COMPOUND|1138445|
|CHEBI           |193172 |
|NCBIGene        |183769 |
|UniProtKB       |114503 |
|REACT           |106095 |
|PR              |79521  |
|GO              |42442  |
|SMPDB           |30130  |
|UMLS            |28413  |
|PANTHER.FAMILY  |26138  |
|ENSEMBL         |24369  |
|MONDO           |24012  |
|NCIT            |21787  |
|CHEMBL.COMPOUND |21067  |
|HP              |15545  |
|UBERON          |14407  |
|HMDB            |10711  |
|EFO             |6587   |
+----------------+-------+
only showing top 20 rows



+-----------------+-------+
|id_class         |count  |
+-----------------+-------+
|UMLS             |2051108|
|ENSEMBL          |276797 |
|PR               |179370 |
|PathWhiz.Reaction|175039 |
|MESH             |161769 |
|CHEBI            |105323 |
|FMA              |99889  |
|RXNORM           |98943  |
|SMPDB            |68129  |
|CHEMBL.COMPOUND  |66304  |
|NCBIGene         |48235  |
|REACT            |33928  |
|HMDB             |32683  |
|PathWhiz.Compound|32401  |
|UniProtKB        |24685  |
|MONDO            |21663  |
|FOODON           |19001  |
|AraPort          |16021  |
|UBERON           |14101  |
|VANDF            |13055  |
+-----------------+-------+
only showing top 20 rows



## Insight

- Robokop `CAID` IDs don't normalize at all; the rest mostly normalizes well, although a few other ID classes have failures (each at <2k nodes).
- RTX is much more hit-or-miss, and some classes such as `RXNORM` or `FMA` do not get normalized at all. That's weird. We should definitely look into why the NodeNorm service doesn't normalize these groups:

```
PathWhiz.Reaction
FMA              
RXNORM           
PathWhiz.Compound
FOODON           
AraPort          
VANDF            
```


In [8]:
console.rule("[bold blue]Unified KG")

# Calculate the number of nodes and edges in each KG exactly once. Every count()
# triggers a Spark job, so the results are reused by all panels below.
unified_node_count = unified_nodes.count()
unified_edge_count = unified_edges.count()
robo_node_count = robo_nodes.count()
robo_edge_count = robo_edges.count()
rtx_node_count = rtx_nodes.count()
rtx_edge_count = rtx_edges.count()

console.print(Panel.fit(f"""
Unified Nodes: {unified_node_count}
Robo Nodes: {robo_node_count}
RTX Nodes: {rtx_node_count}
""", title="Node Counts"))
# now edges
console.print(Panel.fit(f"""
Unified Edges: {unified_edge_count}
Robo Edges: {robo_edge_count}
RTX Edges: {rtx_edge_count}
""", title="Edge Counts"))

# Row predicates on the `upstream_data_source` array column; the same column
# expressions are applied to both the unified nodes and the unified edges.
from_rtx = f.array_contains(f.col("upstream_data_source"), "rtxkg2")
from_robo = f.array_contains(f.col("upstream_data_source"), "robokop")

# node origin proportions
nodes_in_both = unified_nodes.filter(from_rtx & from_robo)
nodes_in_rtx = unified_nodes.filter(from_rtx)
nodes_in_robo = unified_nodes.filter(from_robo)

console.print(Panel.fit(
f"""
Nodes originating from RTX: {nodes_in_rtx.count()/unified_node_count*100:.2f}%
Nodes originating from Robo: {nodes_in_robo.count()/unified_node_count*100:.2f}%
Nodes originating from Both: {nodes_in_both.count()/unified_node_count*100:.2f}%
""", title="Node Origin Proportions"))

# edge origin proportions
edges_in_both = unified_edges.filter(from_rtx & from_robo)
edges_in_rtx = unified_edges.filter(from_rtx)
edges_in_robo = unified_edges.filter(from_robo)

console.print(Panel.fit(
f"""
Edges originating from RTX: {edges_in_rtx.count()/unified_edge_count*100:.2f}%
Edges originating from Robo: {edges_in_robo.count()/unified_edge_count*100:.2f}%
Edges originating from Both: {edges_in_both.count()/unified_edge_count*100:.2f}%
""", title="Edge Origin Proportions"))


                                                                                

                                                                                

## Insight
- 0.5% of edges overlapping, 5% of nodes

## Notes
Wow, that's not a lot of edges present in both. I also wonder why there are so many more edges in Robokop: roughly 150M edges there versus only 18M in RTX.
Let's look at the predicate counts:


In [5]:
def stats_on_df(df: ps.sql.DataFrame, col: str, kg_name: str, n=40):
    """Print a panel with the `n` most frequent values of column `col` in `df`.

    `kg_name` only appears in the panel title.
    """
    value_counts = df.groupBy(col).count()
    most_frequent_first = value_counts.sort("count", ascending=False)
    # wrap_show renders the top rows via top_n inside a titled rich panel.
    wrap_show(most_frequent_first, title=f"{col} Counts in {kg_name}", n=n)

# Predicate frequencies for the overlap and for each source KG.
for edge_subset, kg_label in (
    (edges_in_both, "Both"),
    (edges_in_rtx, "RTX"),
    (edges_in_robo, "Robo"),
):
    stats_on_df(edge_subset, "predicate", kg_label)



                                                                                

                                                                                

                                                                                

In [6]:

# Node category frequencies for the overlap and for each source KG.
for node_subset, kg_label in (
    (nodes_in_both, "Both"),
    (nodes_in_rtx, "RTX"),
    (nodes_in_robo, "Robo"),
):
    stats_on_df(node_subset, "category", kg_label)

OK, it looks like Robokop has tons of biolink `subclass_of` and `is_nearby_variant_of` edges, plus 18M `affects` edges. RTX, on the other hand, appears to be heavier on `has_participant` and `occurs_in`.

## Doing some plotting. Let's get a correlation matrix of categories in the 4 variants (rtx, robo, overlap, union)

I want to see which categories of nodes are connected with each other. For that, I need to join the node categories onto the edges dataframe and then build the correlation matrix.

In [1]:
def get_category_connections(edges: ps.sql.DataFrame, nodes: ps.sql.DataFrame):
    """Attach the subject's and the object's node category to every edge.

    Returns a frame with the columns `subject`, `predicate`, `object`,
    `subj_category` and `obj_category`. Both lookups are left joins, so every
    edge is kept even when an endpoint has no match in `nodes`.
    """
    node_categories = nodes.select("id", "category")

    # Two renamed copies of the id -> category lookup, one per edge endpoint.
    subject_lookup = node_categories.withColumnsRenamed(
        {"id": "subject", "category": "subj_category"}
    )
    object_lookup = node_categories.withColumnsRenamed(
        {"id": "object", "category": "obj_category"}
    )

    annotated_edges = (
        edges
        .join(subject_lookup, "subject", "left")
        .join(object_lookup, "object", "left")
    )
    return annotated_edges.select(
        "subject", "predicate", "object", "subj_category", "obj_category"
    )


def get_sankey_data_for_kg(edges: ps.sql.DataFrame, nodes: ps.sql.DataFrame) -> pd.DataFrame:
    """Build the link table (`source`, `sink`, `value`) for a two-level Sankey diagram.

    Level one links subject categories to predicates, level two links
    predicates to object categories; `value` is the number of edges behind
    each link. The combined table is returned as a pandas frame, largest
    links first.
    """
    connections = get_category_connections(edges, nodes)

    # Prefix the categories so the subject and object side of the same
    # category show up as two distinct Sankey nodes.
    prefixed = (
        connections
        .withColumn("subj_category", f.concat(f.lit("sub:"), f.col("subj_category")))
        .withColumn("obj_category", f.concat(f.lit("obj:"), f.col("obj_category")))
    )

    subject_to_predicate = (
        prefixed
        .groupBy("subj_category", "predicate")
        .count()
        .withColumnsRenamed({"subj_category": "source", "predicate": "sink", "count": "value"})
    )
    predicate_to_object = (
        prefixed
        .groupBy("predicate", "obj_category")
        .count()
        .withColumnsRenamed({"predicate": "source", "obj_category": "sink", "count": "value"})
    )

    all_links = subject_to_predicate.union(predicate_to_object)
    return all_links.orderBy("value", ascending=False).toPandas()

import plotly.graph_objects as go
import pandas as pd
import numpy as np

def create_sankey_diagram(df, title, seed=0):
    """Build a plotly Sankey figure from a link table.

    Args:
        df: pandas DataFrame with `source`, `sink` and `value` columns, one row per link.
        title: figure title.
        seed: seed for the random node colours. The fixed default makes the
            colours identical on every run; pass None for different colours each call.

    Returns:
        The plotly figure (not yet shown).
    """
    # Every distinct label on either end of a link becomes one Sankey node.
    all_nodes = pd.concat([df['source'], df['sink']]).unique()
    node_indices = {node: index for index, node in enumerate(all_nodes)}

    # One random RGB colour per node, drawn from a seeded generator over the
    # full 0..255 channel range (the upper bound of `integers` is exclusive).
    rng = np.random.default_rng(seed)
    colors = [f'rgb({r},{g},{b})' for r, g, b in rng.integers(0, 256, size=(len(all_nodes), 3))]

    # Links are expressed as positions in the node label list.
    link_source = [node_indices[source] for source in df['source']]
    link_target = [node_indices[sink] for sink in df['sink']]
    link_value = df['value']

    # Create the figure
    fig = go.Figure(data=[go.Sankey(
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(color = "black", width = 0.5),
          label = list(all_nodes),
          color = colors
        ),
        link = dict(
          source = link_source,
          target = link_target,
          value = link_value
    ))])

    # Update the layout
    fig.update_layout(title_text=title, font_size=10, width=1920, height=800)

    return fig

def plot_sankey_for_kg(edges: ps.sql.DataFrame, nodes: ps.sql.DataFrame, title: str, max_categories: int = 100):
    """Compute the Sankey link table for a KG and display the diagram.

    Note that `max_categories` caps the number of link rows passed to the
    diagram (the first rows of the table returned by `get_sankey_data_for_kg`),
    not the number of distinct categories.
    """
    link_table = get_sankey_data_for_kg(edges, nodes)
    top_links = link_table.head(max_categories)
    create_sankey_diagram(top_links, title).show()

# One diagram per KG view: each source, their overlap, and the unified graph.
for kg_edges, kg_nodes, kg_title in (
    (edges_in_robo, nodes_in_robo, "Robo"),
    (edges_in_rtx, nodes_in_rtx, "RTX"),
    (edges_in_both, nodes_in_both, "Both"),
    (unified_edges, unified_nodes, "Unified"),
):
    plot_sankey_for_kg(kg_edges, kg_nodes, kg_title)

NameError: name 'ps' is not defined