# Example Notebook for Kedro

This Jupyter notebook shows how to run the Kedro pipeline from within a notebook. It assumes that you run against a locally running Neo4j instance (or one whose ports are forwarded to localhost) and that you read the production environment data (e.g. from GCS).

In [None]:
# Import dependencies
import pyspark as ps
import os
from pathlib import Path
import subprocess

In [None]:
# trick that moves this notebook context into the kedro path
root_path = subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).decode().strip()
os.chdir(Path(root_path) / 'pipelines' / 'matrix')

# this loads various objects into the context, see 
# https://docs.kedro.org/en/stable/notebooks_and_ipython/kedro_and_notebooks.html#kedro-line-magics
%load_ext kedro.ipython
%reload_kedro  --env cloud
cat = catalog #just a convenience

In [None]:
# list the catalog entries whose name matches "raw"
catalog.list("raw")

In [None]:
# Catalog entries can be listed and filtered by name, either with a
# regular expression or with a plain substring; this pattern keeps
# names containing "raw" followed, anywhere later, by "rtx"
cat.list("raw.*rtx")

In [None]:
# Load a dataset from the catalog by its name and preview it
df = cat.load("integration.raw.rtx_kg2.nodes@spark")
df.show(n=10, truncate=False)  # first 10 rows, full cell contents

## Reading the raw (CSV) file would take a long time

In [None]:

# Count rows per category and print the 30 largest groups without truncation
(
    df.select("category")
    .groupBy("category")
    .count()
    .orderBy("count", ascending=False)
    .show(n=30, truncate=False)
)

## So let's read a Parquet file instead, which is much more efficient than reading a CSV file

In [None]:
# Load the "integration.prm.rtx_kg2.nodes" catalog entry; it is expected to be
# parquet-backed (see the preceding markdown cell) — TODO confirm in the catalog
df_parquet = cat.load("integration.prm.rtx_kg2.nodes")

In [None]:
# Per-category row counts, largest first; show() without arguments
# prints its default number of rows
(
    df_parquet.select("category")
    .groupBy("category")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

In [None]:
# The `%load_node` line magic grabs the entire content of the named
# kedro node and pastes it into a cell below. This makes interactive
# node development very smooth.
%load_node extract_nodes_edges_from_db

In [None]:
# Prepare necessary inputs for debugging
# All debugging inputs must be defined in your project catalog
# Each entry is bound to the argument name that extract_nodes_edges
# (defined further down in this cell) takes: dummy, nodes, edges
dummy = catalog.load("embeddings.model_output.graphsage")
nodes = catalog.load("integration.model_input.nodes")
edges = catalog.load("integration.model_input.edges")

import os
from typing import List, Any, Dict
from neo4j import Driver
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.ml.functions import array_to_vector, vector_to_array
from graphdatascience import GraphDataScience, QueryRunner
from matrix.core import inject_object
from refit.v1.core.unpack import unpack_params

def extract_nodes_edges(dummy, nodes: DataFrame, edges: DataFrame) -> tuple[DataFrame, DataFrame]:
    """Hand back the node and edge dataframes exactly as they were passed in.

    Args:
        dummy: Not read by the function body. Presumably only present to
            order this node after an upstream step — TODO confirm.
        nodes: Dataframe holding the nodes.
        edges: Dataframe holding the edges.

    Returns:
        The ``(nodes, edges)`` pair, unchanged.
    """
    passthrough = (nodes, edges)
    return passthrough


# Run the node function on the loaded inputs: n receives the nodes, e the edges
n, e = extract_nodes_edges(dummy, nodes, edges)

In [None]:
# Preview the first 10 rows of the edges, then of the nodes, without truncation
e.show(n=10, truncate=False)
n.show(n=10, truncate=False)