# L4: API Discovery with Knowledge Graphs

<p style="background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px"> ⏳ <b>Note <code>(Kernel Starting)</code>:</b> This notebook takes about 30 seconds to be ready to use. You may start and watch the video while you wait.</p>

In [None]:
# Warning control: install a blanket 'ignore' filter so that no Python warning
# is shown for the rest of the notebook session.
import warnings
warnings.filterwarnings('ignore')

## Setup

In [None]:
from rdflib import Dataset
import pandas as pd

from faiss import IndexFlatL2, IndexFlat
import numpy as np
import tqdm
from langchain_openai import OpenAIEmbeddings
from helper import parameterize_sparql

<div style="background-color:#fff6ff; padding:13px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px">
<p> 💻 &nbsp; <b>Access <code>requirements.txt</code> and <code>helper.py</code> files:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Open"</em>.</p>

<p> ⬇ &nbsp; <b>Download Notebooks:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Download as"</em> and select <em>"Notebook (.ipynb)"</em>.</p>
</div>

In [None]:
# Show up to 100 rows and never truncate column contents, so long labels and
# URIs stay fully readable in the rendered DataFrames.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", None)

In [None]:
# Fetch the OpenAI API key through the project helper.
# NOTE(review): `openai_api_key` is not passed explicitly to OpenAIEmbeddings
# in this notebook; presumably the helper (or the environment) makes the key
# available to the client — confirm.
from helper import get_openai_api_key
openai_api_key = get_openai_api_key()

In [None]:
# Embedding client; used in this notebook both to embed the entity-set
# description strings and to embed user queries at search time.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
# Load the OData knowledge graph (Turtle serialization) into an rdflib Dataset.
# Per the rdflib docs, default_union=True makes the default graph the union of
# all named graphs, so the SPARQL queries below need no GRAPH clauses.
graph = Dataset(default_union=True)
graph.parse("./ro_shared_data/odata_knowledge_graph.ttl",
            format="turtle")

## Generating the embedding strings

In [None]:
# SPARQL query: for one fixed entity set (PURCHASEORDER in API_PURCHASEORDER_2),
# return its name together with the label of every property of its entity type.
q_api_properties = """PREFIX odata: <http://example.org/odata#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT
    ?entity_set ?property_label

WHERE {
    BIND(<http://data.example.org/Service/API_PURCHASEORDER_2/EntitySet/PURCHASEORDER> 
    as ?entity_set_uri)
    ?entity_set_uri rdf:type odata:EntitySet.
    ?entity_set_uri odata:name ?entity_set.
    ?entity_set_uri odata:entityType ?entity_type_uri.


    # Properties on the EntitySet
    {
        ?entity_type_uri odata:property ?property_uri.
        ?property_uri odata:label ?property_label.
        # ?property_uri odata:selectProperty true
    }
}
"""

In [None]:
# Run the property query once, stringify each result row into an
# (entity set name, property label) pair, then split the pairs into the two
# parallel columns of a dict-of-lists suitable for pandas.DataFrame.
_property_rows = [
    (str(result.entity_set), str(result.property_label))
    for result in graph.query(q_api_properties)
]
df_dict = {
    "entity_set": [entity_set for entity_set, _ in _property_rows],
    "property_label": [label for _, label in _property_rows],
}


In [None]:
# Preview the first 15 (entity set, property label) rows.
pd.DataFrame(df_dict).head(15)

In [None]:
# SPARQL query: for every entity set, build one text description of the form
# "entity set: <name>; properties: <label>, <label>, ..." by concatenating the
# labels of all properties of its entity type. That text is what gets embedded.
q_embedding_string = """
PREFIX odata: <http://example.org/odata#>
SELECT 
?entity_set_uri (CONCAT(CONCAT(CONCAT("entity set: ", ?entity_set_name), "; properties: "), group_concat(?property_label;separator=", ")) 
    AS ?embedding_string)
WHERE
{
	?entity_set_uri a odata:EntitySet ;
        odata:name ?entity_set_name ;
        odata:entityType/odata:property/odata:label ?property_label .
}
GROUP BY ?entity_set_uri ?entity_set_name
"""

In [None]:
# Inspect the embedding string of the first row returned by the query.
str(next(iter(graph.query(q_embedding_string))).embedding_string)

## Generate Embeddings

In [None]:
# Embed the description string of every entity set (with a progress bar) and
# keep the URIs in the same order, so that position i in `entity_set_uris`
# identifies the entity set behind `embeddings[i]`.
_uri_vector_pairs = [
    (
        str(result.entity_set_uri),
        embedding_model.embed_query(result.embedding_string),
    )
    for result in tqdm.tqdm(graph.query(q_embedding_string))
]

entity_set_uris = [uri for uri, _ in _uri_vector_pairs]
embeddings = [vector for _, vector in _uri_vector_pairs]

In [None]:
# Stack the embeddings into a float32 matrix (one row per entity set) and load
# it into a flat L2 index whose dimensionality is the matrix's column count.
xb = np.asarray(embeddings, dtype="float32")
index = IndexFlatL2(xb.shape[1])
index.add(xb)

In [None]:
# Persist the index and the parallel list of entity-set URIs; row i of the
# index corresponds to entity_set_uris[i], so the two files belong together.
# NOTE(review): the graph is read from "./ro_shared_data" but these files are
# written to "../ro_shared_data" — confirm the relative path is intended.
import pickle

with open("../ro_shared_data/entity_sets_index.pickle", "wb") as file:
    pickle.dump(index, file)

with open("../ro_shared_data/entity_set_uris.pickle", "wb") as file:
    pickle.dump(entity_set_uris, file)

## Query the index

In [None]:
def query_index(
    index: IndexFlat,
    entity_set_uris: list[str],
    embedding_model: OpenAIEmbeddings,
    query: str,
    top: int = 5,
) -> list[str]:
    """Return the URIs of the entity sets most similar to ``query``.

    Args:
        index: Vector index whose row ``i`` holds the embedding of
            ``entity_set_uris[i]``.
        entity_set_uris: URIs in the same order as the vectors in ``index``.
        embedding_model: Model used to embed ``query``; must be the same model
            that produced the indexed vectors for distances to be meaningful.
        query: Natural-language request to search for.
        top: Maximum number of results to return.

    Returns:
        Up to ``top`` URIs, in the order returned by ``index.search``. Fewer
        are returned when the index holds fewer than ``top`` vectors.
    """
    # FAISS works on float32 matrices; building the array from the embedding's
    # Python floats would otherwise yield float64.
    x_query = np.asarray([embedding_model.embed_query(query)], dtype="float32")
    _, indices = index.search(x_query, top)
    # FAISS pads missing neighbours with -1. Skip them: indexing the list with
    # -1 would silently return the *last* URI instead of "no result".
    return [entity_set_uris[i] for i in indices[0] if i >= 0]

In [None]:
# Example: retrieve the five entity sets closest to a purchase-order request.
query_index(
    index=index,
    entity_set_uris=entity_set_uris,
    embedding_model=embedding_model,
    query="""Create a purchase order for 5 pencils in 
    purchasing group 002 and purchasing organization 3000""",
    top=5,
)

## Enhance retrieval with process information

In [None]:
# SPARQL template: find pairs of entity sets (A, B) where an activity on A is
# followed (pr:hasNext) by an activity on B, and at least one side of the pair
# is among the given entity sets (first branch constrains A, second branch B).
# `var:::entity_set_uris` is a placeholder; presumably parameterize_sparql
# substitutes the supplied value for it before the query is run — confirm
# against helper.py.
q_get_process_dependencies = """
    PREFIX pr: <http://example.org/process#>
    PREFIX odata: <http://example.org/odata#>
    SELECT DISTINCT ?entitySetA ?entitySetB ?nameA ?nameB
    WHERE {
        {
        VALUES ?entitySetA { var:::entity_set_uris }
        ?activityA  pr:entitySet ?entitySetA ;
                    pr:hasNext ?activityB . 
        
        ?activityB pr:entitySet ?entitySetB .
        ?entitySetA odata:name ?nameA .
        ?entitySetB odata:name ?nameB .
        } 
        UNION {
        VALUES ?entitySetB { var:::entity_set_uris }
        ?activityA  pr:entitySet ?entitySetA ;
                    pr:hasNext ?activityB . 
        
        ?activityB pr:entitySet ?entitySetB .
        ?entitySetA odata:name ?nameA .
        ?entitySetB odata:name ?nameB .
        }
    }
    """

In [None]:
def get_process_dependencies(
    entity_set_uris: list[str], graph: Dataset
) -> list[tuple[str, str, str, str]]:
    """Look up process-order relations that involve the given entity sets.

    Args:
        entity_set_uris: URIs of the entity sets to look up (without angle
            brackets).
        graph: Knowledge graph to run the dependency query against.

    Returns:
        One ``(entity_set_a_uri, entity_set_b_uri, name_a, name_b)`` tuple per
        row of ``q_get_process_dependencies``, where an activity on A is
        followed by an activity on B. Empty when ``entity_set_uris`` is empty.
    """
    # Nothing to look up: skip issuing a query with an empty VALUES block.
    if not entity_set_uris:
        return []

    # Render the URIs as a space-separated list of IRIs for the VALUES clauses.
    # Kept in its own variable rather than rebinding the list parameter.
    values_clause = " ".join(f"<{uri}>" for uri in entity_set_uris)
    sparql = parameterize_sparql(
        query=q_get_process_dependencies,
        parameters={"entity_set_uris": values_clause},
    )
    return [
        (str(row.entitySetA), str(row.entitySetB), str(row.nameA), str(row.nameB))
        for row in graph.query(sparql)
    ]

In [None]:
# Example: retrieve the closest entity sets for a request, then look up the
# process dependencies in which those entity sets take part.
retrieved_entity_set_uris = query_index(
    index=index,
    entity_set_uris=entity_set_uris,
    embedding_model=embedding_model,
    query="""Create a purchase order for 5 pencils in
    purchasing group 002 and purchasing organization 3000""",
    top=5,
)

get_process_dependencies(entity_set_uris=retrieved_entity_set_uris, 
                         graph=graph)

## Putting everything together

In [None]:
def discover_apis_and_process(
    query: str,
    graph: Dataset,
    index: IndexFlat,
    entity_set_uris: list[str],
    embedding_model: OpenAIEmbeddings,
    top: int = 5,
) -> dict:
    """Find entity sets relevant to ``query`` plus their process dependencies.

    Combines vector retrieval (``query_index``) with the process relations
    stored in the knowledge graph (``get_process_dependencies``).

    Args:
        query: Natural-language request.
        graph: Knowledge graph holding the process information.
        index: Vector index over the entity-set embeddings.
        entity_set_uris: URIs in the same order as the vectors in ``index``.
        embedding_model: Model used to embed ``query``.
        top: Number of entity sets to retrieve from the index (default 5).

    Returns:
        A dict with two keys:
        ``"entity_sets"``: set of URIs containing the retrieved entity sets and
        every entity set that appears in one of their dependencies;
        ``"process_information"``: list of ``"<B> depends on <A>"`` sentences,
        one per dependency, where the activity on A precedes the one on B.
    """
    retrieved_entity_set_uris = query_index(
        index=index,
        entity_set_uris=entity_set_uris,
        embedding_model=embedding_model,
        query=query,
        top=top,
    )

    dependencies = get_process_dependencies(
        entity_set_uris=retrieved_entity_set_uris, graph=graph
    )

    # Start from the retrieved sets and pull in both ends of every dependency,
    # so prerequisite and follow-up entity sets are part of the result too.
    merged_entity_sets = set(retrieved_entity_set_uris)
    process_information = []
    for entity_set_a, entity_set_b, name_a, name_b in dependencies:
        merged_entity_sets.update((entity_set_a, entity_set_b))
        process_information.append(f"{name_b} depends on {name_a}")

    return {
        "entity_sets": merged_entity_sets,
        "process_information": process_information,
    }

In [None]:
# Example: run the full discovery (retrieval + process dependencies) for a
# purchase-order request.
discover_apis_and_process(
    query="""Create a purchase order for 5 pencils in
    purchasing group 002 and purchasing organization 3000""",
    graph=graph,
    index=index,
    entity_set_uris=entity_set_uris,
    embedding_model=embedding_model,
)