In [1]:
import json 
import os
from langchain.chat_models import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph

# Read the OpenAI key from the local JSON config file; the first value
# stored in that file is taken to be the key.
with open('../environ.json') as config_file:
    config = json.load(config_file)

api_keys = list(config.values())[0]

# Publish the key through the OPENAI_API_KEY environment variable.
os.environ["OPENAI_API_KEY"] = api_keys

# # check to confirm

In [2]:
# Define Neo4j connections
from neo4j import GraphDatabase

# The connection settings can be overridden with NEO4J_URI / NEO4J_USERNAME /
# NEO4J_PASSWORD so real credentials do not have to live in the notebook.
# The fallbacks are the values this notebook was originally written with.
host = os.environ.get('NEO4J_URI', 'bolt://127.0.0.1:7687')
user = os.environ.get('NEO4J_USERNAME', 'admin')
password = os.environ.get('NEO4J_PASSWORD', 'cran2graph')
driver = GraphDatabase.driver(host, auth=(user, password))

def run_query(query, params=None, database="cran"):
    """Run a Cypher query and return its result as a DataFrame.

    Args:
        query: Cypher statement to execute.
        params: Optional mapping of query parameters; treated as empty
            when omitted.
        database: Name of the database the session is opened against.

    Returns:
        Whatever ``Result.to_df()`` produces for the executed query.
    """
    # Create the dict per call instead of sharing one mutable default object
    # across every invocation.
    if params is None:
        params = {}
    # The conversion happens inside the ``with`` block, i.e. before the
    # session is closed.
    with driver.session(database=database) as session:
        result = session.run(query, params)
        return result.to_df()

In [3]:
# establish connection to neo4j graph database, reusing the host / user /
# password variables set up for the driver so the settings are declared once
# NOTE(review): this connects to database "neo4j" while run_query opens its
# sessions on "cran" — confirm the two are meant to differ.
graph = Neo4jGraph(
    url=host, username=user, password=password, database="neo4j"
)

In [4]:
# check the graph schema
# (prints the schema text exposed by the Neo4jGraph connection created above)
print(graph.schema)


        Node properties are the following:
        [{'labels': 'Person', 'properties': [{'property': 'name', 'type': 'STRING'}]}, {'labels': 'Package', 'properties': [{'property': 'latestPublishedDate', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'md5sum', 'type': 'STRING'}, {'property': 'description', 'type': 'STRING'}, {'property': 'license', 'type': 'STRING'}, {'property': 'version', 'type': 'STRING'}]}, {'labels': 'Organization', 'properties': [{'property': 'name', 'type': 'STRING'}]}, {'labels': 'License', 'properties': [{'property': 'type', 'type': 'STRING'}]}, {'labels': 'Institution', 'properties': [{'property': 'type', 'type': 'STRING'}]}]
        Relationship properties are the following:
        []
        The relationships are the following:
        ['(:Person)-[:CONTRIBUTED_TO]->(:Package)', '(:Person)-[:WORKS_AT]->(:Institution)', '(:Person)-[:MAINTAINS]->(:Package)', '(:Package)-[:DEPENDS_ON]->(:Package)', '(:Package)-[:LICENSE_BY]->(:License

In [5]:
# Cypher that builds the text context for a package: its name, last published
# date and description, followed by one line per relationship type
# (CONTRIBUTED_TO / MAINTAINS) listing the related person names. The
# substring(...) call trims the trailing ", " left by the reduce, and
# "Limit 1" keeps the result to a single row.
# NOTE(review): this string is only assigned here; no visible cell runs it.
query = """

MATCH (p:Package)
MATCH (p)-[r:CONTRIBUTED_TO|MAINTAINS]-(n:Person)
WITH p, type(r) as type, collect(n.name) as names
WITH p, type+": "+reduce(s="", n IN names | s + n + ", ") as types
WITH p, collect(types) as contexts
WITH p, "Package Name: "+ p.name + " Last Published Date: "+p.latestPublishedDate +" Description: "+ p.description +"\n" +
        reduce(s="", c in contexts | s + substring(c, 0, size(c)-2) +"\n") as context
RETURN context Limit 1

"""

In [6]:
# Cypher that visits every Package node in batches of one, rebuilds its text
# context, requests an OpenAI embedding for that text through APOC, and stores
# the returned vector on the node as `embedding`.
embed_packages_cypher = """
CALL apoc.periodic.iterate(
    'MATCH (p:Package) RETURN id(p) as id',
    'MATCH (p:Package)
    WHERE id(p) = id
    MATCH (p)-[r:CONTRIBUTED_TO|MAINTAINS]-(n:Person)
    WITH p, type(r) as type, collect(n.name) as names
    WITH p, type+": "+reduce(s="", n IN names | s + n + ", ") as types
    WITH p, collect(types) as contexts
    WITH p, "Package Name: "+ p.name + " Last Published Date: "+p.latestPublishedDate +" Description: "+ p.description +"\n" +
            reduce(s="", c in contexts | s + substring(c, 0, size(c)-2) +"\n") as context
    CALL apoc.ml.openai.embedding([context], $apiKey) YIELD embedding
    SET p.embedding = embedding',
    {batchSize:1, retries:3, params: {apiKey: $apiKey}})
"""
embed_summary = run_query(embed_packages_cypher, {'apiKey': api_keys})
# Display the first row's errorMessages entry as the cell output.
embed_summary['errorMessages'][0]
        
        

In [4]:
# System-role message passed to the chat call in generate_answer: it tells the
# model to phrase an answer from the supplied information only and not to add
# anything that was not provided.
system_prompt = """
You are an assistant that helps to generate text to form nice and human understandable answers based.
The latest prompt contains the information, and you need to generate a human readable response based on the given information.
Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
Do not add any additional information that is not explicitly provided in the latest prompt.
I repeat, do not add any information that is not explicitly given.
"""

In [39]:
def generate_user_prompt(question, context):
    """Build the user message that pairs a question with its supporting context.

    Args:
        question: The question text.
        context: Supporting information, either a single string or a
            list/tuple of snippets (retrieve_context returns a list).

    Returns:
        The formatted prompt string.
    """
    # A list interpolated directly into the f-string would render as its
    # Python repr (brackets, quotes, escaped newlines); join the snippets so
    # the prompt contains plain text instead.
    if isinstance(context, (list, tuple)):
        context = "\n".join(str(snippet) for snippet in context)
    return f"""
    The question is {question}
    Answer the question by using the provided information:
    {context}
    """

In [40]:
def retrieve_context(question, k=3):
    """Return text contexts for the k packages most similar to the question.

    The question is embedded through APOC's OpenAI procedure and compared by
    cosine similarity with each package's stored `embedding`. The top-k
    packages are then described together with up to three packages reachable
    in two hops and the people linked via CONTRIBUTED_TO / MAINTAINS.

    Args:
        question: Natural-language question to embed.
        k: Number of packages kept after ranking by similarity.

    Returns:
        A list of context strings, most similar package first.
    """
    # NOTE(review): the Cypher reads p.package, p.published and n.person, while
    # the schema printed earlier in this notebook and the embedding cell use
    # name / latestPublishedDate — confirm the property names against the
    # "cran" database before relying on this.
    data = run_query("""
        // retrieve the embedding of the question
        CALL apoc.ml.openai.embedding([$question], $apiKey) YIELD embedding
        // match relevant packages; packages without a stored embedding are
        // skipped because a null score sorts first under ORDER BY ... DESC
        MATCH (p:Package)
        WHERE p.embedding IS NOT NULL
        WITH p, gds.similarity.cosine(embedding, p.embedding) AS score
        ORDER BY score DESC
        // LIMIT the number of relevant description
        LIMIT toInteger($k)
        // retrieve graph context (score is carried along to restore the ranking)
        MATCH (p)--()--(p1:Package)
        WITH p, score, p1, count(*) AS count
        ORDER BY count DESC
        WITH p, score, apoc.text.join(collect(p1.package)[..3], ", ") as similarPackage
        MATCH (p)-[r:CONTRIBUTED_TO|MAINTAINS]-(n:Person)
        WITH p, score, similarPackage, type(r) as type, collect(n.person) as names
        WITH p, score, similarPackage, type+": "+reduce(s="", n IN names | s + n + ", ") as types
        WITH p, score, similarPackage, collect(types) as contexts
        WITH p, score, similarPackage, "Package Name: "+ p.package + " year: "+p.published +" Description: "+ p.description +"\n" +
                reduce(s="", c in contexts | s + substring(c, 0, size(c)-2) +"\n")  + "similar packages:" + similarPackage + "\n" as context
        // the aggregations above do not guarantee row order, so sort again
        RETURN context, score
        ORDER BY score DESC
    """, {'question': question, 'k': k, 'apiKey': api_keys})
    return data['context'].to_list()

In [41]:
def generate_answer(question):
    """Answer a question from retrieved package context via APOC's OpenAI chat call.

    The retrieved context snippets are printed, then sent to the chat
    procedure together with the question and the system prompt. The message
    content of the first returned choice is the result.
    """
    snippets = retrieve_context(question)
    for snippet in snippets:
        print(snippet)

    # Two-message conversation: the fixed system prompt plus the user prompt.
    chat_cypher = """
  CALL apoc.ml.openai.chat([{role:'system', content: $system},
                      {role: 'user', content: $user}], $apiKey) YIELD value
  RETURN value.choices[0].message.content AS answer
  """
    chat_params = {
        "system": system_prompt,
        "user": generate_user_prompt(question, snippets),
        "apiKey": api_keys,
    }
    answer_frame = run_query(chat_cypher, chat_params)
    return answer_frame["answer"][0]

In [42]:
# Example question: the call prints each retrieved context and the cell then
# displays the returned answer string.
generate_answer("Who maintains dplyr package?")

Package Name: dplyr year: 2023-09-03 Description: A fast, consistent tool for working with data frame like
    objects, both in memory and out of memory.
CONTRIBUTED_TO: Hadley Wickham, fnd], Posit Software, Davis Vaughan, Kirill Müller, PBC [cph, Lionel Henry, Romain François
similar packages:googledrive, rsample, recipes

Package Name: dtplyr year: 2023-03-22 Description: Provides a data.table backend for dplyr. The goal of
    dtplyr is to allow you to write dplyr code that is automatically
    translated to the equivalent, but usually much faster, data.table
    code.
CONTRIBUTED_TO: Hadley Wickham, Mark Fairbanks, Ryan Dickerson, fnd], Posit Software, PBC [cph, Maximilian Girlich
similar packages:recipes, tidyr, tune

Package Name: dtrackr year: 2023-09-04 Description: Track and
    document dplyr data pipelines. As you filter, mutate, and join your
    way through a data set, dtrackr seamlessly keeps track of your data
    flow and makes publication ready documentation of a data 

'The dplyr package is maintained by Hadley Wickham, fnd], Posit Software, Davis Vaughan, Kirill Müller, PBC [cph, Lionel Henry, and Romain François. These individuals and organizations contribute to the development and maintenance of the package.'