In [6]:
import json 
import os
from langchain.chat_models import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph

## Loading openai key
with open('../environ.json') as f:
    config = json.load(f)
    api_keys = list(config.values())[1]

# assign openai key to an environment variable
os.environ["OPENAI_API_KEY"] = api_keys

# check to confirm


In [3]:
# Define Neo4j connections
from neo4j import GraphDatabase
host = 'bolt://127.0.0.1:7689'
user = 'admin'
password = 'cran2graph'
driver = GraphDatabase.driver(host,auth=(user, password))

def run_query(query, params={}):
    with driver.session(database="neo4j") as session:
        result = session.run(query, params)
        return result.to_df()

In [4]:
# establish connection to neo4j graph database
graph = Neo4jGraph(
    url="bolt://127.0.0.1:7689", username="admin", password="cran2graph", database="neo4j"
)

In [5]:
# check the graph schema
print(graph.schema)


        Node properties are the following:
        [{'labels': 'Person', 'properties': [{'property': 'name', 'type': 'STRING'}]}, {'labels': 'Package', 'properties': [{'property': 'embedding', 'type': 'LIST'}, {'property': 'latestPublishedDate', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'md5sum', 'type': 'STRING'}, {'property': 'description', 'type': 'STRING'}, {'property': 'license', 'type': 'STRING'}, {'property': 'version', 'type': 'STRING'}]}, {'labels': 'Organization', 'properties': [{'property': 'name', 'type': 'STRING'}]}, {'labels': 'License', 'properties': [{'property': 'type', 'type': 'STRING'}]}, {'labels': 'Institution', 'properties': [{'property': 'type', 'type': 'STRING'}]}]
        Relationship properties are the following:
        []
        The relationships are the following:
        ['(:Person)-[:CONTRIBUTED_TO]->(:Package)', '(:Person)-[:WORKS_AT]->(:Institution)', '(:Person)-[:MAINTAINS]->(:Package)', '(:Package)-[:DEPENDS_ON]->(:Pack

In [8]:
query = """

MATCH (p:Package)
MATCH (p)-[r:CONTRIBUTED_TO|MAINTAINS]-(n:Person)
WITH p, type(r) as type, collect(n.name) as names
WITH p, type+": "+reduce(s="", n IN names | s + n + ", ") as types
WITH p, collect(types) as contexts
WITH p, "Package Name: "+ p.name + " Last Published Date: "+p.latestPublishedDate +" Description: "+ p.description +"\n" +
        reduce(s="", c in contexts | s + substring(c, 0, size(c)-2) +"\n") as context
RETURN context Limit 5

"""

In [9]:
print(run_query(query))

                                             context
0  Package Name: A3 Last Published Date: 2015-08-...
1  Package Name: AalenJohansen Last Published Dat...
2  Package Name: AATtools Last Published Date: 20...
3  Package Name: ABACUS Last Published Date: 2019...
4  Package Name: abasequence Last Published Date:...


In [25]:
openai_api_key = api_keys

In [12]:
print(openai_api_key)
run_query("""
CALL apoc.periodic.iterate(
    'MATCH (p:Package) RETURN id(p) as id',
    'MATCH (p:Package)
    WHERE id(p) = id
    MATCH (p)-[r:CONTRIBUTED_TO|MAINTAINS]-(n:Person)
    WITH p, type(r) as type, collect(n.name) as names
    WITH p, type+": "+reduce(s="", n IN names | s + n + ", ") as types
    WITH p, collect(types) as contexts
    WITH p, "Package Name: "+ p.name + " Last Published Date: "+p.latestPublishedDate +" Description: "+ p.description +"\n" +
            reduce(s="", c in contexts | s + substring(c, 0, size(c)-2) +"\n") as context
    CALL apoc.ml.openai.embedding([context], $apiKey) YIELD embedding
    SET p.embedding = embedding',
    {batchSize:1, retries:1, params: {apiKey: $apiKey}})
""", {'apiKey': openai_api_key})['errorMessages'][0]
        
        

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


{'Server returned HTTP response code: 503 for URL: https://api.openai.com/v1/embeddings': 2,
 'Server returned HTTP response code: 502 for URL: https://api.openai.com/v1/embeddings': 1,
 'Server returned HTTP response code: 500 for URL: https://api.openai.com/v1/embeddings': 3}

In [13]:
system_prompt = """
You are an assistant that helps to generate text to form nice and human understandable answers based.
The latest prompt contains the information, and you need to generate a human readable response based on the given information.
Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
Do not add any additional information that is not explicitly provided in the latest prompt.
I repeat, do not add any information that is not explicitly given.
"""

In [14]:
def generate_user_prompt(question, context):
    return f"""
    The question is {question}
    Answer the question by using the provided information:
    {context}
    """

In [18]:
def retrieve_context(question, k=3):
    data = run_query("""
        // retrieve the embedding of the question
        CALL apoc.ml.openai.embedding([$question], $apiKey) YIELD embedding 
        // match relevant packages
        MATCH (p:Package)
        WITH p, gds.similarity.cosine(embedding, p.embedding) AS score
        ORDER BY score DESC
        // LIMIT the number of relevant description
        LIMIT toInteger($k)
        // retrieve graph context 
        MATCH (p)--()--(p1:Package)
        WITH p, p1, count(*) AS count
        ORDER BY count DESC
        WITH p, apoc.text.join(collect(p1.package)[..3], ", ") as similarPackage
        MATCH (p)-[r:CONTRIBUTED_TO|MAINTAINS]-(n:Person)
        WITH p, similarPackage, type(r) as type, collect(n.name) as names
        WITH p, similarPackage, type+": "+reduce(s="", n IN names | s + n + ", ") as types
        WITH p, similarPackage, collect(types) as contexts
        WITH p, similarPackage, "Package Name: "+ p.name + " Last Published Date: "+p.latestPublishedDate +" Description: "+ p.description +"\n" +
                reduce(s="", c in contexts | s + substring(c, 0, size(c)-2) +"\n")  + "similar packages:" + similarPackage + "\n" as context
        RETURN context  
    """, {'question': question, 'k': k, 'apiKey': api_keys})
    return data['context'].to_list()

In [19]:
def generate_answer(question):
    # Retrieve context
    context = retrieve_context(question)
    # Print context
    for c in context:
        print(c)
    # Generate answer
    response = run_query(
        """
  CALL apoc.ml.openai.chat([{role:'system', content: $system},
                      {role: 'user', content: $user}], $apiKey) YIELD value
  RETURN value.choices[0].message.content AS answer
  """,
        {
            "system": system_prompt,
            "user": generate_user_prompt(question, context),
            "apiKey": api_keys,
        },
    )
    return response["answer"][0]

In [20]:
generate_answer("Who maintains dplyr package?")

Package Name: dplyr Last Published Date: 2023-09-03 Description: A fast, consistent tool for working with data frame like
    objects, both in memory and out of memory.
CONTRIBUTED_TO: Davis Vaughan, Hadley Wickham, Romain François, Lionel Henry, Kirill Müller
MAINTAINS: Hadley Wickham
similar packages:

Package Name: crplyr Last Published Date: 2023-03-21 Description: In order to facilitate analysis of datasets hosted on the Crunch
    data platform <https://crunch.io/>, the crplyr package implements dplyr
    methods on top of the Crunch backend. The usual methods select, filter,
    group_by, summarize, and collect are implemented in such a way as to
    perform as much computation on the server and pull as little data locally
    as possible.
MAINTAINS: Greg Freedman Ellis
CONTRIBUTED_TO: Neal Richardson, Mike Malecki, Greg Freedman Ellis, Aljaz Sluga, Jonathan Keane, Gordon Shotwell
similar packages:

Package Name: dtrackr Last Published Date: 2023-09-04 Description: Track and
   

'The dplyr package is maintained by Hadley Wickham.'

In [23]:
generate_answer("what packages are similar to sf package?")

Package Name: sf Last Published Date: 2023-07-11 Description: Support for simple features, a standardized way to
    encode spatial vector data. Binds to GDAL for reading and writing
    data, to GEOS for geometrical operations, and to PROJ for
    projection conversions and datum transformations. Uses by default the s2
    package for spherical geometry operations on ellipsoidal (long/lat) coordinates.
CONTRIBUTED_TO: Hadley Wickham, Ian Cook, Jeroen Ooms, Thomas Lin Pedersen, Tim Keitt, Etienne Racine, Robin Lovelace, Dewey Dunnington, Michael Sumner, Roger Bivand, Dan Baston, Edzer Pebesma, Kirill Müller
MAINTAINS: Edzer Pebesma
similar packages:

Package Name: mapsf Last Published Date: 2023-09-05 Description: Create and integrate thematic maps in your workflow. This package
    helps to design various cartographic representations such as proportional
    symbols, choropleth or typology maps. It also offers several functions to
    display layout elements that improve the graphic p

'Some packages that are similar to the `sf` package are:\n\n1. `mapsf` - This package helps to design various cartographic representations and integrates thematic maps into your workflow. It offers functions to display layout elements that improve the graphic presentation of maps. It maps `sf` objects on base graphics. This package is maintained by Timothée Giraud.\n\n2. `geospark` - This package extends the `sparklyr` package to make distributed geocomputing easier. It provides the same access to simple features like `sf`, but it runs on the Spark distributed system. This package is maintained by Harry Zhu.'

In [24]:
generate_answer("recommend packages for network science analysis?")

Package Name: sna Last Published Date: 2023-01-24 Description: A range of tools for social network analysis, including node and graph-level indices, structural distance and covariance methods, structural equivalence detection, network regression, random graph generation, and 2D/3D network visualization.
MAINTAINS: Carter T. Butts
CONTRIBUTED_TO: Carter T. Butts
similar packages:

Package Name: networktools Last Published Date: 2023-08-22 Description: Includes assorted tools for network analysis. Bridge centrality; goldbricker; MDS, PCA, & eigenmodel network plotting.
MAINTAINS: Payton Jones
CONTRIBUTED_TO: Payton Jones
similar packages:

Package Name: NetworkToolbox Last Published Date: 2021-05-28 Description: Implements network analysis and graph theory measures used in neuroscience, cognitive science, and psychology. Methods include various filtering methods and approaches such as threshold, dependency (Kenett, Tumminello, Madi, Gur-Gershgoren, Mantegna, & Ben-Jacob, 2010 <doi:10.137

'Based on the provided information, I would recommend the following packages for network science analysis:\n\n1. Package Name: sna\n   Last Published Date: 2023-01-24\n   Description: This package provides a range of tools for social network analysis. It includes node and graph-level indices, structural distance and covariance methods, structural equivalence detection, network regression, random graph generation, and 2D/3D network visualization.\n\n2. Package Name: networktools\n   Last Published Date: 2023-08-22\n   Description: This package includes assorted tools for network analysis. It offers features such as bridge centrality, goldbricker, and options for MDS, PCA, and eigenmodel network plotting.\n\n3. Package Name: NetworkToolbox\n   Last Published Date: 2021-05-28\n   Description: This package implements network analysis and graph theory measures used in neuroscience, cognitive science, and psychology. It provides various filtering methods and approaches, such as threshold and