In [1]:
import json 
import os
from langchain.chat_models import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph

## Load the OpenAI API key from a local JSON config file kept outside the notebook
with open('../environ.json', encoding='utf-8') as f:
    config = json.load(f)

# Fail with a readable message instead of an IndexError when the file has no entries
if not config:
    raise ValueError("../environ.json contains no entries; expected the OpenAI API key")

# Prefer an explicitly named entry so the result does not depend on the order of
# keys in the file; fall back to the first value (the previous behaviour) so
# existing config files keep working.
api_keys = config.get("OPENAI_API_KEY", list(config.values())[0])

# Expose the key through the environment variable the OpenAI client looks up
os.environ["OPENAI_API_KEY"] = api_keys

# Do not echo the key to confirm it loaded: cell output is saved with the notebook.


In [3]:
# Establish the connection to the Neo4j graph database.
# Settings are taken from the environment when present (NEO4J_URI, NEO4J_USERNAME,
# NEO4J_PASSWORD, NEO4J_DATABASE); the fallbacks are the values this notebook
# used before, so it still runs unchanged on the original local setup.
# NOTE(review): the fallback password is still committed here for backward
# compatibility; rotate it and drop the default once NEO4J_PASSWORD is set
# wherever this notebook runs.
graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    username=os.environ.get("NEO4J_USERNAME", "admin"),
    password=os.environ.get("NEO4J_PASSWORD", "cran2graph"),
    database=os.environ.get("NEO4J_DATABASE", "neo4j"),
)

In [4]:
# Inspect the node labels, properties and relationships the graph object reports;
# print() is used so the multi-line schema text is shown with real line breaks.
schema_text = graph.schema
print(schema_text)


        Node properties are the following:
        [{'labels': 'Person', 'properties': [{'property': 'name', 'type': 'STRING'}]}, {'labels': 'Package', 'properties': [{'property': 'latestPublishedDate', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'md5sum', 'type': 'STRING'}, {'property': 'description', 'type': 'STRING'}, {'property': 'license', 'type': 'STRING'}, {'property': 'version', 'type': 'STRING'}]}, {'labels': 'Organization', 'properties': [{'property': 'name', 'type': 'STRING'}]}, {'labels': 'License', 'properties': [{'property': 'type', 'type': 'STRING'}]}, {'labels': 'Institution', 'properties': [{'property': 'type', 'type': 'STRING'}]}]
        Relationship properties are the following:
        []
        The relationships are the following:
        ['(:Person)-[:WORKS_AT]->(:Institution)', '(:Person)-[:CONTRIBUTED_TO]->(:Package)', '(:Person)-[:MAINTAINS]->(:Package)', '(:Package)-[:LICENSE_BY]->(:License)', '(:Package)-[:SUPPORTED_BY]->(:Organ

In [5]:
# Build the graph question-answering chain on top of `graph`.
# temperature=0 asks the chat model for the least random completions;
# verbose=True makes each run print the generated Cypher and the retrieved
# context, which is what the outputs of the question cells below show.
llm = ChatOpenAI(temperature=0)
chain = GraphCypherQAChain.from_llm(llm, graph=graph, verbose=True)

In [8]:
# Ask for the packages of one contributor, identified by exact name
question = "list all packages Kirill Müller contributed to"
chain.run(question)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {name: 'Kirill Müller'})-[:CONTRIBUTED_TO]->(pkg:Package)
RETURN pkg.name[0m
Full Context:
[32;1m[1;3m[{'pkg.name': 'RSQLite'}, {'pkg.name': 'here'}, {'pkg.name': 'RMariaDB'}, {'pkg.name': 'DBI'}, {'pkg.name': 'chk'}, {'pkg.name': 'roxygen2md'}, {'pkg.name': 'mlfit'}, {'pkg.name': 'StrucDiv'}, {'pkg.name': 'term'}, {'pkg.name': 'dm'}][0m

[1m> Finished chain.[0m


'Kirill Müller has contributed to the following packages: RSQLite, here, RMariaDB, DBI, chk, roxygen2md, mlfit, StrucDiv, term, and dm.'

In [41]:
# Ask for a count plus per-package details for one maintainer.
# NOTE(review): the recorded output of this cell queries `person`, `package` and
# `published` properties, none of which appear in the schema printed earlier
# (`name`, `latestPublishedDate`), and its execution count (41) is out of
# sequence. It is presumably a stale run against an older graph; re-run before
# relying on it.
question = "how many packages does Jing Zhang maintains and the names and when it was published?"
chain.run(question)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {person: 'Jing Zhang'})-[:MAINTAINS]->(pkg:Package)
RETURN COUNT(pkg) AS packageCount, pkg.package AS packageName, pkg.published AS publishedDate[0m
Full Context:
[32;1m[1;3m[{'packageCount': 1, 'packageName': 'do', 'publishedDate': '2021-08-03'}, {'packageCount': 1, 'packageName': 'nomogramFormula', 'publishedDate': '2020-01-28'}][0m

[1m> Finished chain.[0m


'Jing Zhang maintains two packages. The first package is called "do" and it was published on August 3, 2021. The second package is called "nomogramFormula" and it was published on January 28, 2020.'

In [42]:
# Ask for the package a single contributor worked on.
# NOTE(review): the recorded Cypher filters on `p.person` and returns
# `pkg.package`, which do not match the `name` properties in the schema printed
# earlier. This is presumably output from an older graph; re-run to refresh it.
question = "Name of package Dennis Irorere contributed to"
chain.run(question)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person)-[:CONTRIBUTED_TO]->(pkg:Package)
WHERE p.person = 'Dennis Irorere'
RETURN pkg.package[0m
Full Context:
[32;1m[1;3m[{'pkg.package': 'anyflights'}][0m

[1m> Finished chain.[0m


'The name of the package that Dennis Irorere contributed to is "anyflights".'

In [43]:
# Probe how the chain copes with a misspelled name ("Muller" vs "Müller").
# NOTE(review): in the recorded run the generated Cypher still matches the
# literal string 'Kirill Muller' on a `person` property, so the hint in the
# prompt did not produce a fuzzy match. That property is also absent from the
# schema printed earlier, so the output is presumably stale.
question = "how many packages does Kirill Muller contributes. I might have spelt name wrong"
chain.run(question)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {person: 'Kirill Muller'})-[:CONTRIBUTED_TO]->(pkg:Package)
RETURN COUNT(pkg)[0m
Full Context:
[32;1m[1;3m[{'COUNT(pkg)': 2}][0m

[1m> Finished chain.[0m


'Kirill Muller contributes 2 packages.'

In [10]:
# Ask for a recommendation based on package descriptions.
# NOTE(review): in the recorded run the generated Cypher does a literal
# `CONTAINS 'osm data processing'` match and retrieves an empty context, so the
# packages named in the answer do not come from the graph.
question = "what package will you recommend for osm data processing using the description "
chain.run(question)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Package)
WHERE p.description CONTAINS 'osm data processing'
RETURN p.name[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"I'm sorry, but without any provided information, I don't have a specific package to recommend for OSM data processing. However, there are several popular packages available for this purpose, such as Osmosis, osmium-tool, and GDAL. These packages offer various functionalities for working with OSM data, including data extraction, filtering, and conversion. It would be helpful to consider your specific requirements and the programming language you are using to determine the most suitable package for your needs."

In [11]:
# Ask for the most depended-upon package and who maintains it.
# NOTE(review): the recorded run retrieved an empty context. The generated
# Cypher relies on a Package-to-Package DEPENDS_ON relationship; confirm it
# exists in the graph (the relationship list in the printed schema is cut off).
question = "what dependency has the most package depending on it and the maintainer"
chain.run(question)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p1:Package)-[:DEPENDS_ON]->(p2:Package)
WITH p2, COUNT(p1) AS dependencyCount
ORDER BY dependencyCount DESC
LIMIT 1
MATCH (p2)<-[:MAINTAINS]-(m:Person)
RETURN p2.name AS dependency, m.name AS maintainer[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"I'm sorry, but I don't have access to the information about the dependency with the most packages depending on it and its maintainer."

In [46]:
# Ask for an alternative to dplyr based on package descriptions.
# NOTE(review): the recorded Cypher uses a `Dependency` label and a `package`
# property, neither of which appears in the schema printed earlier. This is
# presumably output from an older graph; re-run to refresh it.
question = "using description properties recommend an alternative package for dplyr"
chain.run(question)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p1:Package)-[:DEPENDS_ON]->(d:Dependency)<-[:DEPENDS_ON]-(p2:Package)
WHERE p1.package = 'dplyr'
RETURN p2.package, p2.description[0m
Full Context:
[32;1m[1;3m[{'p2.package': 'dials', 'p2.description': 'Many models contain tuning parameters (i.e. parameters that\n    cannot be directly estimated from the data). These tools can be used\n    to define objects for creating, simulating, or validating values for\n    such parameters.'}, {'p2.package': 'googledrive', 'p2.description': 'Manage Google Drive files from R.'}, {'p2.package': 'retroharmonize', 'p2.description': 'Assist in reproducible retrospective (ex-post) harmonization\n    of data, particularly individual level survey data, by providing tools\n    for organizing metadata, standardizing the coding of variables, and\n    variable names and value labels, including missing values, and\n    documenting the data transformations, with the he

'Based on the description properties, an alternative package for dplyr could be "dbplyr". It is a dplyr back end for databases that allows you to work with remote database tables as if they are in-memory data frames. While basic features work with any database that has a DBI back end, more advanced features require SQL translation to be provided by the package author.'