In [1]:
import json 
import os
from langchain.chat_models import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph

# Load the OpenAI API key from the local JSON config file.
with open('../environ.json', encoding='utf-8') as f:
    config = json.load(f)

if not config:
    raise ValueError("'../environ.json' has no entries; expected the OpenAI API key")

# Prefer an explicitly named entry so that adding other settings to the file
# cannot silently change which value is used; fall back to the first value
# for config files that store the key under a different name.
api_keys = config.get("OPENAI_API_KEY") or next(iter(config.values()))

# Expose the key as an environment variable so the OpenAI client created
# later in the notebook (without an explicit key argument) can pick it up.
os.environ["OPENAI_API_KEY"] = api_keys

# Deliberately not echoing the key here: cell output is saved with the notebook.


In [3]:
# Establish the connection to the Neo4j graph database.
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into the notebook; the defaults below
# are the values this notebook was originally run with (a localhost instance).
graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI", "bolt://localhost:7689"),
    username=os.environ.get("NEO4J_USERNAME", "admin"),
    password=os.environ.get("NEO4J_PASSWORD", "cran2graph"),
    database=os.environ.get("NEO4J_DATABASE", "neo4j"),
)
# Print the graph schema exposed by the connection object.
print(graph.schema)


        Node properties are the following:
        [{'labels': 'Person', 'properties': [{'property': 'name', 'type': 'STRING'}]}, {'labels': 'Package', 'properties': [{'property': 'name', 'type': 'STRING'}, {'property': 'latestPublishedDate', 'type': 'STRING'}, {'property': 'md5sum', 'type': 'STRING'}, {'property': 'description', 'type': 'STRING'}, {'property': 'license', 'type': 'STRING'}, {'property': 'version', 'type': 'STRING'}]}, {'labels': 'Organization', 'properties': [{'property': 'name', 'type': 'STRING'}]}, {'labels': 'License', 'properties': [{'property': 'type', 'type': 'STRING'}]}, {'labels': 'Institution', 'properties': [{'property': 'type', 'type': 'STRING'}]}]
        Relationship properties are the following:
        []
        The relationships are the following:
        ['(:Person)-[:CONTRIBUTED_TO]->(:Package)', '(:Person)-[:WORKS_AT]->(:Institution)', '(:Person)-[:MAINTAINS]->(:Package)', '(:Package)-[:DEPENDS_ON]->(:Package)', '(:Package)-[:LICENSE_BY]->(:License

In [4]:
# Print the graph schema (node labels with their properties, relationship
# properties, and relationship patterns) to see what the chain can query.
# NOTE(review): the connection cell above already prints the same schema.
print(graph.schema)


        Node properties are the following:
        [{'labels': 'Person', 'properties': [{'property': 'name', 'type': 'STRING'}]}, {'labels': 'Package', 'properties': [{'property': 'name', 'type': 'STRING'}, {'property': 'latestPublishedDate', 'type': 'STRING'}, {'property': 'md5sum', 'type': 'STRING'}, {'property': 'description', 'type': 'STRING'}, {'property': 'license', 'type': 'STRING'}, {'property': 'version', 'type': 'STRING'}]}, {'labels': 'Organization', 'properties': [{'property': 'name', 'type': 'STRING'}]}, {'labels': 'License', 'properties': [{'property': 'type', 'type': 'STRING'}]}, {'labels': 'Institution', 'properties': [{'property': 'type', 'type': 'STRING'}]}]
        Relationship properties are the following:
        []
        The relationships are the following:
        ['(:Person)-[:CONTRIBUTED_TO]->(:Package)', '(:Person)-[:WORKS_AT]->(:Institution)', '(:Person)-[:MAINTAINS]->(:Package)', '(:Package)-[:DEPENDS_ON]->(:Package)', '(:Package)-[:LICENSE_BY]->(:License

In [5]:
# Build the question-answering chain over the graph. With verbose=True each
# run prints the generated Cypher and the rows returned by the graph before
# the final answer.
chain = GraphCypherQAChain.from_llm(
    llm=ChatOpenAI(temperature=0),
    graph=graph,
    verbose=True,
)

In [9]:
# List the packages linked to this person through CONTRIBUTED_TO.
# NOTE(review): in the recorded run the final answer names only two packages
# although the returned context contains more; verify whether the context is
# being cut short before the answer is generated.
chain.run("list all packages Kirill Müller contributed to")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (:Person{name:'Kirill Müller'})-[:CONTRIBUTED_TO]->(p:Package)
RETURN p;[0m
Full Context:
[32;1m[1;3m[{'p': {'license': 'GPL-2', 'latestPublishedDate': '2021-12-06', 'md5sum': 'b373f83a971b68402825e699fce7cb4d', 'name': 'sessioninfo', 'description': 'Query and print information about the current R\n    session.  It is similar to utils::sessionInfo(), but includes more\n    information about packages, and where they were installed from.', 'version': '1.2.2'}}, {'p': {'license': 'MIT + file LICENSE', 'latestPublishedDate': '2023-03-20', 'md5sum': 'e4ccfbe28c84b11f86388379c8b43925', 'name': 'tibble', 'description': 'Provides a tbl_df class (the tibble) with stricter checking and better formatting than the traditional\n    data frame.', 'version': '3.2.1'}}, {'p': {'license': 'MIT + file LICENSE', 'latestPublishedDate': '2018-03-13', 'md5sum': 'cfa02c563196a79bf8bb4db2e66585fd', 'name': 'bindr', 'd

'Kirill Müller has contributed to the following packages:\n- sessioninfo\n- skimr'

In [10]:
# Count question: the recorded run answered this with a COUNT over the
# person's MAINTAINS relationships.
chain.run("How many packages is maintained by Kirill Müller?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (:Person{name:'Kirill Müller'})-[:MAINTAINS]->(p:Package)
RETURN COUNT(p)[0m
Full Context:
[32;1m[1;3m[{'COUNT(p)': 28}][0m

[1m> Finished chain.[0m


'Kirill Müller maintains 28 packages.'

In [9]:
# Compound question: count, package names and publication dates in one request.
# NOTE(review): the recorded Cypher returns COUNT(pkg) next to per-package
# columns, so packageCount is 1 on every row; the total in the answer comes
# from the model summarising the rows, not from the query itself.
chain.run("how many packages does Jing Zhang maintains and the names and when it was published?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {name: 'Jing Zhang'})-[:MAINTAINS]->(pkg:Package)
RETURN COUNT(pkg) AS packageCount, pkg.name AS packageName, pkg.latestPublishedDate AS publishedDate[0m
Full Context:
[32;1m[1;3m[{'packageCount': 1, 'packageName': 'ggrisk', 'publishedDate': '2021-08-09'}, {'packageCount': 1, 'packageName': 'do', 'publishedDate': '2021-08-03'}, {'packageCount': 1, 'packageName': 'msig', 'publishedDate': '2021-06-29'}, {'packageCount': 1, 'packageName': 'base.rms', 'publishedDate': '2020-08-01'}, {'packageCount': 1, 'packageName': 'cutoff', 'publishedDate': '2019-12-20'}, {'packageCount': 1, 'packageName': 'nomogramFormula', 'publishedDate': '2020-01-28'}][0m

[1m> Finished chain.[0m


"Jing Zhang maintains a total of 6 packages. The names of the packages are 'ggrisk', 'do', 'msig', 'base.rms', 'cutoff', and 'nomogramFormula'. Here are the published dates for each package: 'ggrisk' was published on August 9, 2021, 'do' was published on August 3, 2021, 'msig' was published on June 29, 2021, 'base.rms' was published on August 1, 2020, 'cutoff' was published on December 20, 2019, and 'nomogramFormula' was published on January 28, 2020."

In [11]:
# Simple lookup: package name(s) a given person contributed to.
chain.run("Name of package Dennis Irorere contributed to")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (:Person {name: 'Dennis Irorere'})-[:CONTRIBUTED_TO]->(p:Package)
RETURN p.name[0m
Full Context:
[32;1m[1;3m[{'p.name': 'anyflights'}][0m

[1m> Finished chain.[0m


'Dennis Irorere contributed to the package named "anyflights."'

In [14]:
# Ask for both the number of contributed packages and their names.
# NOTE(review): the question hints that the name may be misspelt, but the
# recorded Cypher still matches the name exactly, so no fuzzy matching happens.
chain.run("how many packages does Kirill Müller contributes and list the packages. I might have spelt name wrong")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {name: 'Kirill Müller'})-[:CONTRIBUTED_TO]->(pkg:Package)
RETURN COUNT(pkg) AS numberOfPackages, COLLECT(pkg.name) AS packageList[0m
Full Context:
[32;1m[1;3m[{'numberOfPackages': 54, 'packageList': ['RSQLite', 'here', 'RMariaDB', 'DBI', 'chk', 'roxygen2md', 'mlfit', 'StrucDiv', 'term', 'dm', 'DBItest', 'log4r', 'hms', 'skimr', 'bindrcpp', 'desc', 'indiedown', 'enc', 'knitrProgressBar', 'fledge', 'cli', 'tibble', 'universals', 'tibblify', 'styler', 'sessioninfo', 'tikzDevice', 'pillar', 'duckplyr', 'winch', 'withr', 'memoise', 'wrswoR', 'lest', 'drake', 'MakefileR', 'optparse', 'procmaps', 'traudem', 'utf8', 'dplyr', 'kimisc', 'extras', 'plogr', 'covr', 'profile', 'bindr', 'RPostgres', 'whereami', 'blob', 'sf', 'mockr', 'igraph', 'rprojroot']}][0m

[1m> Finished chain.[0m


'Kirill Müller contributes to a total of 54 packages. Some of the packages he contributes to include RSQLite, here, RMariaDB, DBI, chk, roxygen2md, mlfit, StrucDiv, term, dm, DBItest, log4r, hms, skimr, bindrcpp, desc, indiedown, enc, knitrProgressBar, fledge, cli, tibble, universals, tibblify, styler, sessioninfo, tikzDevice, pillar, duckplyr, winch, withr, memoise, wrswoR, lest, drake, MakefileR, optparse, procmaps, traudem, utf8, dplyr, kimisc, extras, plogr, covr, profile, bindr, RPostgres, whereami, blob, sf, mockr, igraph, and rprojroot.'

In [10]:
# Recommendation question based on the package description text.
# NOTE(review): the recorded Cypher filtered with
# `CONTAINS 'osm data processing'` and returned no rows, so the recorded
# answer is not backed by any data from the graph.
chain.run("what package will you recommend for osm data processing using the description ")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Package)
WHERE p.description CONTAINS 'osm data processing'
RETURN p.name[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"I'm sorry, but without any provided information, I don't have a specific package to recommend for OSM data processing. However, there are several popular packages available for this purpose, such as Osmosis, osmium-tool, and GDAL. These packages offer various functionalities for working with OSM data, including data extraction, filtering, and conversion. It would be helpful to consider your specific requirements and the programming language you are using to determine the most suitable package for your needs."

In [13]:
# Find the package with the most dependents and who maintains it.
# NOTE(review): the recorded run returned an empty context; possibly the
# top-ranked dependency has no MAINTAINS relationship. TODO confirm.
chain.run("what dependency has the most package depending on it and the maintainer")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Package)-[:DEPENDS_ON]->(d:Package)
WITH d, COUNT(p) AS dependencyCount
ORDER BY dependencyCount DESC
LIMIT 1
MATCH (d)<-[:MAINTAINS]-(m:Person)
RETURN d.name AS dependency, m.name AS maintainer[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"I'm sorry, but I don't have access to the information about the dependency with the most packages depending on it and its maintainer."

In [46]:
# Ask for an alternative to dplyr based on package descriptions.
# NOTE(review): the recorded Cypher uses a `Dependency` label and a `package`
# property, neither of which appears in the schema printed earlier in this
# notebook, and this cell's execution count is out of sequence; the output
# may come from a different graph state. TODO confirm by re-running.
chain.run("using description properties recommend an alternative package for dplyr")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p1:Package)-[:DEPENDS_ON]->(d:Dependency)<-[:DEPENDS_ON]-(p2:Package)
WHERE p1.package = 'dplyr'
RETURN p2.package, p2.description[0m
Full Context:
[32;1m[1;3m[{'p2.package': 'dials', 'p2.description': 'Many models contain tuning parameters (i.e. parameters that\n    cannot be directly estimated from the data). These tools can be used\n    to define objects for creating, simulating, or validating values for\n    such parameters.'}, {'p2.package': 'googledrive', 'p2.description': 'Manage Google Drive files from R.'}, {'p2.package': 'retroharmonize', 'p2.description': 'Assist in reproducible retrospective (ex-post) harmonization\n    of data, particularly individual level survey data, by providing tools\n    for organizing metadata, standardizing the coding of variables, and\n    variable names and value labels, including missing values, and\n    documenting the data transformations, with the he

'Based on the description properties, an alternative package for dplyr could be "dbplyr". It is a dplyr back end for databases that allows you to work with remote database tables as if they are in-memory data frames. While basic features work with any database that has a DBI back end, more advanced features require SQL translation to be provided by the package author.'