In [5]:
# Load variables from a local .env file into the process environment so that
# later cells can read API keys through os.environ. The call is the cell's
# last expression, so its return value is shown as the cell output.
from dotenv import load_dotenv
load_dotenv()

True

# Diffbot

Maximum of 100,000 characters per document and 1,000,000 total characters per API request.

Up to 10,000 characters per credit (credits are consumed in blocks, so the effective rate will usually be fewer than 10,000 characters per credit).

Free trial of 10,000 credits
= at most 100,000,000 characters (10,000 credits × 10,000 characters per credit)

In [6]:
import os

from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer

# The Diffbot key is taken from the environment; os.getenv yields None when
# the variable is unset, and that value is passed through unchanged.
diffbot_api_key = os.getenv("DIFFBOT_API_KEY")
diffbot_nlp = DiffbotGraphTransformer(
    diffbot_api_key=diffbot_api_key,
)

In [7]:
from langchain.document_loaders import WikipediaLoader

# Pull the Wikipedia pages matching the query, then hand them to the Diffbot
# transformer to be converted into graph documents.
query = "Warren Buffett"
wiki_loader = WikipediaLoader(query=query)
raw_documents = wiki_loader.load()
graph_documents = diffbot_nlp.convert_to_graph_documents(raw_documents)

In [18]:
# Inspect the last converted document and show the metadata of its source page.
last_graph_doc = graph_documents[-1]
# Uncomment to also look at its first extracted node / relationship:
# print(last_graph_doc.nodes[0])
# print(last_graph_doc.relationships[0])
print(last_graph_doc.source.metadata)

{'title': 'Howard Buffett', 'summary': 'Howard Homan Buffett (August 13, 1903 – April 30, 1964) was an American businessman, investor, and politician. He was a four-term Republican United States Representative for the state of Nebraska. He was the father of Warren Buffett, the American billionaire businessman and investor.\n\n', 'source': 'https://en.wikipedia.org/wiki/Howard_Buffett'}


In [9]:
# List the attribute names stored on a graph document instance.
vars(graph_documents[0]).keys()

dict_keys(['nodes', 'relationships', 'source'])

In [1]:
# Number of graph documents produced by the Diffbot conversion; requires the
# cell that defines `graph_documents` to have been run in the current kernel.
len(graph_documents)

NameError: name 'graph_documents' is not defined

In [1]:
import os

from langchain.graphs import Neo4jGraph

# Connection settings are read from the environment (e.g. a local .env file
# loaded with load_dotenv()) so that credentials are never committed with the
# notebook. The URL and username fall back to the local defaults.
url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD")

# Fail fast with an actionable message instead of a confusing auth error later.
if not password:
    raise RuntimeError(
        "NEO4J_PASSWORD is not set; add it to your environment or .env file "
        "and re-run the load_dotenv() cell."
    )

graph = Neo4jGraph(
    url=url,
    username=username,
    password=password,
)



In [2]:
# Write the extracted nodes and relationships into Neo4j.
graph.add_graph_documents(graph_documents)

# Neo4jGraph caches the database schema; refresh it after ingesting so that
# Cypher generation sees the node labels and relationship types that actually
# exist (e.g. EDUCATED_AT) rather than a stale, pre-ingest schema.
graph.refresh_schema()

NameError: name 'graph_documents' is not defined

In [3]:
from langchain.chains import GraphCypherQAChain
from langchain.chat_models import ChatOpenAI

# Both the Cypher-generating model and the answering model use the same
# deterministic settings; each gets its own ChatOpenAI instance.
llm_settings = {"temperature": 0, "model_name": "gpt-3.5-turbo"}

# will want to fork this probably
chain = GraphCypherQAChain.from_llm(
    cypher_llm=ChatOpenAI(**llm_settings),
    qa_llm=ChatOpenAI(**llm_settings),
    graph=graph,
    verbose=True,
)

ValidationError: 1 validation error for ChatOpenAI
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)

In [29]:
# Ask a natural-language question; the chain generates Cypher, queries the
# graph, and the returned answer is displayed as the cell output.
question = "Which university did Warren Buffett attend?"
chain.run(question)





[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person)-[:ATTENDED]->(u:University)
WHERE p.name = 'Warren Buffett'
RETURN u.name[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


'Warren Buffett attended the University of Pennsylvania.'

## Observations

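- Ungrounded answer: the answer expected here was the University of Nebraska, where Warren Buffett completed his undergraduate degree. He also studied at the University of Pennsylvania beforehand, so the reply was not strictly false, but it came from the model's own knowledge rather than from the graph (maybe it'd do better with gpt-4?)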
- The generated cypher query references the relationship type ATTENDED, but it's actually EDUCATED_AT
  - does the LLM not have access to the available types?
- As a result, no context was returned, and thus the LLM hallucinated a response

## Ideas for improvements
- Instead, the agent should keep trying queries until something is returned, and fail once a maximum number of attempts is reached.
  - could look for matching node types first, then show all relationships connected to that node, or vice-versa
- Fail if no context
  - ideally would then look up via other sources to fill in the knowledge graph
- store text chunks directly in neo4j as nodes with their embeddings, and link as evidence to instantiated nodes
- can have continuously running agents navigating the graph and correcting for errors (e.g. duplicate nodes, incorrect relationships, stale data, etc.)
  - corrections can be sent to humans for approval
- can have agents that continuously update the graph with new information that seems relevant to recently added nodes
- create embeddings for all node and relationship names, and use semantic similarity to assist in cypher query construction
- at some point we will need to evaluate the maximum size (presumably of the graph) that this approach can handle


