In [1]:
import os
import json
from dotenv import load_dotenv
from langchain.text_splitter import NLTKTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.graphs import Neo4jGraph
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_functions import PydanticOutputFunctionsParser
from langchain_community.utils.openai_functions import convert_pydantic_to_openai_function
from langchain.chains import GraphCypherQAChain

import nltk
# Fetch the NLTK 'punkt' package (the log below reports it as already up-to-date).
# NOTE(review): presumably needed by NLTKTextSplitter for sentence tokenization — confirm.
_ = nltk.download('punkt')

# Load variables from a .env file into the process environment; override=True lets
# values from .env replace variables that are already set. Assigning to `_` keeps the
# boolean return value from being echoed as the cell output.
_ = load_dotenv(override=True)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cristhiansilvac./nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Document loader and prep

In [2]:
# NLTK-backed text splitter with a requested chunk size of 500.
# The recorded run shows it still emits larger chunks (up to 892), so 500 is a target, not a hard cap.
splitter = NLTKTextSplitter(chunk_size=500)

In [3]:
# Load the book PDF (path is relative to the notebook's working directory).
loader = PyPDFLoader("../A Brief History of Time: From the Big Bang to Black Holes.pdf")
data = loader.load()

# Split the loaded documents into smaller chunks with the splitter defined above.
# NOTE(review): the (Op1) extraction loop further down iterates `data`, not `chunks` — confirm which is intended.
chunks = splitter.split_documents(data)

Created a chunk of size 586, which is longer than the specified 500
Created a chunk of size 625, which is longer than the specified 500
Created a chunk of size 539, which is longer than the specified 500
Created a chunk of size 511, which is longer than the specified 500
Created a chunk of size 817, which is longer than the specified 500
Created a chunk of size 581, which is longer than the specified 500
Created a chunk of size 599, which is longer than the specified 500
Created a chunk of size 668, which is longer than the specified 500
Created a chunk of size 530, which is longer than the specified 500
Created a chunk of size 892, which is longer than the specified 500
Created a chunk of size 504, which is longer than the specified 500
Created a chunk of size 546, which is longer than the specified 500
Created a chunk of size 504, which is longer than the specified 500
Created a chunk of size 514, which is longer than the specified 500
Created a chunk of size 541, which is longer tha

In [4]:
# Sanity check: number of chunks produced by the splitter (1156 in the recorded run).
len(chunks)

1156

# Clean Chunks

In [5]:
import re

def clean_text(text: str) -> str:
    """Normalize text: lowercase it, fold common accented vowels, and strip symbols.

    Steps, in order:
      1. Lowercase the whole string.
      2. Replace the accented vowels á/ä, é/ë, í/ï, ó/ö, ú/ü with their plain letter.
      3. Replace double quotes with single quotes.
      4. Remove every character that is not an ASCII letter, digit, whitespace,
         or one of ``- . , ; : '``.

    Args:
        text: The raw text to normalize.

    Returns:
        The normalized text (an empty string stays empty).
    """
    # One translation table instead of five separate regex passes.
    accent_map = str.maketrans("áäéëíïóöúü", "aaeeiioouu")
    cleaned = text.lower().translate(accent_map)

    # Double quotes become single quotes; the apostrophe is whitelisted below so
    # this replacement actually survives the final filter.
    cleaned = cleaned.replace('"', "'")

    # Raw string: avoids invalid-escape warnings for \s and \- in a normal literal.
    return re.sub(r"[^A-Za-z0-9\s\-.,;:']+", "", cleaned)

In [6]:
# Normalize every chunk's text in place (overwrites page_content on the chunk objects).
# NOTE(review): the (Op1) extraction loop below iterates `data`, so these cleaned
# chunks do not appear to be consumed in the visible cells — confirm.
for chunk in chunks:
    chunk.page_content = clean_text(chunk.page_content)

# LLM

In [7]:
# Chat model used for graph extraction and, further down, for both roles of the Cypher QA chain.
# NOTE(review): "gpt-3.5-turbo-0613" is a dated model snapshot — confirm it is still served by the API.
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0613",
    temperature=0
)

In [8]:
# Read the system prompt with a context manager so the file handle is closed
# deterministically instead of being left to garbage collection.
with open("../system_instructions.txt", "r") as instructions_file:
    system_instructions = instructions_file.read()

# Three-message prompt: system instructions, the extraction request carrying the
# `{input}` placeholder, and a closing formatting reminder.
# NOTE(review): the file content is passed as a template string, so literal
# `{` / `}` inside it are presumably parsed as placeholders — confirm the file has none.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_instructions),
    ("human", "Use the given format to extract information from the following input: {input}"),
    ("human", "Tip: Make sure to answer in the correct format")
])

# KnowledgeGraph

In [9]:
from langchain.pydantic_v1 import Field, BaseModel
from typing import List, Optional
from langchain_community.graphs.graph_document import Node as BaseNode
from langchain_community.graphs.graph_document import Relationship as BaseRelationship
from langchain_community.graphs.graph_document import GraphDocument

class Property(BaseModel):
    """A single property consisting of key and value"""
    # Both fields are required (`...`) strings. Lists of these are folded into
    # plain dicts by map_to_base_node / map_to_base_relationship.
    key: str = Field(..., description="key")
    value: str = Field(..., description="value")

# Extraction-time node: extends the base graph Node and redeclares `properties`
# as an optional list of key/value Property items (default None).
# map_to_base_node() converts that list into a dict.
class Node(BaseNode):
    properties: Optional[List[Property]] = Field(None, description="List of node properties")

# Extraction-time relationship: extends the base graph Relationship and redeclares
# `properties` as an optional list of key/value Property items (default None).
# map_to_base_relationship() converts that list into a dict.
class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(None, description="List of relationship properties")

# Top-level schema for one extraction result: it is converted to an OpenAI
# function definition and also used as the output parser's pydantic schema.
# NOTE(review): the docstring and field descriptions are presumably exported into
# that function schema and so reach the LLM — confirm before rewording them.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(..., description="List of relationships in the knowledge graph")

In [10]:
# Output parser that turns the model's function-call output into a KnowledgeGraph instance.
parser = PydanticOutputFunctionsParser(pydantic_schema=KnowledgeGraph)

In [12]:
# Single-element list holding the OpenAI function definition derived from KnowledgeGraph;
# it is bound to the LLM when the chain is built.
openai_functions = [convert_pydantic_to_openai_function(KnowledgeGraph)]

In [13]:
# Extraction chain: prompt -> LLM with the KnowledgeGraph function bound -> pydantic parser.
# Downstream code reads the result as a mapping with "text" (the parsed KnowledgeGraph)
# and "input" (the original document) — see create_graph_doc.
chain = LLMChain(
    llm=llm.bind(functions=openai_functions),
    prompt=prompt,
    output_parser=parser
)

## (Op1) Processing Split Docs

In [None]:
# Run the extraction chain over every loaded document, collecting successes in
# `results` and remembering failures in `failed_docs` so dropped documents can
# be identified afterwards instead of being lost in the printed log.
# NOTE(review): this iterates `data` (the loader output), not the cleaned `chunks`
# built above — confirm which is intended.
results = []
failed_docs = []  # (index, exception) pairs for documents the chain could not process
for i, docs in enumerate(data):
    print("doc", i)
    try:
        results.append(chain.invoke(docs))
    except Exception as e:
        # Best-effort: keep processing the remaining documents.
        failed_docs.append((i, e))
        print("ERROR", e)

if failed_docs:
    print("failed docs:", [idx for idx, _err in failed_docs])

In [None]:
# Display the raw chain outputs (one entry per successfully processed document).
results

In [None]:
def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node.

    The node id is title-cased and the type is capitalized. The optional list of
    key/value properties is flattened into a dict, and a ``name`` property
    (the title-cased id) is always present so that name-based Cypher lookups
    can match every node, not only nodes that came back without properties.

    Args:
        node: Extracted node whose ``properties`` is a list of Property items or None.

    Returns:
        A BaseNode with a dict of properties that always contains ``name``.
    """
    node_id = node.id.title()
    properties = (
        {prop.key: prop.value for prop in node.properties} if node.properties else {}
    )
    # setdefault keeps a name supplied by the extraction step and only fills the
    # gap when none was provided.
    properties.setdefault("name", node_id)

    return BaseNode(
        id=node_id, type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted Relationship into a base Relationship.

    Both endpoints go through map_to_base_node, the relationship type is passed
    through unchanged, and the optional key/value property list becomes a dict
    (empty when there are no properties).
    """
    mapped_source = map_to_base_node(rel.source)
    mapped_target = map_to_base_node(rel.target)

    rel_properties = {}
    if rel.properties:
        for prop in rel.properties:
            rel_properties[prop.key] = prop.value

    return BaseRelationship(
        source=mapped_source,
        target=mapped_target,
        type=rel.type,
        properties=rel_properties,
    )

In [None]:
def create_graph_doc(res):
    """Build a GraphDocument from one chain result.

    Args:
        res: Mapping with a "text" entry exposing ``.nodes`` and ``.rels``, and an
            "input" entry that is stored as the document's source.

    Returns:
        A GraphDocument whose nodes and relationships are mapped to the base types.
    """
    base_nodes = list(map(map_to_base_node, res["text"].nodes))
    base_relationships = list(map(map_to_base_relationship, res["text"].rels))

    return GraphDocument(
        nodes=base_nodes,
        relationships=base_relationships,
        source=res["input"],
    )

In [None]:
# One GraphDocument per chain result, in the same order as `results`.
graph_documents = [create_graph_doc(res) for res in results]

#### To save backup

In [None]:
# Flip to True to write the current graph documents to the JSON backup file.
SAVE_BACKUP = False


def save_data(data, file):
    """Serialize each item via its ``.json()`` method and write them to ``file`` as one JSON array.

    Args:
        data: Iterable of objects exposing ``.json()`` that returns a JSON string.
        file: Destination path; opened in "w" mode, so existing content is replaced.
    """
    # Round-trip through json.loads so the file holds a list of objects rather
    # than a list of JSON-encoded strings.
    objs = [json.loads(d.json()) for d in data]
    with open(file, "w") as f:
        json.dump(objs, f)


if SAVE_BACKUP:
    save_data(graph_documents, "../data/history_of_time_graph_3.json")

## (Op2) Use Data backup

In [14]:
def load_data(file):
    """Read a JSON array from ``file`` and parse each element into a GraphDocument.

    Args:
        file: Path to a JSON file containing a list of serialized graph documents.

    Returns:
        A list of GraphDocument objects, in file order.
    """
    with open(file, "r") as backup:
        records = json.load(backup)
    return [GraphDocument.parse_obj(record) for record in records]

In [15]:
# (Op2) Restore graph documents from the JSON backup; this rebinds `graph_documents`,
# replacing whatever the (Op1) cells produced in this kernel.
graph_documents = load_data("../data/history_of_time_graph_3.json")

# Neo4J Driver

In [17]:
# Connection settings come from the environment (populated by load_dotenv above).
# os.getenv returns None for unset variables, so a missing entry is passed through as None.
url = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_PASSWORD")

# Graph handle used for all queries, document loading and schema inspection below.
graph = Neo4jGraph(url=url,
    username=username,
    password=password)

In [18]:
# Delete the current graph.
# WARNING: destructive — matches every node in the connected database and deletes it
# together with its relationships. Make sure the connection points at a scratch database.
graph.query("MATCH (n) DETACH DELETE n")

[]

### Add Docs to Knowledge Graph

In [19]:
# Indexes of graph documents to skip when loading.
# NOTE(review): presumably index 4 failed in an earlier run — confirm and document why.
error_indexes = [4]
arr = [x for x in range(len(graph_documents)) if x not in error_indexes]

# Insert the documents one at a time so a single bad document does not abort the load.
for x in arr:
    try:
        print(x)
        graph.add_graph_documents([graph_documents[x]], include_source=True)
    except Exception as e:
        # Keep going, but report which document failed and why.
        print("error", x, repr(e))

# Refresh the cached schema once after all inserts instead of on every iteration.
graph.refresh_schema()


0
1
2
3
5
6


In [20]:
# Inspect the schema cached on the graph object (node labels and their properties in the recorded output).
print(graph.structured_schema)

{'node_props': {'Document': [{'property': 'text', 'type': 'STRING'}, {'property': 'source', 'type': 'STRING'}, {'property': 'page', 'type': 'INTEGER'}], 'Chapter': [{'property': 'title', 'type': 'STRING'}, {'property': 'id', 'type': 'STRING'}], 'Section': [{'property': 'id', 'type': 'STRING'}, {'property': 'title', 'type': 'STRING'}], 'Person': [{'property': 'name', 'type': 'STRING'}, {'property': 'id', 'type': 'STRING'}, {'property': 'description', 'type': 'STRING'}, {'property': 'deathDate', 'type': 'STRING'}, {'property': 'birthDate', 'type': 'STRING'}], 'Book': [{'property': 'title', 'type': 'STRING'}, {'property': 'id', 'type': 'STRING'}, {'property': 'author', 'type': 'STRING'}, {'property': 'publicationYear', 'type': 'STRING'}], 'Satellite': [{'property': 'id', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}, {'property': 'fullForm', 'type': 'STRING'}], 'Celestial body': [{'property': 'id', 'type': 'STRING'}, {'property': 'name', 'type': 'STRING'}], 'Concept': [{'prope

# RAG

In [34]:
# Alternative, GPT-4-based chat model.
# NOTE(review): not used by the visible cells — the Cypher chain below passes `llm`
# for both roles and only mentions this one in a trailing comment.
llm_gpt4 = ChatOpenAI(
    model="gpt-4-turbo-preview",
    temperature=0
)

In [35]:
# Question-answering chain over the Neo4j graph: one LLM writes the Cypher query,
# another phrases the answer from the query result. Both roles currently use `llm`;
# the trailing comment on cypher_llm marks llm_gpt4 as the alternative.
# return_intermediate_steps=True is why the results below include the generated
# query and the retrieved context.
# NOTE(review): top_k=1 presumably limits the context to a single row — confirm this is intended.
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=llm,#llm_gpt4,
    qa_llm=llm,
    validate_cypher=True,
    verbose=True,
    top_k=1,
    return_intermediate_steps=True
)

In [36]:
# In the recorded run the generated Cypher filters on `name: "Immanuel Kant"` and the
# context comes back empty; the Person nodes visible in the Aristotle query output
# carry `id`/`description`/dates but no `name` property.
cypher_chain.invoke({"query": "tell me everything Immanuel Kant did?"})



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (p:Person {name: "Immanuel Kant"})-[:AUTHORED|PROPOSED_MODEL|OBJECTION|ARGUED_FOR|INFLUENCEDBY|OPPOSED_TO|EXAMINED|LAWSOFMOTION|BASISOF|LAWSOFGRAVITY|AUTHOROF]->(related)
RETURN p, related
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


{'query': 'tell me everything Immanuel Kant did?',
 'result': "I'm sorry, but I don't have any information on what Immanuel Kant did.",
 'intermediate_steps': [{'query': 'cypher\nMATCH (p:Person {name: "Immanuel Kant"})-[:AUTHORED|PROPOSED_MODEL|OBJECTION|ARGUED_FOR|INFLUENCEDBY|OPPOSED_TO|EXAMINED|LAWSOFMOTION|BASISOF|LAWSOFGRAVITY|AUTHOROF]->(related)\nRETURN p, related\n'},
  {'context': []}]}

In [39]:
# The question hints that "Aristotle" is an id so the generated Cypher filters on `p.id`.
# NOTE(review): the query text spells the name "Aritotle" once; left as-is because it is
# the prompt that produced the recorded output.
# NOTE(review): the recorded answer misreads the context — the returned row links
# Aristotle and Galileo via INFLUENCEDBY (matched with an undirected pattern), it does
# not say Aristotle was influenced by "a Greek philosopher".
cypher_chain.invoke({"query": "who has connections with Aritotle? (Aristotle is an id)"})



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (p:Person)-[r]-(x) WHERE p.id = 'Aristotle' RETURN p, type(r), x
[0m
Full Context:
[32;1m[1;3m[{'p': {'description': 'Greek philosopher who believed in an eternal universe', 'id': 'Aristotle'}, 'type(r)': 'INFLUENCEDBY', 'x': {'deathDate': '1642', 'id': 'Galileo', 'birthDate': '1564'}}][0m

[1m> Finished chain.[0m


{'query': 'who has connections with Aritotle? (Aristotle is an id)',
 'result': 'Aristotle was influenced by a Greek philosopher who believed in an eternal universe.',
 'intermediate_steps': [{'query': "cypher\nMATCH (p:Person)-[r]-(x) WHERE p.id = 'Aristotle' RETURN p, type(r), x\n"},
  {'context': [{'p': {'description': 'Greek philosopher who believed in an eternal universe',
      'id': 'Aristotle'},
     'type(r)': 'INFLUENCEDBY',
     'x': {'deathDate': '1642', 'id': 'Galileo', 'birthDate': '1564'}}]}]}