In [105]:
import os
from dotenv import load_dotenv
from langchain.text_splitter import NLTKTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.graphs import Neo4jGraph
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_functions import PydanticOutputFunctionsParser
from langchain_community.utils.openai_functions import convert_pydantic_to_openai_function
from langchain.chains import GraphCypherQAChain

import nltk
_ = nltk.download('punkt')

_ = load_dotenv(override=True)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cristhiansilvac./nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Document loader and prep

In [2]:
splitter = NLTKTextSplitter()

In [3]:
loader = PyPDFLoader("../A Brief History of Time: From the Big Bang to Black Holes.pdf")
data = loader.load()

chunks = splitter.split_documents(data)

In [4]:
len(chunks)

151

# Clean Chunks

In [49]:
import re

def clean_text(text:str):
    final_text = text
    final_text = final_text.lower()
    final_text = re.sub("[áä]","a", final_text)
    final_text = re.sub("[éë]","e", final_text)
    final_text = re.sub("[íï]","i", final_text)
    final_text = re.sub("[óö]","o", final_text)
    final_text = re.sub("[úü]","u", final_text)
    final_text = re.sub("\"","\'", final_text)
    final_text = re.sub("[^A-Za-z0-9\s\-\.\,\;\:]+","", final_text)

    return final_text

In [51]:
for chunk in chunks:
    chunk.page_content = clean_text(chunk.page_content)

# LLM

In [52]:
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0613",
    temperature=0
)

In [53]:
system_instructions = open("../system_instructions.txt","r").read()

prompt = ChatPromptTemplate.from_messages([
    ("system", system_instructions),
    ("human", "Use the given format to extract information from the following input: {input}"),
    ("human", "Tip: Make sure to answer in the correct format")
])

# KnowledgeGraph

In [54]:
from langchain.pydantic_v1 import Field, BaseModel
from typing import List, Optional
from langchain_community.graphs.graph_document import Node as BaseNode
from langchain_community.graphs.graph_document import Relationship as BaseRelationship
from langchain_community.graphs.graph_document import GraphDocument

class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

class Node(BaseNode):
    properties: Optional[List[Property]] = Field(None, description="List of node properties")

class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(None, description="List of relationship properties")

class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(..., description="List of relationships in the knowledge graph")

In [55]:
parser = PydanticOutputFunctionsParser(pydantic_schema=KnowledgeGraph)

In [56]:
openai_functions = [convert_pydantic_to_openai_function(KnowledgeGraph)]

In [57]:
chain = LLMChain(
    llm=llm.bind(functions=openai_functions),
    prompt=prompt,
    output_parser=parser
)

In [None]:
results = []
for i,docs in enumerate(data):
    print("doc",i)
    try:
        results.append(chain.invoke(docs))
    except Exception as e:
        print("ERROR", e)

In [59]:
results

[{'input': Document(page_content="Chapter 1  - Our Picture of the Universe\nChapter 2  - Space and Time\nChapter 3  - The Expanding Universe\nChapter 4  - The Uncertainty Principle\nChapter 5  - Elementary Particles and the Forces of Nature\nChapter 6  - Black Holes\nChapter 7  - Black Holes Ain't So Black\nChapter 8  - The Origin and Fate of the Universe\nChapter 9  - The Arrow of Time\nChapter 10  - Wormholes and Time Travel\nChapter 11  - The Unification of Physics\nChapter 12  - Conclusion\nGlossary\nAcknowledgments & About The Author\n\xa0\xa0\nFOREWARDI didn’t write a foreword to the original edition of A Brief History of Time. That was done by Carl Sagan. Instead,\nI wrote a short piece titled “Acknowledgments” in which I was advised to thank everyone. Some of thefoundations that had given me support weren’t too pleased to have been mentioned, however, because it led toa great increase in applications.I don’t think anyone, my publishers, my agent, or myself, expected the book to

In [60]:
def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    if node.properties:
        properties = {prop.key: prop.value for prop in node.properties}
    else:
        properties = {
            "name": node.id.title()
        }

    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = {prop.key: prop.value for prop in rel.properties} if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

In [61]:
def create_graph_doc(res):
    graph_document = GraphDocument(
        nodes = [map_to_base_node(node) for node in res["text"].nodes],
        relationships = [map_to_base_relationship(rel) for rel in res["text"].rels],
        source = res["input"]
    )
    
    return graph_document

In [124]:
graph_documents = []
for res in results:
    graph_documents.append(create_graph_doc(res))

# Neo4J Driver

In [142]:
url = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_PASSWORD")

graph = Neo4jGraph(url=url,
    username=username,
    password=password)

In [119]:
# Delete the graph
graph.query("MATCH (n) DETACH DELETE n")

[]

In [143]:
graph.add_graph_documents(graph_documents)

In [94]:
import json

def save_data(data, file):
    objs = []
    for d in data:
        obj_str = d.json()
        obj = json.loads(obj_str)
        objs.append(obj)
    with open(file, "w") as f:
        json.dump(objs, f)

def load_data(file):
    with open(file, "r") as f:
        data = json.load(f)
        data_res = []
        for d in data:
            data_res.append(GraphDocument.parse_obj(d))
            
        return data_res

In [95]:
save_data(graph_documents, "../data/history_of_time_graph.json")

In [97]:
graph_documents = load_data("../data/history_of_time_graph.json")

# RAG

In [144]:
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=llm,
    qa_llm=llm,
    validate_cypher=True,
    verbose=True
)

In [145]:
cypher_chain.invoke({"query": "what did Newton discover?"})



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


{'query': 'what did Newton discover?',
 'result': 'Newton is best known for his discovery of the laws of motion and the law of universal gravitation. He formulated these laws in his book "Mathematical Principles of Natural Philosophy" published in 1687. Newton\'s laws of motion describe the relationship between the motion of an object and the forces acting upon it. The law of universal gravitation states that every particle of matter in the universe attracts every other particle with a force that is directly proportional to the product of their masses and inversely proportional to the square of the distance between them. These discoveries revolutionized our understanding of physics and laid the foundation for classical mechanics.'}