In [1]:
%pip install -q langchain requests ollama langchain-ollama langchain-community aim streamlit

Note: you may need to restart the kernel to use updated packages.


# 1 Text Chunking

Because the model can only accept a limited number of tokens per request, we need to process the text in chunks.

In [19]:
# Text chunking
#
# Load the novel from disk and split it into overlapping chunks so that each
# piece can be sent to the LLM on its own.

# Import from the maintained packages; the `langchain.text_splitter` and
# `langchain.document_loaders` paths are deprecated re-exports.
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

# Read the file with an explicit encoding so the result does not depend on the
# platform's default locale (assumes the text file is UTF-8 - TODO confirm).
loader = TextLoader("./data/a-xmas-carol-body.txt", encoding="utf-8")
documents = loader.load()

# Initialize the splitter.
# NOTE: the text is only cut at `separator` (blank lines), so a single paragraph
# longer than `chunk_size` is kept whole; that is what the
# "Created a chunk of size ..." warnings in the output refer to.
splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1024,
    chunk_overlap=256,
)

chunks = splitter.split_documents(documents)

print(f'{len(chunks)} chunks created')

Created a chunk of size 1603, which is longer than the specified 1024
Created a chunk of size 1127, which is longer than the specified 1024
Created a chunk of size 1143, which is longer than the specified 1024
Created a chunk of size 1668, which is longer than the specified 1024
Created a chunk of size 1754, which is longer than the specified 1024
Created a chunk of size 1224, which is longer than the specified 1024
Created a chunk of size 1207, which is longer than the specified 1024
Created a chunk of size 1158, which is longer than the specified 1024
Created a chunk of size 1763, which is longer than the specified 1024
Created a chunk of size 1467, which is longer than the specified 1024
Created a chunk of size 1070, which is longer than the specified 1024
Created a chunk of size 1147, which is longer than the specified 1024
Created a chunk of size 1260, which is longer than the specified 1024
Created a chunk of size 1747, which is longer than the specified 1024


203 chunks created


# Relation Extraction

First, let's ask the LLM to provide us with a list of possible node (entity) types.

In [5]:
import json

from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

# JSON schema handed to the model for structured output. The single property
# "entity types" is declared as a string that should itself contain a
# JSON-formatted list of strings.
node_types_json_schema = {
    "title": "entity types",
    "description": "a list of entity types",
    "type": "object",
    "properties": {
        "entity types": {
            "type": "string",
            "description": "A JSON-formatted list of strings"
        }
    },
    "required": ["entity types"]
}

# A fairly high temperature is used here; no seed is set, so the suggested
# types can differ from run to run.
llama3_1_node_type = ChatOllama(
    model="llama3.1:8b",
    temperature=0.8,
    top_k=60,
    top_p=0.9,
    num_predict=2048,
    base_url="http://127.0.0.1:11434",
)

llama3_1_node_type_structured = llama3_1_node_type.with_structured_output(node_types_json_schema)

entity_type_prompt_template = ChatPromptTemplate.from_messages([
        ("system", "You are to assist the user to construct a knowledge graph for a novel. "),
        ("human", """
The user is working on extracting entities from a book to build a knowledge graph. You goal is to provide a list of possible entity types that are commonly present in knowledge graphs for novels. Please provide a list of such entity types.
"""),
    ])

entity_type_chain = entity_type_prompt_template | llama3_1_node_type_structured

# The prompt has no template variables, so the input mapping is empty.
result = entity_type_chain.invoke(input={})


def _parse_entity_types(response):
    """Extract the list of entity types from the structured model response.

    The schema asks for a JSON-encoded string, but a model may return an actual
    list instead; both forms are accepted.

    Args:
        response: The value returned by the structured-output chain.

    Returns:
        The list of entity types.

    Raises:
        ValueError: If the "entity types" key is missing or its value does not
            decode to a list.
        json.JSONDecodeError: If the value is a string that is not valid JSON.
    """
    if not isinstance(response, dict) or "entity types" not in response:
        raise ValueError(f"Model response has no 'entity types' field: {response!r}")

    entity_types = response["entity types"]
    if isinstance(entity_types, str):
        entity_types = json.loads(entity_types)
    if not isinstance(entity_types, list):
        raise ValueError(f"Expected a list of entity types, got: {entity_types!r}")
    return entity_types


node_types_list = _parse_entity_types(result)
print(node_types_list)

['Person', 'Organization', 'Location', 'Event', 'Date']


## Relation Extraction by Chunk

Let's define our prompt and model for the extraction task.

In [20]:
from typing import Any, Dict, List, Optional, Sequence, Tuple, Type, Union, cast

import yaml
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

# Project helpers, imported once (the original cell imported `parse_response` twice).
from utils.extract_util_v2 import convert_to_graph_documents, create_schema, parse_response

# read prompt files from yaml
# NOTE(review): `entity_yaml` is loaded here but not referenced anywhere else in
# this cell; the prompt below is built from inline strings.
with open('./prompts/entity.yaml', 'r', encoding='utf-8') as file:
    entity_yaml = yaml.safe_load(file)

# System prompt describing how nodes and relationships should be extracted.
# Each bullet must end with "\n" so that the markdown list structure survives
# the implicit string concatenation.
system_prompt = (
    "# Knowledge Graph Instructions for GPT-4\n"
    "## 1. Overview\n"
    "You are a top-tier algorithm designed for extracting information in structured "
    "formats to build a knowledge graph.\n"
    "Try to capture as much information from the text as possible without "
    "sacrificing accuracy. Do not add any information that is not explicitly "
    "mentioned in the text.\n"
    "- **Nodes** represent entities and concepts.\n"
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
    "accessible for a vast audience.\n"
    "## 2. Labeling Nodes\n"
    "- **Consistency**: Ensure you use available types for node labels.\n"
    "Ensure you use basic or elementary types for node labels.\n"
    "- For example, when you identify an entity representing a person, "
    "always label it as **'person'**. Avoid using more specific terms "
    # Fixed: the trailing "\n" was missing, which glued the next bullet onto this sentence.
    "like 'mathematician' or 'scientist'.\n"
    "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
    "names or human-readable identifiers found in the text.\n"
    "- **Relationships** represent connections between entities or concepts.\n"
    "Ensure consistency and generality in relationship types when constructing "
    "knowledge graphs. Instead of using specific and momentary types "
    "such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
    "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
    "## 3. Coreference Resolution\n"
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to "
    "ensure consistency.\n"
    'If an entity, such as "John Doe", is mentioned multiple times in the text '
    # Fixed: a space was missing after the closing parenthesis ('"he"),always').
    'but is referred to by different names or pronouns (e.g., "Joe", "he"), '
    "always use the most complete identifier for that entity throughout the "
    'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
    "Remember, the knowledge graph should be coherent and easily understandable, "
    "so maintaining consistency in entity references is crucial.\n"
    "## 4. Strict Compliance\n"
    "Adhere to the rules strictly. Non-compliance will result in termination."
)

# Chat prompt: the system instructions above plus a human message carrying the
# chunk to process in the `{text}` placeholder.
default_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            system_prompt,
        ),
        (
            "human",
            (
                "Tips: "
                "1. Make sure to answer in the correct format and do "
                "not include any explanations. "
                "2. Do not include newline or special symbols such as \\\\ in your response that might cause python parsing errors. "
                "3. Stick to the given structured output schema. `Node` has `id` and `type` fields, and `Relationship` has `source`, `target`, and `type` fields. "
                "4. Extract information from the "
                "following input: {text}"
            ),
        ),
    ]
)

# Extraction model: lower temperature / top_k / top_p than the entity-type
# model defined earlier in the notebook.
llama3_1 = ChatOllama(
    model="llama3.1:8b",
    temperature=0.2,
    top_k=10,
    top_p=0.6,
    num_predict=2048,
    base_url="http://127.0.0.1:11434",
)

# include_raw=True keeps the raw model message alongside the parsed output;
# the combined result is what `parse_response` is later called on.
llama3_1_node_structured = llama3_1.with_structured_output(create_schema(), include_raw=True)

relation_extraction_chain = default_prompt | llama3_1_node_structured

A quick test to see if the model can extract entities and relationships from a single sample chunk (`chunks[5]`).

In [7]:
# Smoke test: send one chunk through the extraction chain, then display the
# parsed result as the cell output.
sample_text = chunks[5].page_content
result = relation_extraction_chain.invoke({"text": sample_text})

parse_response(result)

([Node(id='human sympathy', type='concept', properties={}),
  Node(id='Scrooge', type='person', properties={})],
 [Relationship(source=Node(id='Scrooge', type='person', properties={}), target=Node(id='human sympathy', type='concept', properties={}), type='DISTANCE', properties={})])

Now, let's extract KGs from all the chunks.

In [21]:
# Run the extraction chain over all chunks and collect the results in
# `graph_documents`. This is the slow step of the notebook: the recorded run
# below took roughly 21 minutes for 203 chunks, and it logged a
# "Raw response message parsing error" for one chunk along the way.
# NOTE(review): presumably the helper skips chunks whose response cannot be
# parsed - confirm in utils.extract_util_v2.
graph_documents = convert_to_graph_documents(chunks, relation_extraction_chain)

Processing the documents...:   0%|          | 0/203 [00:00<?, ?it/s, Current chunk=CONTENTS

Processing the documents...:   0%|          | 1/203 [00:33<1:54:06, 33.89s/it, Current chunk=CONTENTS

Processing the documents...:  24%|██▍       | 49/203 [05:21<12:59,  5.06s/it, Current chunk=STAVE TWOh..., Iteration=49]
Processing the documents...:  25%|██▍       | 50/203 [05:28<14:07,  5.54s/it, Current chunk=STAVE TWO
Processing the documents...:  40%|████      | 82/203 [09:22<14:10,  7.03s/it, Current chunk=He seemed ..., Iteration=84]

Raw response message parsing error: 'tool_calls'


Processing the documents...: 100%|█████████▉| 202/203 [21:40<00:06,  6.44s/it, Current chunk=He had no ..., Iteration=203]


In [22]:
# cache our results
# The extraction is expensive, so its output is pickled to disk for reuse.
import pickle
from pathlib import Path

graph_documents_cache = Path('./output/graph_documents/graph_documents_refactored.pkl')

# open(..., 'wb') does not create missing parent directories, so create the
# output folder first; otherwise a fresh checkout fails with FileNotFoundError.
graph_documents_cache.parent.mkdir(parents=True, exist_ok=True)

with open(graph_documents_cache, 'wb') as f:
    pickle.dump(graph_documents, f)

# Entity Canonicalization / Disambiguation

Associate mentions of entities with an appropriate disambiguated KB identifier (id).

Combine relationships from different chunks and resolve conflicts across chunks.

In [68]:
from utils.extract_util_v2 import merge_graphs

# Combine the per-chunk graph documents into a single graph document, passing
# the one loaded source document as `source_document`.
source_document = documents[0]
merged_graph_document = merge_graphs(
    graph_documents=graph_documents,
    source_document=source_document,
)

In [94]:
# Cache the merged graph document so the merge does not have to be repeated.
import pickle
from pathlib import Path

merged_graph_cache = Path('./output/graph_documents/merged_graph_document.pkl')

# open(..., 'wb') does not create missing parent directories, so create the
# output folder first; otherwise a fresh checkout fails with FileNotFoundError.
merged_graph_cache.parent.mkdir(parents=True, exist_ok=True)

with open(merged_graph_cache, 'wb') as f:
    pickle.dump(merged_graph_document, f)

In [100]:
# Restore the merged graph document from its pickle cache, so the export below
# can be run without repeating the merge.
# Only unpickle files produced by this notebook: pickle can execute arbitrary
# code while loading.
import pickle

with open('./output/graph_documents/merged_graph_document.pkl', 'rb') as cache_file:
    merged_graph_document = pickle.load(cache_file)

# Langchain-Neo4j Exporter

In [95]:
import os

import dotenv
from langchain_neo4j import Neo4jGraph

from utils.neo4j_util import Neo4jGraphImporter

# Load the connection settings from a local .env file into the process
# environment; a missing variable raises KeyError below, which fails fast.
dotenv.load_dotenv()

URI = os.environ['NEO4J_URI']
USER = os.environ['NEO4J_USERNAME']
PASSWORD = os.environ['NEO4J_PASSWORD']

# initialize a neo4j-langchain graph
# NOTE(review): no credentials are passed here; presumably Neo4jGraph reads the
# same NEO4J_* environment variables itself - confirm against its documentation.
graph = Neo4jGraph(refresh_schema=False)

neo4j_importer = Neo4jGraphImporter(
        uri=URI,
        user=USER,
        # Fixed: use the password from the environment rather than the
        # hard-coded literal 'password'.
        password=PASSWORD,
    )

# clean the database
# WARNING: both calls are destructive and apply to whatever database URI points at.
neo4j_importer.clear_database()  # Optional: Clear the database
neo4j_importer.drop_all_constraints() # Optional: Drop all constraints

Database cleared.
All constraints dropped.


In [99]:
# add the merged graph document to the neo4j graph
# `add_graph_documents` takes a list, so the single merged document is wrapped
# in one. This writes to the database cleared in the setup cell above.
graph.add_graph_documents([merged_graph_document])