In [1]:
import json
from pathlib import Path
from typing import List

from langchain_community.document_loaders import TextLoader
from langchain_community.graphs import Neo4jGraph
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import TokenTextSplitter, RecursiveCharacterTextSplitter
from neo4j.exceptions import ClientError

## Init our tools - LLM and Graph database

In [2]:
graph = Neo4jGraph(database="jeopardy") # unforturnately that specifying db does not work
db_name = "jeopardy"

# Embeddings & LLM models
embeddings = OpenAIEmbeddings()
embedding_dimension = 1536
llm = ChatOpenAI(model="gpt-4o-2024-08-06", temperature=0)

## Get the text from PDF file

In [4]:
import re
from langchain_core.documents.base import Document

def docs_to_json_pretty(docs: List[Document]):
    return json.dumps([doc.dict() for doc in docs], indent=4)

loader = TextLoader(str("extracted_text.txt"), encoding="utf-8")
documents = loader.load()
# print(documents[0].page_content)


In [5]:
# Ingest Parent-Child node pairs
parent_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
child_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=24)
parent_documents = parent_splitter.split_documents(documents)
child_documents = child_splitter.split_documents([parent_documents[0]])
# print(len(docs_to_json_pretty(child_documents)), docs_to_json_pretty(child_documents))

## Ingest Document parents and children into knowledge graph

In [9]:
for i, parent in enumerate(parent_documents):
    child_documents = child_splitter.split_documents([parent])
    params = {
        "parent_text": parent.page_content,
        "parent_id": i,
        "parent_embedding": embeddings.embed_query(parent.page_content),
        "children": [
            {
                "text": c.page_content,
                "id": f"{i}-{ic}",
                "embedding": embeddings.embed_query(c.page_content),
            }
            for ic, c in enumerate(child_documents)
        ],
    }
    # Ingest data
    graph.query(
    f"USE {db_name} "
    """
    MERGE (p:Parent {id: $parent_id})
    SET p.text = $parent_text
    WITH p
    CALL db.create.setVectorProperty(p, 'embedding', $parent_embedding)
    YIELD node
    WITH p 
    UNWIND $children AS child
    MERGE (c:Child {id: child.id})
    SET c.text = child.text
    MERGE (c)<-[:HAS_CHILD]-(p)
    WITH c, child
    CALL db.create.setVectorProperty(c, 'embedding', child.embedding)
    YIELD node
    RETURN count(*)
    """,
        params,
    )
    # Create vector index for child
    graph.query(
        f"USE {db_name} "
        "CALL db.index.vector.createNodeIndex('parent_document', "
        "'Child', 'embedding', $dimension, 'cosine')",
        {"dimension": embedding_dimension},
    )
    # Create vector index for parents
    try:
        graph.query(
            f"USE {db_name} "
            "CALL db.index.vector.createNodeIndex('typical_rag', "
            "'Parent', 'embedding', $dimension, 'cosine')",
            {"dimension": embedding_dimension},
        )
    except ClientError:  # already exists
        pass

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `db.index.vector.createNodeIndex`: Caused by: org.neo4j.kernel.api.exceptions.schema.EquivalentSchemaRuleAlreadyExistsException: An equivalent index already exists, 'Index( id=3, name='parent_document', type='VECTOR', schema=(:Child {embedding}), indexProvider='vector-2.0' )'.}

## Prompt Plumbing for Jeopardy Questions with Points and Answers in Categories

In [11]:
class JeopardyQuestion(BaseModel):
    """Structuring questions with categories, points, and answers."""
    category: str = Field(..., description="Jeopardy-style category")
    question: str = Field(..., description="Generated question")
    answer: str = Field(..., description="Generated answer")
    points: int = Field(..., description="Point value associated with the question")

class JeopardyQuestions(BaseModel):
    """Generating hypothetical Jeopardy-style questions with answers."""
    questions: List[JeopardyQuestion] = Field(
        ..., description="List of questions with their categories, answers, and points"
    )

questions_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            (
                "You are generating Jeopardy-style questions with points and answers in specific categories "
                "based on the information found in the text. "
                "Each question should have only one correct answer. "
                "Each question should have a point value ranging from 100 to 500 depending on the difficulty of the question. "
            ),
        ),
        (
            "human",
            (
                "Use the given format to generate Jeopardy-style questions with difficulty point value and answers in corresponding category from the "
                "following input: {input}"
            ),
        ),
    ]
)

question_chain = questions_prompt | llm.with_structured_output(JeopardyQuestions)



In [17]:
graph.query("use jeopardy match (n:Question|Answer|Category) detach delete n")

[]

## Generate and Ingest Questions with Points and Answers for Categories

In [None]:
for parent_id, parent in enumerate(parent_documents):
    jeopardy_data = question_chain.invoke(parent.page_content)
    questions = jeopardy_data.questions
    print(docs_to_json_pretty(questions))

    params = {
        "parent_id": parent_id,
        "questions": [
            {
                "question_id": f"q-{question_id}",
                "text": q.question,
                "category": q.category,
                "points": q.points,
                # "question_embedding": embeddings.embed_query(q.question),
                "answer_id": f"a-{question_id}",
                "answer_text": q.answer,
                # "answer_embedding": embeddings.embed_query(q.answer)
            }
            for question_id, q in enumerate(questions)
        ],
    }
print(len(params))

In [21]:
for parent_id, parent in enumerate(parent_documents):
    jeopardy_data = question_chain.invoke(parent.page_content)
    questions = jeopardy_data.questions
    print(docs_to_json_pretty(questions))

    params = {
        "parent_id": parent_id,
        "questions": [
            {
                "question_id": f"q-{question_id}",
                "text": q.question,
                "category": q.category,
                "points": q.points,
                "question_embedding": embeddings.embed_query(q.question),
                "answer_id": f"a-{question_id}",
                "answer_text": q.answer,
                "answer_embedding": embeddings.embed_query(q.answer)
            }
            for question_id, q in enumerate(questions)
        ],
    }

    # create question nodes
    graph.query(
        f"USE {db_name} "
        """
        UNWIND $questions AS question
        MATCH (p:Parent {id: $parent_id})
        WITH p, question
        MERGE (cat:Category {id: question.category, name: question.category})
        WITH p, cat, question
        CREATE (q:Question {id: question.question_id})
        CREATE (a:Answer {id: question.answer_id})
        SET q.text = question.text, q.points = question.points
        SET a.text = question.answer_text
        MERGE (cat)<-[:IN_CATEGORY]-(q)-[:FOR_CONTENT]->(p)
        MERGE (q)-[:HAS_ANSWER]->(a)

        WITH q, question

        CALL db.create.setVectorProperty(q, 'embedding', question.question_embedding) 
        YIELD node
        RETURN count(*)
    """,
        params,
    )

    # Ingest generated answers
    graph.query(
        f"USE {db_name} "
        """
        UNWIND $questions AS question
        MATCH (a:Question {id: question.answer_id})
        WITH a, question

        CALL db.create.setVectorProperty(a, 'embedding', question.answer_embedding) 
        YIELD node
        RETURN count(*)
        """,
        params,
    )
    
    # Create vector indexes for questions
    try:
        graph.query(
            f"USE {db_name} "
            "CALL db.index.vector.createNodeIndex('jeopardy_questions', "
            "'Question', 'embedding', $dimension, 'cosine')",
            {"dimension": embedding_dimension},
        )
    except ClientError:  # already exists
        pass

    # Create vector indexes for answers
    try:
        graph.query(
            "CALL db.index.vector.createNodeIndex('jeopardy_answers', "
            "'Answer', 'embedding', $dimension, 'cosine')",
            {"dimension": embedding_dimension},
        )
    except ClientError:  # already exists
        pass

[
    {
        "category": "DINNER APPETIZERS",
        "question": "What appetizer consists of fried balls of risotto and tender beef short rib served with tomato basil sauce?",
        "answer": "Short Rib Arancini",
        "points": 100
    },
    {
        "category": "DINNER APPETIZERS",
        "question": "Which appetizer is a soft freshly made mozzarella ribbons in rich cream topped with lemon zest, olive oil, and pistachios, accompanied by soppresatta, oven-dried tomatoes, and giardiniera?",
        "answer": "Fresh Stracciatella Cheese & Antipasto Plate",
        "points": 200
    },
    {
        "category": "DINNER APPETIZERS",
        "question": "What is the price of the Baked Wagyu-Pork Meatball appetizer?",
        "answer": "$15",
        "points": 300
    },
    {
        "category": "DINNER APPETIZERS",
        "question": "Which appetizer is served with grilled ciabatta and topped with a warm tomato basil sauce?",
        "answer": "Goat Cheese & Mascarpone Dip",


ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `db.create.setVectorProperty`: Caused by: java.lang.NullPointerException: 'vector' must not be null}

In [19]:
cleaned_dict = {k: v for k, v in params.items() if k != "question_embedding"}

In [20]:
cleaned_dict

{'parent_id': 9,
 'questions': [{'question_id': 'q-0',
   'text': 'What is the name of the restaurant that seeks out the finest and freshest ingredients from across the globe?',
   'category': 'Restaurant',
   'points': 100,
   'question_embedding': [0.008569878526031971,
    -0.019916662946343422,
    0.015080329030752182,
    0.0013303242158144712,
    -0.007314291782677174,
    0.01591738685965538,
    -0.022746717557311058,
    0.0006182436482049525,
    -0.009592949412763119,
    -0.0036704326048493385,
    0.013725092634558678,
    -0.006566918455064297,
    -0.015279628336429596,
    -0.015359348617494106,
    -0.005623567849397659,
    -0.0018717545317485929,
    0.03188792243599892,
    -0.0012663822853937745,
    0.01341949962079525,
    -0.028486542403697968,
    -0.027144592255353928,
    -0.009028267115354538,
    0.014814596623182297,
    0.002198937814682722,
    0.01341949962079525,
    0.006862546317279339,
    0.027011726051568985,
    -0.009267427027225494,
    0.009