In [1]:
from pathlib import Path
from typing import List

from langchain_community.document_loaders import TextLoader
from langchain_community.graphs import Neo4jGraph
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import TokenTextSplitter
from neo4j.exceptions import ClientError

In [2]:
# Name of the target Neo4j database; the queries in the cells below prefix
# their Cypher with `USE {db_name}`.
db_name = "jeopardy"
# NOTE: per the original author, passing `database` to the constructor does not
# take effect on its own, which is why the explicit USE prefix is still needed.
graph = Neo4jGraph(database=db_name)

# Embedding model and chat model shared by the later cells.
embeddings = OpenAIEmbeddings()
llm = ChatOpenAI(temperature=0)
# Vector size handed to the vector-index creation calls; presumably matches the
# embedding model's output width -- confirm if the embedding model changes.
embedding_dimension = 1536

In [4]:
# Read the extracted text file into LangChain documents.
loader = TextLoader("extracted_text.txt", encoding="utf-8")
documents = loader.load()

# Two token-based splitters: large "parent" chunks and small "child" chunks.
# Only the parent split happens here; children are derived per parent later.
child_splitter = TokenTextSplitter(chunk_overlap=24, chunk_size=100)
parent_splitter = TokenTextSplitter(chunk_overlap=24, chunk_size=512)
parent_documents = parent_splitter.split_documents(documents)
print(documents)

[Document(metadata={'source': 'extracted_text.txt'}, page_content="DINNER\nAPPETIZERS\nSHORT RIB ARANCINI . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . $11\nFried balls of risotto and tender beef short rib served with our tomato basil sauce\nFRESH STRACCIATELLA CHEESE & ANTIPASTO PLATE (SERVES 2-3) . . . . . . . . . . . . . . . . . . . . . . . . . . . $25\nSoft freshly made mozzarella ribbonss in rich cream topped with lemon zest, olive oil and pistachios \naccompanied by soppresatta, oven-dried tomatoes and giardiniera\nBAKED WAGYU-PORK MEATBALL. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . $15\nWagyu beef and Kurobuta pork meatballs broiled with fresh mozzarella & tomato sauce\nGOAT CHEESE & MASCARPONE DIP. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  $12\nServed with 

## Ingest Document parents and children into knowledge graph

In [6]:
# Ingest every parent chunk together with its child chunks.
# Parent ids are the integer position of the chunk; child ids are
# "<parent index>-<child index>" strings, so they are unique across parents.
for i, parent in enumerate(parent_documents):
    child_documents = child_splitter.split_documents([parent])
    params = {
        "parent_text": parent.page_content,
        "parent_id": i,
        "parent_embedding": embeddings.embed_query(parent.page_content),
        "children": [
            {
                "text": c.page_content,
                "id": f"{i}-{ic}",
                "embedding": embeddings.embed_query(c.page_content),
            }
            for ic, c in enumerate(child_documents)
        ],
    }
    # MERGE keeps the ingestion idempotent: re-running the cell updates the
    # existing Parent/Child nodes instead of duplicating them.
    graph.query(
    f"USE {db_name}"
    """
    MERGE (p:Parent {id: $parent_id})
    SET p.text = $parent_text
    WITH p
    CALL db.create.setVectorProperty(p, 'embedding', $parent_embedding)
    YIELD node
    WITH p 
    UNWIND $children AS child
    MERGE (c:Child {id: child.id})
    SET c.text = child.text
    MERGE (c)<-[:HAS_CHILD]-(p)
    WITH c, child
    CALL db.create.setVectorProperty(c, 'embedding', child.embedding)
    YIELD node
    RETURN count(*)
    """,
        params,
    )

# Create the vector indexes once, after ingestion. The calls do not depend on
# the loop variables, so issuing them per parent only produced redundant
# round-trips that failed with "already exists".
for index_name, label in (("parent_document", "Child"), ("typical_rag", "Parent")):
    try:
        graph.query(
            f"USE {db_name} "
            f"CALL db.index.vector.createNodeIndex('{index_name}', "
            f"'{label}', 'embedding', $dimension, 'cosine')",
            {"dimension": embedding_dimension},
        )
    except ClientError:  # already exists
        pass

## Generate Jeopardy Categories

In [7]:
# Generate Jeopardy Categories
# Structured-output schema for the category-generation step; it is handed to
# `llm.with_structured_output(...)` further down in this cell.
# NOTE(review): the docstring and Field description are presumably forwarded to
# the model as part of the schema -- treat them as prompt text, not plain docs.
class JeopardyCategories(BaseModel):
    """Generating Jeopardy-style Categories."""
    # Category names returned by the model (the system prompt asks for five).
    categories: List[str] = Field(..., description="List of generated categories")

# Prompt that asks the model for five Jeopardy-style categories derived from
# the text supplied through the `input` template variable.
categories_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            (
                "You are to generate five distinct Jeopardy-style categories "
                "based on the information found in the text. "
            ),
        ),
        (
            "human",
            (
                "Use the given format to generate Jeopardy-style categories from the "
                "following input: {input}"
            ),
        ),
    ]
)

# Pipe the prompt into the LLM, constrained to the JeopardyCategories schema.
categories_chain = categories_prompt | llm.with_structured_output(JeopardyCategories)

# Pass the actual document text. Invoking the chain with the `documents` list
# itself would format its Python repr (`[Document(metadata=..., ...)]`, with
# escaped newlines) into the prompt instead of the source text.
categories_input_text = "\n\n".join(doc.page_content for doc in documents)
jeopardy_categories = categories_chain.invoke({"input": categories_input_text})

# Comma-separated category names, reused when building the questions prompt.
jeopardy_categories_str = ', '.join(jeopardy_categories.categories)
print(jeopardy_categories_str)

Italian Cuisine, Appetizers, Pasta Dishes, Salads, Entrees


## Prompt Plumbing for Jeopardy Questions with Points and Answers in Categories

In [8]:
# Schema for a single generated question; used as the element type of
# `JeopardyQuestions.questions`.
# NOTE(review): the docstring and Field descriptions are presumably sent to the
# model as part of the structured-output schema -- edit them as prompt text.
class JeopardyQuestion(BaseModel):
    """Structuring questions with categories, points, and answers."""
    # Category the question belongs to (free-form string, not validated here).
    category: str = Field(..., description="Jeopardy-style category")
    # Question text.
    question: str = Field(..., description="Generated question")
    # Answer text for the question.
    answer: str = Field(..., description="Generated answer")
    # Difficulty score; the system prompt asks for 100-500, but no range
    # constraint is enforced on this field.
    points: int = Field(..., description="Point value associated with the question")

# Top-level structured-output schema for the question-generation chain: a
# wrapper around a list of JeopardyQuestion items.
# NOTE(review): the docstring and Field description are presumably forwarded to
# the model as schema text -- treat changes to them as prompt changes.
class JeopardyQuestions(BaseModel):
    """Generating hypothetical Jeopardy-style questions with answers."""
    # All questions generated for one input text.
    questions: List[JeopardyQuestion] = Field(
        ..., description="List of questions with their categories, answers, and points"
    )

# The category names were produced by the LLM. Escape curly braces so that
# ChatPromptTemplate (f-string template format) treats them as literal text
# rather than as template variables.
escaped_categories = jeopardy_categories_str.replace("{", "{{").replace("}", "}}")

# Prompt that asks for questions, answers and point values restricted to the
# previously generated categories; the source text arrives via `{input}`.
questions_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            (
                f"You are generating Jeopardy-style questions with points and answers for one or more of these categories {escaped_categories} "
                # Trailing space added: the adjacent literals used to join as
                # "...correct answer.Each question...".
                "based on the information found in the text. For each question also generate a correct answer. "
                "Each question should have a point value ranging from 100 to 500 depending on the difficulty of the question. "
            ),
        ),
        (
            "human",
            (
                "Use the given format to generate Jeopardy-style questions with difficulty point value and answers from the "
                "following input: {input}"
            ),
        ),
    ]
)

# Pipe the prompt into the LLM, constrained to the JeopardyQuestions schema.
question_chain = questions_prompt | llm.with_structured_output(JeopardyQuestions)



## Generate and Ingest Questions with Points and Answers for Categories

In [10]:
# Generate questions/answers for every parent chunk and store them in the graph.
for parent_id, parent in enumerate(parent_documents):
    jeopardy_data = question_chain.invoke(parent.page_content)
    questions = jeopardy_data.questions
    print(questions)

    params = {
        "parent_id": parent_id,
        "questions": [
            {
                # Ids include the parent index. With only the per-parent
                # counter ("q-0", "q-1", ...) ids repeated across parents, so
                # the answer query below also matched questions from earlier
                # parents and attached the wrong answers to them.
                "question_id": f"q-{parent_id}-{question_id}",
                "text": q.question,
                "category": q.category,
                "points": q.points,
                "question_embedding": embeddings.embed_query(q.question),
                "answer_id": f"a-{parent_id}-{question_id}",
                "answer_text": q.answer,
                "answer_embedding": embeddings.embed_query(q.answer)
            }
            for question_id, q in enumerate(questions)
        ],
    }

    # Ingest generated questions and link them to their category and parent.
    # - MERGE on the (now unique) question id keeps re-runs from duplicating nodes.
    # - The two relationships are merged separately: merging the whole path
    #   (q)-[:IN_CATEGORY]->(cat)-[:FOR_PARENT]->(p) never matches for a new
    #   question and therefore re-created FOR_PARENT for every question.
    graph.query(
        f"USE {db_name} "
        """
        UNWIND $questions AS question
        MATCH (p:Parent {id: $parent_id})
        WITH p, question
        MERGE (cat:Category {id: question.category, name: question.category})
        WITH p, cat, question
        MERGE (q:Question {id: question.question_id})
        SET q.text = question.text, q.points = question.points
        MERGE (q)-[:IN_CATEGORY]->(cat)
        MERGE (cat)-[:FOR_PARENT]->(p)
        WITH q, question

        CALL db.create.setVectorProperty(q, 'embedding', question.question_embedding) 
        YIELD node
        RETURN count(*)
    """,
        params,
    )

    # Ingest generated answers and attach each one to its question.
    graph.query(
        f"USE {db_name} "
        """
        UNWIND $questions AS question
        MATCH (q:Question {id: question.question_id})
        WITH q, question
        MERGE (a:Answer {id: question.answer_id})
        SET a.text = question.answer_text
        MERGE (q)-[:HAS_ANSWER]->(a)
        WITH a, question

        CALL db.create.setVectorProperty(a, 'embedding', question.answer_embedding) 
        YIELD node
        RETURN count(*)
        """,
        params,
    )

# Create the vector indexes for questions and answers once, after ingestion;
# the calls do not depend on the loop variables.
for index_name, label in (("jeopardy_questions", "Question"), ("jeopardy_answers", "Answer")):
    try:
        graph.query(
            f"USE {db_name} "
            f"CALL db.index.vector.createNodeIndex('{index_name}', "
            f"'{label}', 'embedding', $dimension, 'cosine')",
            {"dimension": embedding_dimension},
        )
    except ClientError:  # already exists
        pass

[JeopardyQuestion(category='Appetizers', question='What is the name of the appetizer that consists of fried balls of risotto and tender beef short rib served with tomato basil sauce?', answer='Short Rib Arancini', points=200), JeopardyQuestion(category='Appetizers', question='What appetizer includes soft freshly made mozzarella ribbons in rich cream topped with lemon zest, olive oil, and pistachios, accompanied by soppresatta, oven-dried tomatoes, and giardiniera?', answer='Fresh Stracciatella Cheese & Antipasto Plate', points=300), JeopardyQuestion(category='Appetizers', question='Which appetizer features Wagyu beef and Kurobuta pork meatballs broiled with fresh mozzarella and tomato sauce?', answer='Baked Wagyu-Pork Meatball', points=400), JeopardyQuestion(category='Appetizers', question="What appetizer, a Mama Ricotta's favorite, is served with grilled ciabatta for dipping and topped with a warm tomato basil sauce?", answer='Goat Cheese & Mascarpone Dip', points=300), JeopardyQuesti