In [1]:
import json
from pathlib import Path
from typing import List

from langchain_community.document_loaders import TextLoader
from langchain_community.graphs import Neo4jGraph
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import TokenTextSplitter, RecursiveCharacterTextSplitter
from neo4j.exceptions import ClientError

## Initialize our tools: the LLM and the graph database

In [3]:
import os


# Connect to Neo4j Aura.
# Credentials are read from the environment so that no secret lives in the
# notebook source or in its saved outputs. The password that used to be
# hardcoded (and printed) in this cell must be treated as compromised and
# rotated in the Aura console.
uri = os.environ.get("NEO4J_URI", "neo4j+s://1b2ebe15.databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD")
if not password:
    raise RuntimeError(
        "Set the NEO4J_PASSWORD environment variable before running this notebook."
    )

graph = Neo4jGraph(uri, username, password)

# Embeddings & LLM models
embeddings = OpenAIEmbeddings()
# Size passed to the vector indexes created during ingestion.
# NOTE(review): presumably the output size of the default OpenAIEmbeddings
# model — confirm if the embedding model is ever changed.
embedding_dimension = 1536
llm = ChatOpenAI(model="gpt-4o-2024-08-06", temperature=0)

[output removed: this cell previously echoed the database password]


## Get the text from the PDF file

In [4]:
import re
from langchain_core.documents.base import Document

def docs_to_json_pretty(docs: "List[Document]"):
    """Serialize a list of documents (or other Pydantic models) to indented JSON.

    Each item is turned into a plain dict with ``model_dump()`` when the item
    provides it (the Pydantic v2 API) and with the legacy ``dict()`` otherwise,
    so the helper works with both model generations and avoids the deprecated
    method whenever a replacement exists.

    Args:
        docs: Items exposing ``model_dump()`` or ``dict()``.

    Returns:
        A JSON string indented by four spaces.
    """
    def _as_dict(item):
        dump = getattr(item, "model_dump", None)
        return dump() if callable(dump) else item.dict()

    return json.dumps([_as_dict(doc) for doc in docs], indent=4)

In [5]:
%pip install pymupdf4llm
import pymupdf4llm

md_text = pymupdf4llm.to_markdown("menu.pdf")

# now work with the markdown text, e.g. store as a UTF8-encoded file
import pathlib
pathlib.Path("dinner.md").write_bytes(md_text.encode())


[notice] A new release of pip is available: 23.0.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.
Processing menu.pdf...


13610

In [6]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Split on "###" headings only; the heading is recorded in each split's
# metadata under the "Category" key.
headers_to_split_on = [("###", "Category")]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
md_header3_splits = markdown_splitter.split_text(md_text)

# Category names in document order, skipping splits that carry no category.
categories = [
    metadata["Category"]
    for metadata in (section.metadata for section in md_header3_splits)
    if "Category" in metadata
]
categories_str = ", ".join(categories)
categories_str

'APPETIZERS, SOUP & SALAD, PASTA, ENTRÉES, SIDES, PIZZAS, PERSONALIZED PIES'

In [7]:
i = 7
# Show one split: its category heading first, then the section body.
print(
    md_header3_splits[i].metadata["Category"],
    md_header3_splits[i].page_content,
    sep="\n",
)

PERSONALIZED PIES
###### PLAIN CHEESE  
ADDITIONAL TOPPINGS  
###### MEDIUM . . . . . . . . . . . $12  
MEDIUM . . . . . . $2 EACH  
###### LARGE. . . . . . . . . . . $15  
LARGE. . . . . . $4 EACH  
###### Pepperoni | Crushed Garlic | Chicken Breast Anchovies | Spinach | Gorgonzola Italian Sausage | Ricotta Cheese | Pesto Artichoke Hearts | Prosciutto | Black Olives | Green Peppers White Mushrooms | Red Onions  
We are happy to accommodate certain modifications but they may incur an additional charge  
A $3.00 split fee is assessed for all shared items. Large pizzas and family items are not valid with discount cards.  
There is a $3 per person dessert fee for any outside desserts, and a $30 corkage fee.  
-----  
## Welcome to Mama Ricotta’s  
I opened Mama Ricotta’s in August of 1992 to offer the delicious homestyle Italian-American food I remember fondly from  
my youth. Back in 1908, my grandfather emigrated from Quindice, Italy to Springfield, Massachusetts to find work as a  
sto

## Prompt Plumbing for Jeopardy Questions with Points and Answers in Categories

In [8]:
# One generated quiz item.
# NOTE(review): the docstring and the Field descriptions below are presumably
# forwarded to the model as schema text by `with_structured_output` — confirm
# before rewording any of them.
class JeopardyQuestion(BaseModel):
    """Structuring questions with categories, points, and answers."""
    # Category label chosen by the model; the ingestion loop looks it up in `categories`.
    category: str = Field(..., description="Jeopardy-style category")
    # Question text; stored on the Question node and embedded during ingestion.
    question: str = Field(..., description="Generated question")
    # Expected answer; stored on the Answer node and embedded during ingestion.
    answer: str = Field(..., description="Generated answer")
    # The system prompt asks for 100-500; this model does not enforce that range.
    points: int = Field(..., description="Point value associated with the question")

# Container model used as the structured-output schema of `question_chain`.
# NOTE(review): the docstring and Field description are presumably sent to the
# model as schema text — confirm before rewording them.
class JeopardyQuestions(BaseModel):
    """Generating hypothetical Jeopardy-style questions with answers."""
    # All questions produced for one input text.
    questions: List[JeopardyQuestion] = Field(
        ..., description="List of questions with their categories, answers, and points"
    )

# The category list is interpolated once, when this cell runs. Braces are
# doubled so that category text can never be mistaken for a template variable;
# "{input}" is the only variable left for the chain to fill.
_categories_literal = categories_str.replace("{", "{{").replace("}", "}}")

questions_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            (
                # The trailing space matters: without it the last category is
                # glued to "based on" by the adjacent-literal concatenation.
                f"You are generating Jeopardy-style questions with points and answers in the following categories {_categories_literal} "
                "based on the information found in the text. "
                "Each question should have only one correct answer and a corresponding category. "
                "Each question should have a point value ranging from 100 to 500 depending on the difficulty of the question. "
            ),
        ),
        (
            "human",
            (
                "Use the given format to generate Jeopardy-style questions with difficulty point value and answers in corresponding category from the "
                "following input: {input}"
            ),
        ),
    ]
)

# Prompt piped into the LLM, with the response parsed into JeopardyQuestions.
question_chain = questions_prompt | llm.with_structured_output(JeopardyQuestions)



In [9]:
# Start from a clean slate: remove every Chunk, Question, Answer and Category
# node together with its relationships.
RESET_GRAPH_QUERY = "match (n:Chunk|Question|Answer|Category) detach delete n"
graph.query(RESET_GRAPH_QUERY)

[]

## Generate and Ingest Questions with Points and Answers for Categories

In [10]:
def _resolve_category(raw_category, fallback):
    """Map a model-supplied category onto one of the known ``categories``.

    Returns the entry of ``categories`` that equals ``raw_category`` ignoring
    case and surrounding whitespace, or ``fallback`` when nothing matches.
    """
    wanted = raw_category.strip().casefold()
    for known in categories:
        if known.strip().casefold() == wanted:
            return known
    return fallback


# Create the vector indexes once, up front: nothing here depends on the chunk
# being processed, so it does not belong inside the ingestion loop.
# NOTE(review): db.index.vector.createNodeIndex and db.create.setVectorProperty
# appear to be marked deprecated in recent Neo4j 5.x documentation — confirm
# against the target server version.
for index_name, label in (
    ("jeopardy_questions", "Question"),
    ("jeopardy_answers", "Answer"),
):
    try:
        graph.query(
            "CALL db.index.vector.createNodeIndex($index_name, "
            "$label, 'embedding', $dimension, 'cosine')",
            {
                "index_name": index_name,
                "label": label,
                "dimension": embedding_dimension,
            },
        )
    except ClientError:  # treated as "already exists" (best effort)
        pass

for chunk_id, chunk in enumerate(md_header3_splits):
    # ignore document sections that do not have a category
    if "Category" not in chunk.metadata:
        continue
    chunk_category = chunk.metadata["Category"]

    jeopardy_data = question_chain.invoke(chunk.page_content)
    # Defensive: tolerate a missing or empty structured response.
    questions = jeopardy_data.questions if jeopardy_data is not None else []
    if not questions:
        continue

    question_params = []
    for question_id, q in enumerate(questions):
        # The model may return a category that is not spelled exactly like a
        # known one; fall back to the category of the chunk the question came
        # from so that categories.index() below cannot raise ValueError.
        category_name = _resolve_category(q.category, chunk_category)
        question_params.append(
            {
                "question_id": f"c-{chunk_id}-q-{question_id}",
                "text": q.question,
                "category_id": f"c-{chunk_id}-cat-{categories.index(category_name)}",
                "category_name": category_name,
                "points": q.points,
                "question_embedding": embeddings.embed_query(q.question),
                "answer_id": f"c-{chunk_id}-a-{question_id}",
                "answer_text": q.answer,
                "answer_embedding": embeddings.embed_query(q.answer),
            }
        )

    params = {"chunk_id": chunk_id, "questions": question_params}

    # Create question/answer nodes and link them to their chunk and category.
    # CREATE (not MERGE) is used for Question and Answer, so re-running this
    # cell without clearing the graph first adds duplicate nodes.
    graph.query(
        """
        UNWIND $questions AS question
        MERGE (chunk:Chunk {id: $chunk_id})
        WITH chunk, question
        MERGE (cat:Category {id: question.category_id, name: question.category_name})
        WITH chunk, cat, question
        CREATE (q:Question {id: question.question_id})
        CREATE (a:Answer {id: question.answer_id})
        SET q.text = question.text, q.points = question.points
        SET a.text = question.answer_text
        MERGE (cat)<-[:IN_CATEGORY]-(q)-[:FOR_CONTENT]->(chunk)
        MERGE (q)-[:HAS_ANSWER]->(a)

        WITH q, question

        CALL db.create.setVectorProperty(q, 'embedding', question.question_embedding)
        YIELD node
        RETURN count(*)
        """,
        params,
    )

    # Set vector embedding for answers
    graph.query(
        """
        UNWIND $questions AS question
        MATCH (a:Answer {id: question.answer_id})
        WITH a, question

        CALL db.create.setVectorProperty(a, 'embedding', question.answer_embedding)
        YIELD node
        RETURN count(*)
        """,
        params,
    )

In [14]:
def check_user_answer_using_neo4j(user_answer, question_id, method="jaro_winkler"):
    """Compare a user's answer with the stored answer inside Neo4j and print the rows.

    Args:
        user_answer: Free-text answer supplied by the user.
        question_id: ``id`` property of the ``Question`` node being answered.
        method: ``"jaro_winkler"`` (default; the query this function has always
            run) compares the two texts with ``apoc.text.jaroWinklerDistance``.
            ``"cosine"`` embeds ``user_answer`` and compares it with the stored
            answer embedding through ``gds.similarity.cosine``.
            NOTE(review): the cosine variant presumably requires the GDS
            plugin on the server — confirm availability before relying on it.

    Returns:
        None. The raw query result is printed.

    Raises:
        ValueError: If ``method`` is not a supported name.
    """
    if method == "jaro_winkler":
        query = """
    WITH $user_answer AS user_answer
    MATCH (q:Question {id: $question_id})-[:HAS_ANSWER]->(a:Answer)
    WITH a.text as correct_answer, user_answer
    RETURN correct_answer, user_answer, 
        apoc.text.jaroWinklerDistance(correct_answer, user_answer) AS output
    """
        params = {"question_id": question_id, "user_answer": user_answer}
    elif method == "cosine":
        query = """
    WITH $user_answer_embedding AS user_embedding  
    MATCH (q:Question {id: $question_id})-[:HAS_ANSWER]->(a:Answer)
    WITH user_embedding, a, a.embedding AS answer_embedding
    RETURN a.text AS answerText, 
        gds.similarity.cosine(user_embedding, answer_embedding) AS similarity
    """
        # Embed only here: the text-distance query never uses the embedding,
        # so computing it unconditionally was wasted work.
        params = {
            "question_id": question_id,
            "user_answer_embedding": embeddings.embed_query(user_answer),
        }
    else:
        raise ValueError(
            f"Unsupported method {method!r}; expected 'jaro_winkler' or 'cosine'."
        )

    result = graph.query(query, params)

    print(result)

    return None

%pip install textdistance
import textdistance as td
def check_user_answer_using_td(user_answer, question_id, threshold=0.8):
    """Judge a user's answer by Jaro-Winkler similarity to the stored answer.

    The stored answer text is fetched from Neo4j; the comparison itself runs
    locally with ``textdistance``. The answer, the user's answer and the score
    are printed.

    Args:
        user_answer: Free-text answer supplied by the user.
        question_id: ``id`` property of the ``Question`` node being answered.
        threshold: Score that must be strictly exceeded for the answer to
            count as correct. Defaults to 0.8, the previously hardcoded value.

    Returns:
        True when the similarity is greater than ``threshold``, else False.

    Raises:
        IndexError: If no answer is stored for ``question_id``.
    """
    get_correct_answer_query = """
    MATCH (q:Question {id: $question_id})-[:HAS_ANSWER]->(a:Answer)
    RETURN a.text AS correct_answer
    """
    result = graph.query(get_correct_answer_query, {"question_id": question_id})
    if not result:
        # Same exception type that result[0] used to raise, now with a message
        # that says what is actually wrong.
        raise IndexError(f"No answer found for question id {question_id!r}")

    correct_answer = result[0]["correct_answer"]
    similarity = td.jaro_winkler(correct_answer, user_answer)
    print(correct_answer, user_answer, similarity)
    return similarity > threshold


# Smoke test against an ingested question id.
# NOTE(review): the commented-out call below names `check_user_answer`, which is
# not defined in the visible cells — presumably an earlier name of one of the
# helpers defined in this cell; confirm or delete.
# check_user_answer("lettuce", "c-2-q-3")
check_user_answer_using_td("cheese", "c-2-q-1")


Collecting textdistance
  Downloading textdistance-4.6.3-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.6.3
Note: you may need to restart the kernel to use updated packages.
Fontina cheese cheese 0.41269841269841273



[notice] A new release of pip is available: 23.0.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


False

In [15]:
import textdistance as td
def print_distance(answer, user_answer):
    """Print four string-similarity scores for ``answer`` vs ``user_answer``.

    One score per line, in this order: normalized Levenshtein similarity,
    normalized Damerau-Levenshtein similarity, Jaro, Jaro-Winkler.
    """
    scorers = (
        td.levenshtein.normalized_similarity,
        td.damerau_levenshtein.normalized_similarity,
        td.jaro,
        td.jaro_winkler,
    )
    for scorer in scorers:
        print(scorer(answer, user_answer))

# Compare the four metrics on a near-miss answer ("$3" vs "$3.3").
print_distance("$3", "$3.3")

0.5
0.5
0.8333333333333334
0.8666666666666667
