# Vaudeville RAG

Do not run all if the kernel restarts!

In [10]:
# Imports
import getpass
import os
from langchain.chat_models import init_chat_model
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders.csv_loader import CSVLoader
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders import PDFPlumberLoader
from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing_extensions import List, TypedDict, Optional
from langchain_core.prompts import ChatPromptTemplate
from langgraph.graph import START, StateGraph

## Indexing

### Loading Files

In [11]:
# Setting up csv loading
# We have extra metadata in the CSV to add to each document.


def loadCSV(filepath: str) -> list:
    loader = CSVLoader(file_path=filepath, source_column="FileName",metadata_columns=["Year","Work","Theatre"])
    data = loader.load()
    return data


In [12]:
# Load CSV

csv_metadata = loadCSV(r"C:\Users\charl\Documents\VSCode\Vaudeville\Files\Vaudeville_Metadata.csv")
print(csv_metadata[0].metadata['source'])
print(csv_metadata[1].metadata['Theatre'])

As_tu_vu_la_comete_mon_gas_text.pdf
Théâtre du Vaudeville


In [13]:
# Setting up pdf loading




async def loadPDF(filepath: str) -> list:
    loader = PDFPlumberLoader(filepath)
    pages = []
    async for page in loader.alazy_load():
        pages.append(page)   
    return pages


In [14]:

def get_files_from_directory(directory_path: str) -> list[str]:
    directory = Path(directory_path)
    file_paths = [str(file) for file in directory.iterdir() if file.is_file()]
    return file_paths


directory_path:str = r"C:\Users\charl\Documents\VSCode\Vaudeville\Files\PDFs"
files: list[str] = get_files_from_directory(directory_path)

In [15]:
# Load PDFs w/out metadata
loaded_PDFs: list = []
for file in files:
    pages = await loadPDF(file)
    loaded_PDFs.append(pages)

In [16]:
# Check PDF metadata
for i in range(3):
    document = loaded_PDFs[i]
    print(f"Document {i} \n")
    document_metadata: list = []
    for document_page in document:
        document_metadata += document_page.metadata.items()
    print(document_metadata)

Document 0 

[('source', 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\As_tu_vu_la_comete_mon_gas_text.pdf'), ('file_path', 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\As_tu_vu_la_comete_mon_gas_text.pdf'), ('page', 0), ('total_pages', 104), ('CreationDate', "D:20130826123210+02'00'"), ('Creator', 'Bibliothèque nationale de France'), ('ModDate', "D:20250624190557-04'00'"), ('Producer', 'iText 1.4.8 (by lowagie.com)'), ('Title', "Cogniard, Théodore (1806-1872),Clairville (1811-1879). As-tu vu la comète, mon gas ? revue de l'année 1858, en 3 actes et 4 tableaux, par MM. Théodore Cogniard et Clairville... (Paris, Variétés, 30 décembre 1858.). 1859."), ('source', 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\As_tu_vu_la_comete_mon_gas_text.pdf'), ('file_path', 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\As_tu_vu_la_comete_mon_gas_text.pdf'), ('page', 1), ('total_pages', 104), ('CreationDate', "D:20130826123210+02'00'")

So, some of this info is useful and some isn't.

We should keep:
* Source (but clean it so it's just the filename)
* Filepath
* Page
* Total pages
* Creator
* Title

We should remove:
* CreationDate (the date it was turned to pdf is not relevant)
* ModDate
* Producer

We also want to add the CSV metadata in as extra rows in the metadata dictionary

In [17]:
# Cleaning the sources to match with the CSV source names
for source in loaded_PDFs:
    for page in source:
        page.metadata['source'] = page.metadata['source'].replace("C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\PDFs\\","")

  page.metadata['source'] = page.metadata['source'].replace("C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\PDFs\\","")


In [18]:
# Sorting both the CSV and PDF metadata to match
csv_metadata.sort(key=lambda x: x.metadata['source'])
loaded_PDFs.sort(key=lambda x: x[0].metadata['source'])

In [19]:
# Use assert to ensure the sources match
for i in range(32,len(csv_metadata)):
    assert csv_metadata[i].metadata['source'] == loaded_PDFs[i][0].metadata['source'], f"Mismatch at index {i}: {csv_metadata[i].metadata['source']} != {loaded_PDFs[i][0].metadata['source']}"

I had to change the lower index because there were various slight mismatches, but we've checked that all files are actually the same so we're moving on.

#### Cleaning PDF Metadata and Adding CSV Content

We should remove:
* CreationDate (the date it was turned to pdf is not relevant)
* ModDate
* Producer

In [20]:
print(len(csv_metadata))
print(len(loaded_PDFs))

35
35


In [21]:
irrelevant_metadata = ['CreationDate', 'ModDate', 'Producer']

for i in range(len(loaded_PDFs)):
    document = loaded_PDFs[i]
    for page in document:
        for key in irrelevant_metadata:
            if key in page.metadata:
                del page.metadata[key]
        # Add the category and year from the CSV metadata
        # assert page.metadata['source'] == csv_metadata[i].metadata['source'], "Source mismatch between PDF and CSV metadata"
        for row in csv_metadata[i].metadata.items():
            page.metadata.update({row[0]: row[1]})

In [22]:
# Convert each sublist into a single document
def convert_list_to_document(pages: list) -> Document:
    document_content: str = ""
    for page in pages:
        document_content += page.page_content
    document: Document = Document(
        page_content=document_content,
        metadata=pages[0].metadata  # Use the metadata from the first page
    )
    return document

# Convert loaded PDFs to documents
docs = []
for source in loaded_PDFs:
    doc = convert_list_to_document(source)
    docs.append(doc)  # Append the single Document object

In [23]:
docs.pop(0)  # Remove the first document because it rendered as empty
docs.pop(0)  # Remove the second document because it rendered as empty
# To get these working, we'll have to OCR them first.

Document(metadata={'source': 'Brutus_ou_le_dernier_soldat.pdf', 'file_path': 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\Brutus_ou_le_dernier_soldat.pdf', 'page': 0, 'total_pages': 47, 'row': 1, 'Year': '1843', 'Work': 'Brutus ou le dernier soldat du guet', 'Theatre': 'Théâtre du Vaudeville'}, page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n')

In [24]:
# Double check metadata
for i in range(0,10):
    print(f"Document {i} Metadata: {docs[i].metadata}")

Document 0 Metadata: {'source': 'Clairville_Garet_Le_Palais_de_Chrysocale_(1855).pdf', 'file_path': 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\Clairville_Garet_Le_Palais_de_Chrysocale_(1855).pdf', 'page': 0, 'total_pages': 14, 'Author': 'Clairville (M.)', 'Title': 'Le palais de chrysocale ou les exposants et les exposés', 'row': 2, 'Year': '1855', 'Work': 'Le Palais de Chrysocale', 'Theatre': 'Théâtre des Variétés'}
Document 1 Metadata: {'source': 'Clairville_Thiboust_Le_quart_du_monde_(1855).pdf', 'file_path': 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\Clairville_Thiboust_Le_quart_du_monde_(1855).pdf', 'page': 0, 'total_pages': 10, 'Author': 'Lambert-Thiboust (1826-1867)', 'Creator': 'Bibliothèque nationale de France', 'Title': 'Le Quart de monde, ou le Danger d\'une particulière pleine de malice pour un individu vraiment impressionnable (parodie du "Demi-monde"), étude réaliste mêlée de couplets et d\'effets de style, par MM. Clairville et Lamb

### Chunking Files

In [25]:
# Here, we're just splitting the documents by an arbitrary character count. 

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,  # chunk size (characters)
    chunk_overlap=400,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)

print(f"Split {len(docs)} PDFs into {len(all_splits)} sub-documents.")

Split 33 PDFs into 1415 sub-documents.


### Embedding and adding to the vector database

In [26]:
print(len(all_splits))

1415


Warning!!! Don't run this next cell more than once, even if your kernel dies or you restart VSCode. Once  these have been added to your vector store, they'll be stored in your persistent directory. If you run it again, you'll have duplicates!!

In [27]:
from langchain_chroma import Chroma

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where we stored our data before
)

In [28]:
# If we run this again when reestablishing variables, it will duplicate documents and charge us

# Splitting into three chunks to avoid overloading the vector store - you can only add so many documents at once

# They're commented out so I don't accidentally run this again and duplicate the documents in the vector store.
"""
vector_store.add_documents(documents=all_splits[:350])
vector_store.add_documents(documents=all_splits[350:700])
vector_store.add_documents(documents=all_splits[700:1050])
vector_store.add_documents(documents=all_splits[1050:])
"""

'\nvector_store.add_documents(documents=all_splits[:350])\nvector_store.add_documents(documents=all_splits[350:700])\nvector_store.add_documents(documents=all_splits[700:1050])\nvector_store.add_documents(documents=all_splits[1050:])\n'

## Setting Up Retrieval and Generation w/ Filters

You'll notice that this next cell has some of the same code as above. This is because this cell sets up everything, so that if our kernel dies or if we close VSCode, we can just run this cell (and nothing above) and we'll be able to ask questions about our data. This is because our full vector store is kept locally in our directory, using its persistent directory.

In [29]:
# Setting up chat model
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-4o", model_provider="openai") # Could be any chat model. We use gpt-4o here because analyzing poetry is a pretty complex task.

# Setting up embeddings
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Reestablishing up our persist directory for Chroma
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # This is reused from above, but reestablishes our vector store if we only run this cell.
)

# Reestablishing retrieval and generation functions
from langchain_core.documents import Document
from typing_extensions import List, TypedDict, Optional
from langchain_core.prompts import ChatPromptTemplate


# This is where we set up our system prompt (tell the llm its job), and the format for our question and context.
prompt = ChatPromptTemplate.from_messages([
    ("system", 'You are an expert in French musical theatre and opera of the 19th century.  You will help us explore the texts (librettos) of vaudevilles, which are dramatic productions that include periodic musical numbers, which might be songs, choruses, instrumental dances, and other sonic events. \n '
    'Many of these musical moments reuse some preexisting (and often well-known)  melody or tune.  These are variously called “melodie”, or “air”, and identified with a short title that refers in some way to an opera or collection of melodies from which it was drawn.  The titles might include the names of works, or other characters in those original works. '
    '\n In the context of the plays, these tunes become the vehicle for newly composed lyrics, which are normally rhymed, and which normally follow the poetic scansion and structure of the original lyrics.  Rhyme, versification and structure are thus of interest to us. \n Keep the following attributes in mind as you answer questions based on the texts provided:'
    '-the act and scene in which the event takes place'
    '-the character or characters who are singing (or otherwise making music),'
    '-the dramatic situation (a love scene, a crowd scene)'
    '-the name of the air or melodie'
    '-the poetic text'
    '-the rhyme scheme'
    '-form of the poetic text, which might involve some refrain'
    '-the end accent for each line (masculine or féminine)'
    '-syllable count for each line'
    '-any irregularities you notice'
    'If you cannot find the answers for anything within the context provided, state "I dont know" or similar - do not make assumptions beyond what is actually in the context.\n\n'),
    ("human", "Context:\n{context}\n\nQuestion: {question}")
])

# This stores the state of our application, which includes the question, filter, context, and answer. It is required to use langgraph, and is the sole argument of all steps.
class State(TypedDict):
    question: str
    filter: Optional[dict]
    context: List[Document]
    answer: str

# These are all nodes in our graph, or steps. This one allows you to apply a metadata filter to the documents in the vector store.
def apply_filter(state: State):
    if "filter" in state and state["filter"]:
        state["filter"] = {k: v for k, v in state["filter"].items() if v is not None}
    else:
        state["filter"] = None
    return state

# Uses similarity search to retrieve the 15 most similar documents to the question from the vector store, applying any filter if provided. The k value determines the number of documents.
def retrieve(state: State):
    filter_dict = state["filter"] if state.get("filter") else None
    retrieved_docs = vector_store.similarity_search(state["question"], k=15, filter=filter_dict)
    return {"context": retrieved_docs}

# Generates an answer to the question using the retrieved context. It formats the context and question into a message, invokes the LLM, and returns the answer.
def generate(state: State):
    docs_content = ""
    for doc in state["context"]:
        docs_content += f"The following text is a chunk from document titled: {doc.metadata["Work"]}. There are a total of {doc.metadata["total_pages"]} pages in this play. \n"
        docs_content += doc.page_content + "\n\n"
    message = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(message)
    return {"answer": response.content}

# Setting up the sequence of the graph. 
from langgraph.graph import START, StateGraph

graph_builder = StateGraph(State).add_sequence([apply_filter, retrieve, generate])
graph_builder.add_edge(START, "apply_filter")
graph = graph_builder.compile()

In [30]:
def analyze_play(i: int):

    if i < 0 or i >= len(docs):
        raise ValueError("Index out of range for the documents list.")
    
    # Apply the graph to the state
    message = prompt.invoke({"question": "Analyze this play according to the system prompt.", "context": docs[i].page_content})
    return llm.invoke(message).content


## Testing

The full setup is done and now we can test. As you'll see, since there are so many chunks it has trouble pulling accurate ones, so the LLM gets confused. Thus, we add in another node to check the accuracy of documents before sending them below. You can then see that the query is more accurate.

In [31]:
result = graph.invoke({"question": "Tell me about the elements of Un Docteur en Herbe"})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: In "Un Docteur en Herbe," we are dealing with a vaudeville, a comedic play that incorporates songs and musical elements. Let me address the elements you've asked for:

**Act and Scene:** The provided excerpts span across various scenes from the play's two acts. Specific scenes mentioned include Act I, Scene I to Act II, Scene XVI.

**Characters:** Some of the prominent characters are:
- Derbigny, a father figure
- Isidore, his son and a central character in comedic scenarios
- Lambert, a young doctor
- Delapierre Meulière, another older character
- Pauline, a young woman of interest, possibly a love interest for Isidore
- Pamela, a grisette with connections to the characters
- Other characters include townspeople and minor roles.

**Dramatic Situations:** 
- Isidore's preparation for an examination and the comedic mishaps that follow.
- Love and flirtatious moments, like Isidore's interactions with Pauline.
- Moments of misunderstanding, comical confrontations, and reconciliati

In [32]:
result = graph.invoke({"question": "Tell me about the elements of Un Docteur en Herbe",
                       "filter": {"Work": "Un Docteur en herbe"}})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: "Un Docteur en Herbe" is a comédie-vaudeville in two acts by MM. Duvert et Lausanne. It premiered at the Théâtre du Palais-Royal in Paris on April 1, 1847. Here's a breakdown of its elements based on the provided excerpts:

1. **Act and Scene Structure:**
   - The play is divided into two acts. The excerpts provide insights into scenes spanning across both acts.

2. **Characters:**
   - **Derbigny:** A character who appears to be authoritative and possibly a father figure, reminding Isidore about his exam.
   - **Isidore:** A young character, Derbigny's son, portrayed as somewhat careless and naive.
   - **Lambert:** A young doctor, friend to Isidore, with a somewhat playful and humorous disposition.
   - **Delapierre Meulière:** Another character involved in a subplot involving unions and dowry.
   - **Pauline:** A young woman of 17, associated with Isidore.
   - **Paméla:** Described as a "grisette," a term for a young working-class woman.

3. **Dramatic Situations:**
   - Th

In [33]:
result = graph.invoke({"question": "Compare Un Docteur en Herbe to Les Artistes par occasion.",
                       "filter": {"$or": [{"Work": "Un Docteur en herbe"},{"Work": "Les Artistes par occasion"}]}})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: Comparing "Un Docteur en Herbe" to "Les Artistes par occasion" provides us insight into the typical structure and thematic elements of 19th-century French vaudeville. 

1. **Structure and Setting:**
   - "Un Docteur en Herbe" is a two-act vaudeville that takes place in different settings: an inn in Paris for the first act and Derbigny's home in Briare for the second act. This shift in location might represent a change in the social environment or the escalation of the dramatic narrative.
   - "Les Artistes par occasion," on the other hand, is a one-act comedy set in Tivoli, in the outskirts of Rome. The setting is an extensive, picturesque garden, indicative of an open and airy setting that may support more lighthearted or sentimental scenes.

2. **Characters and Roles:**
   - In "Un Docteur en Herbe," the distribution of roles includes various members from different generations and engages in familial and romantic conflict. We observe characters like Derbigny, his son Isidore,

In [34]:
result = graph.invoke({"question": "Compare the elements of Un Docteur en Herbe to Les Artistes par occasion.",
                       "filter": {"$or": [{"Work": "Un Docteur en herbe"},{"Work": "Les Artistes par occasion"}]}})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: To intelligently compare the elements of "Un Docteur en Herbe" to "Les Artistes par Occasion", let's focus on the structural and thematic facets, including characters, dramatic situations, and musical aspects as expressed through the librettos provided:

1. **Structure and Scenes:**
   - "Un Docteur en Herbe" is a comédie-vaudeville in two acts. Scenes involve domestic interiors like hotel rooms and homes. The play mixes comedy with everyday relational and societal conflicts, notably concerning education and professional expectations.
   - "Les Artistes par Occasion" is a comedic piece in one act interspersed with ariettes, a type of short opera piece. It involves a setting in a picturesque Italian garden, using a lighter and more romantic tone focusing on artistic enthusiasts.

2. **Characters:**
   - In "Un Docteur en Herbe", characters include Derbigny, Isidore, and Lambert, with roles revolving around family, expectations, and youthful misadventures in professional realms l

In [35]:
result = graph.invoke({"question": "How do the poetic elements of Un Docteur en Herbe compare to other works?"})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: The poetic elements of "Un Docteur en Herbe" as seen in the provided excerpts include various forms typical of 19th-century French vaudeville, with a notable emphasis on rhyme, versification, and integration of well-known melodies with new lyrics. 

In "Un Docteur en Herbe," several songs feature clearly defined rhyme schemes, often employing rhymed couplets or alternate rhyme patterns—a hallmark of the genre intended to maintain a light, comedic, and sometimes whimsical tone. For instance, the air "Ne vois-tu pas, jeune imprudent" uses rhyming couplets, a common technique in vaudeville to provide rhythmic cohesion and enhance the comedic effect.

The syllable count in the lines tends to be consistent with traditional French versification, although not every line may adhere strictly to classical alexandrine form (12 syllables), demonstrating flexibility typical in vaudeville for comedic effect or to fit the rhythm of the pre-existing melody. The accentuation at the end of the l

In [36]:
result = graph.invoke({"question": "What airs / melodies are present in Les Modistes?", "filter": {"Work": "Les Modistes"}})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: I don't know. The context provided does not include specific information about the airs or melodies present in "Les Modistes." Additional information or a specific excerpt from the libretto would be needed to identify the airs used in this vaudeville.


Sources:


In [37]:
result = graph.invoke({"question": "What types of airs / melodies are present in dramatic situations of romance?"})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: In the provided excerpts from various 19th-century French plays, we encounter instances of airs or melodies used in romantic dramatic situations. Here are some examples: 

1. **"Coraly"**: The text references an "AIR du Fleuve de la vie," indicating a romantic melodic moment where the character Roland expresses his emotions towards another with lyrics that highlight beauty and inspiration. This suggests a setting of romantic contemplation or confession.

2. **"La Dette d’honneur"**: The air "Femmes, voulez-vous éprouver" is used in a light-hearted romantic context where Madame Delaunay speaks to Pauline, reassuring her about a lover's spat and suggesting an underlying romance. The air used here likely supports a playful yet sincere interaction about romantic relationships.

3. **"Le Fifre du roi de Prusse, ou les Prisonniers de Spandau"**: An air "De Doche" sets a romantic scene with Amélie and Alfred where Alfred reminisces about Amélie's charm and seeks her affection through 

## Fixes

In [38]:
# Setting up chat model
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-4o", model_provider="openai") # Could be any chat model. We use gpt-4o here because analyzing poetry is a pretty complex task.

# Setting up embeddings
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Reestablishing up our persist directory for Chroma
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # This is reused from above, but reestablishes our vector store if we only run this cell.
)

# Reestablishing retrieval and generation functions
from langchain_core.documents import Document
from typing_extensions import List, TypedDict, Optional
from langchain_core.prompts import ChatPromptTemplate


# This is where we set up our system prompt (tell the llm its job), and the format for our question and context.
prompt = ChatPromptTemplate.from_messages([
    ("system", 'You are an expert in French musical theatre and opera of the 19th century.  You will help us explore the texts (librettos) of vaudevilles, which are dramatic productions that include periodic musical numbers, which might be songs, choruses, instrumental dances, and other sonic events. \n '
    'Many of these musical moments reuse some preexisting (and often well-known)  melody or tune.  These are variously called “melodie”, or “air”, and identified with a short title that refers in some way to an opera or collection of melodies from which it was drawn.  The titles might include the names of works, or other characters in those original works. '
    '\n In the context of the plays, these tunes become the vehicle for newly composed lyrics, which are normally rhymed, and which normally follow the poetic scansion and structure of the original lyrics.  Rhyme, versification and structure are thus of interest to us. \n Keep the following attributes in mind as you answer questions based on the texts provided:'
    '-the act and scene in which the event takes place'
    '-the character or characters who are singing (or otherwise making music),'
    '-the dramatic situation (a love scene, a crowd scene)'
    '-the name of the air or melodie'
    '-the poetic text'
    '-the rhyme scheme'
    '-form of the poetic text, which might involve some refrain'
    '-the end accent for each line (masculine or féminine)'
    '-syllable count for each line'
    '-any irregularities you notice'
    'If you cannot find the answers for anything within the context provided, state "I dont know" or similar - do not make assumptions beyond what is actually in the context.\n\n'),
    ("human", "Context:\n{context}\n\nQuestion: {question}")
])

# This stores the state of our application, which includes the question, filter, context, and answer. It is required to use langgraph, and is the sole argument of all steps.
class State(TypedDict):
    question: str
    filter: Optional[dict]
    context: List[Document]
    answer: str

# These are all nodes in our graph, or steps. This one allows you to apply a metadata filter to the documents in the vector store.
def apply_filter(state: State):
    if "filter" in state and state["filter"]:
        state["filter"] = {k: v for k, v in state["filter"].items() if v is not None}
    else:
        state["filter"] = None
    return state

# Uses similarity search to retrieve the 15 most similar documents to the question from the vector store, applying any filter if provided. The k value determines the number of documents.
def retrieve(state: State):
    filter_dict = state["filter"] if state.get("filter") else None
    retrieved_docs = vector_store.similarity_search(state["question"], k=15, filter=filter_dict)
    return {"context": retrieved_docs}


retriever = vector_store.as_retriever()

### Retrieval Grader


from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from pydantic import BaseModel, Field


# Data model
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )


# LLM with function call
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

retrieval_grader = grade_prompt | structured_llm_grader

def grade_documents(state):
    """
    Determines whether the retrieved documents are relevant to the question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates documents key with only filtered relevant documents
    """

    print("---CHECK DOCUMENT RELEVANCE TO QUESTION---")
    question = state["question"]
    documents = state["context"]

    # Score each doc
    filtered_docs = []
    for i,d in enumerate(documents):
        score = retrieval_grader.invoke(
            {"question": question, "document": d.page_content}
        )
        grade = score.binary_score
        if grade == "yes":
            print(f"---GRADE {i}: DOCUMENT RELEVANT---")
            filtered_docs.append(d)
        else:
            print(f"---GRADE {i}: DOCUMENT NOT RELEVANT---")
            continue
    return {"documents": filtered_docs, "question": question}

def decide_to_generate(state):
    """
    Determines whether to generate an answer, or re-generate a question.

    Args:
        state (dict): The current graph state

    Returns:
        str: Binary decision for next node to call
    """

    print("---ASSESS GRADED DOCUMENTS---")
    state["question"]
    filtered_documents = state["documents"]

    if not filtered_documents:
        # All documents have been filtered check_relevance
        # We will re-generate a new query
        print(
            "---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, TRANSFORM QUERY---"
        )
        return "transform_query"
    else:
        # We have relevant documents, so generate answer
        print("---DECISION: GENERATE---")
        return "generate"
def generate(state: State):
    docs_content = ""
    for doc in state["context"]:
        docs_content += f"The following text is a chunk from document titled: {doc.metadata["Work"]}. There are a total of {doc.metadata["total_pages"]} pages in this play. \n"
        docs_content += doc.page_content + "\n\n"
    message = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(message)
    return {"answer": response.content}

# Set up langgraph. This is a much more complicated graph, so we set it up with individual nodes and edges.
from langgraph.graph import START, StateGraph

from langgraph.graph import END, StateGraph, START

workflow = StateGraph(State)

# Define the nodes
workflow.add_node("retrieve", retrieve)  # retrieve
workflow.add_node("grade_documents", grade_documents)  # grade documents
workflow.add_node("generate", generate)  # generate

# Build graph
workflow.add_edge(START, "retrieve")
workflow.add_edge("retrieve", "grade_documents")
workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate
)
workflow.add_edge("generate", END)

# Compile
app = workflow.compile()

In [39]:
result = graph.invoke({"question": "What types of airs / melodies are present in dramatic situations of romance?"})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: In the provided texts, several types of airs or melodies are present in dramatic situations of romance. Here are some examples:

1. **AIR du Fleuve de la vie** (from "Coraly"):
   - **Dramatic Situation**: Roland expresses his feelings for his beloved while contemplating the challenges posed by her family.
   - **Context**: This air reflects the inner turmoil and romantic longing of the character.

2. **AIR : Voulant par ses œuvres complètes** (from "Isle des noirs"):
   - **Dramatic Situation**: The characters discuss the effects of love and its potential to change people, highlighting the romantic theme.
   - **Context**: This air serves to convey the emotional weight of love and its consequences.

3. **AIR : Femmes, voulez-vous éprouver** (from "La Dette d’honneur"):
   - **Dramatic Situation**: Madame Delaunay reassures Pauline about her romantic troubles, emphasizing the ups and downs of love.
   - **Context**: This air captures the playful yet serious nature of romantic r