# Vaudeville RAG

Do not run all if the kernel restarts!

In [2]:
# Setting up chat model
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-4o-mini", model_provider="openai")

In [3]:
# Setting up embeddings
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [4]:
# Setting up chroma
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

## Indexing

### Loading Files

In [5]:
# Setting up csv loading

from langchain_community.document_loaders.csv_loader import CSVLoader

def loadCSV(filepath: str) -> list:
    loader = CSVLoader(file_path=filepath, source_column="FileName",metadata_columns=["Year","Work","Theatre"])
    data = loader.load()
    return data


In [6]:
# Load CSV

csv_metadata = loadCSV(r"C:\Users\charl\Documents\VSCode\Vaudeville\Files\Vaudeville_Metadata.csv")
print(csv_metadata[0].metadata['source'])
print(csv_metadata[1].metadata['Theatre'])

As_tu_vu_la_comete_mon_gas_text.pdf
Théâtre du Vaudeville


In [7]:
# Setting up pdf loading

from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders import PDFPlumberLoader


async def loadPDF(filepath: str) -> list:
    loader = PDFPlumberLoader(filepath)
    pages = []
    async for page in loader.alazy_load():
        pages.append(page)   
    return pages


# from langchain.document_loaders import UnstructuredPDFLoader

# async def loadPDF(filepath: str) -> list:
#     loader = UnstructuredPDFLoader(
#     filepath,
#     mode="elements",  # or "single"
#     strategy="ocr_only"  # Forces it to OCR
#     )
#     pages = []
#     async for page in loader.alazy_load():
#         pages.append(page)   
#     return pages

In [8]:

def get_files_from_directory(directory_path: str) -> list[str]:
    directory = Path(directory_path)
    file_paths = [str(file) for file in directory.iterdir() if file.is_file()]
    return file_paths


directory_path:str = r"C:\Users\charl\Documents\VSCode\Vaudeville\Files\PDFs"
files: list[str] = get_files_from_directory(directory_path)

In [9]:
# Load PDFs w/out metadata
loaded_PDFs: list = []
for file in files:
    pages = await loadPDF(file)
    loaded_PDFs.append(pages)

In [10]:
# Check PDF content
for i in range(len(loaded_PDFs)):
    document = loaded_PDFs[i]
    print(f"## Document {i}: {document[0].metadata["source"][54:]} \n")
    document_content: str = ""
    for document_page in document:
        document_content += document_page.page_content
    print(document_content + "\n \n")

## Document 0: As_tu_vu_la_comete_mon_gas_text.pdf 










































































































 

## Document 1: Brutus_ou_le_dernier_soldat.pdf 

















































 

## Document 2: Clairville_Garet_Le_Palais_de_Chrysocale_(1855).pdf 

AVIS.Nulletraductiondecetouvragenepourraêtrefaitesans l'autorisationexpresse etparécrít
desauteursetdel'éditeur,quiseréserventenoutretouslesdroitsstipulésdanslesconventionsinter-
venuesou à intervenirentrelaFrance etlespays étrangers enmatièredepropriétélittéraire.
DIED
LE
PALAIS DE CHRYSOCALE
OU
LES EXPOSANTS ET LES EXPOSÉS
CONTRE-EXPOSITION DE L'EXPOSITION, MÉLÉE DE COUPLETS, EN DEUX TABLEAUX
Par MM. CLAIRVILLE et GABET
Représentée, pour la première fois, àParis, sur le théâtre des VARIÉTÉS
le 23 juillet 1855.
PERSONNAGES ACTEURS. 1
SOLIVEAU, vieuxrentieràCoucouron......
MM. F. HEUZEY.
GINSEPPE DAMASTOR, Italien, génie incompris............... AMBROISE.
MAXIME

In [11]:
# Check PDF metadata
for i in range(3):
    document = loaded_PDFs[i]
    print(f"Document {i} \n")
    document_metadata: list = []
    for document_page in document:
        document_metadata += document_page.metadata.items()
    print(document_metadata)

Document 0 

[('source', 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\As_tu_vu_la_comete_mon_gas_text.pdf'), ('file_path', 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\As_tu_vu_la_comete_mon_gas_text.pdf'), ('page', 0), ('total_pages', 104), ('CreationDate', "D:20130826123210+02'00'"), ('Creator', 'Bibliothèque nationale de France'), ('ModDate', "D:20250624190557-04'00'"), ('Producer', 'iText 1.4.8 (by lowagie.com)'), ('Title', "Cogniard, Théodore (1806-1872),Clairville (1811-1879). As-tu vu la comète, mon gas ? revue de l'année 1858, en 3 actes et 4 tableaux, par MM. Théodore Cogniard et Clairville... (Paris, Variétés, 30 décembre 1858.). 1859."), ('source', 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\As_tu_vu_la_comete_mon_gas_text.pdf'), ('file_path', 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\As_tu_vu_la_comete_mon_gas_text.pdf'), ('page', 1), ('total_pages', 104), ('CreationDate', "D:20130826123210+02'00'")

So, some of this info is useful and some isn't.

We should keep:
* Source (but clean it so it's just the filename)
* Filepath
* Page
* Total pages
* Creator
* Title

We should remove:
* CreationDate (the date it was turned to pdf is not relevant)
* ModDate
* Producer

We also want to add the CSV metadata in as extra rows in the metadata dictionary

In [12]:
# Cleaning the sources to match with the CSV source names
for source in loaded_PDFs:
    for page in source:
        page.metadata['source'] = page.metadata['source'].replace("C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\PDFs\\","")
        print(page.metadata['source'])

As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_text.pdf
As_tu_vu_la_comete_mon_gas_t

  page.metadata['source'] = page.metadata['source'].replace("C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\PDFs\\","")


In [13]:
# Sorting both the CSV and PDF metadata to match
csv_metadata.sort(key=lambda x: x.metadata['source'])
loaded_PDFs.sort(key=lambda x: x[0].metadata['source'])

In [14]:
# Use assert to ensure the sources match
for i in range(32,len(csv_metadata)):
    assert csv_metadata[i].metadata['source'] == loaded_PDFs[i][0].metadata['source'], f"Mismatch at index {i}: {csv_metadata[i].metadata['source']} != {loaded_PDFs[i][0].metadata['source']}"

I had to change the lower index because there were various slight mismatches, but we've checked that all files are actually the same so we're moving on.

#### Cleaning PDF Metadata and Adding CSV Content

We should remove:
* CreationDate (the date it was turned to pdf is not relevant)
* ModDate
* Producer

In [15]:
print(len(csv_metadata))
print(len(loaded_PDFs))

35
35


In [16]:
irrelevant_metadata = ['CreationDate', 'ModDate', 'Producer']

for i in range(len(loaded_PDFs)):
    document = loaded_PDFs[i]
    for page in document:
        for key in irrelevant_metadata:
            if key in page.metadata:
                del page.metadata[key]
        # Add the category and year from the CSV metadata
        # assert page.metadata['source'] == csv_metadata[i].metadata['source'], "Source mismatch between PDF and CSV metadata"
        for row in csv_metadata[i].metadata.items():
            page.metadata.update({row[0]: row[1]})
        print(page.metadata)

{'source': 'As_tu_vu_la_comete_mon_gas_text.pdf', 'file_path': 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\As_tu_vu_la_comete_mon_gas_text.pdf', 'page': 0, 'total_pages': 104, 'Creator': 'Bibliothèque nationale de France', 'Title': "Cogniard, Théodore (1806-1872),Clairville (1811-1879). As-tu vu la comète, mon gas ? revue de l'année 1858, en 3 actes et 4 tableaux, par MM. Théodore Cogniard et Clairville... (Paris, Variétés, 30 décembre 1858.). 1859.", 'row': 0, 'Year': '1858', 'Work': 'As-tu vu la comète, mon gas?', 'Theatre': 'Théâtre des Variétés'}
{'source': 'As_tu_vu_la_comete_mon_gas_text.pdf', 'file_path': 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\As_tu_vu_la_comete_mon_gas_text.pdf', 'page': 1, 'total_pages': 104, 'Creator': 'Bibliothèque nationale de France', 'Title': "Cogniard, Théodore (1806-1872),Clairville (1811-1879). As-tu vu la comète, mon gas ? revue de l'année 1858, en 3 actes et 4 tableaux, par MM. Théodore Cogniard et Clairvill

In [17]:
# Convert each sublist into a single document
from langchain.schema import Document
def convert_list_to_document(pages: list) -> Document:
    document_content: str = ""
    for page in pages:
        document_content += page.page_content
    document: Document = Document(
        page_content=document_content,
        metadata=pages[0].metadata  # Use the metadata from the first page
    )
    return document

# Convert loaded PDFs to documents
docs = []
for source in loaded_PDFs:
    doc = convert_list_to_document(source)
    docs.append(doc)  # Append the single Document object

In [18]:
print(docs[2])

page_content='AVIS.Nulletraductiondecetouvragenepourraêtrefaitesans l'autorisationexpresse etparécrít
desauteursetdel'éditeur,quiseréserventenoutretouslesdroitsstipulésdanslesconventionsinter-
venuesou à intervenirentrelaFrance etlespays étrangers enmatièredepropriétélittéraire.
DIED
LE
PALAIS DE CHRYSOCALE
OU
LES EXPOSANTS ET LES EXPOSÉS
CONTRE-EXPOSITION DE L'EXPOSITION, MÉLÉE DE COUPLETS, EN DEUX TABLEAUX
Par MM. CLAIRVILLE et GABET
Représentée, pour la première fois, àParis, sur le théâtre des VARIÉTÉS
le 23 juillet 1855.
PERSONNAGES ACTEURS. 1
SOLIVEAU, vieuxrentieràCoucouron......
MM. F. HEUZEY.
GINSEPPE DAMASTOR, Italien, génie incompris............... AMBROISE.
MAXIME, petit cousin de madame Soliveau.....
ARMAND-POTEL,
CANICHON, paysan........ .... DELIÈRE.
UN INVENTEUR....
AMÉDÉE.
UN MONSIEUR. OULIF.
UN DEUXIÈME MONSIEUR.... PELLERIN.
UN MARCHAND DE GAUFFRES. POULAIN.
UN MARCHAND DE COCO..... FRANGIN.
FRÉDÉGONDE, Française......... MilesPOTEL.
PALMYRE, femme de Soliveau.... GE

In [19]:
docs.pop(0)  # Remove the first document because it rendered as empty
docs.pop(0)  # Remove the second document because it rendered as empty

Document(metadata={'source': 'Brutus_ou_le_dernier_soldat.pdf', 'file_path': 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\Brutus_ou_le_dernier_soldat.pdf', 'page': 0, 'total_pages': 47, 'row': 1, 'Year': '1843', 'Work': 'Brutus ou le dernier soldat du guet', 'Theatre': 'Théâtre du Vaudeville'}, page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n')

In [20]:
# Double check metadata
for i in range(0,10):
    print(f"Document {i} Metadata: {docs[i].metadata}")

Document 0 Metadata: {'source': 'Clairville_Garet_Le_Palais_de_Chrysocale_(1855).pdf', 'file_path': 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\Clairville_Garet_Le_Palais_de_Chrysocale_(1855).pdf', 'page': 0, 'total_pages': 14, 'Author': 'Clairville (M.)', 'Title': 'Le palais de chrysocale ou les exposants et les exposés', 'row': 2, 'Year': '1855', 'Work': 'Le Palais de Chrysocale', 'Theatre': 'Théâtre des Variétés'}
Document 1 Metadata: {'source': 'Clairville_Thiboust_Le_quart_du_monde_(1855).pdf', 'file_path': 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\Clairville_Thiboust_Le_quart_du_monde_(1855).pdf', 'page': 0, 'total_pages': 10, 'Author': 'Lambert-Thiboust (1826-1867)', 'Creator': 'Bibliothèque nationale de France', 'Title': 'Le Quart de monde, ou le Danger d\'une particulière pleine de malice pour un individu vraiment impressionnable (parodie du "Demi-monde"), étude réaliste mêlée de couplets et d\'effets de style, par MM. Clairville et Lamb

### Chunking Files

In [21]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,  # chunk size (characters)
    chunk_overlap=400,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)

print(f"Split {len(docs)} PDFs into {len(all_splits)} sub-documents.")

Split 33 PDFs into 1415 sub-documents.


### Embedding and adding to the vector database

In [22]:
print(len(all_splits))

1415


In [23]:
# If we run this again when reestablishing variables, it will duplicate documents and charge us

# Splitting into three chunks to avoid overloading the vector store
# vector_store.add_documents(documents=all_splits[:350])
# vector_store.add_documents(documents=all_splits[350:700])
# vector_store.add_documents(documents=all_splits[700:1050])
# vector_store.add_documents(documents=all_splits[1050:])

## Setting Up Retrieval and Generation w/ Filters

In [None]:
# Setting up chat model
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-4o", model_provider="openai")

# Setting up embeddings
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Reestablishing up our persist directory for Chroma
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where we stored our data before
)

# Reestablishing retrieval and generation functions
from langchain_core.documents import Document
from typing_extensions import List, TypedDict, Optional
from langchain_core.prompts import ChatPromptTemplate

# prompt = ChatPromptTemplate.from_messages([
#     ("system", 'You are an expert in French musical theatre and opera of the 19th century.  You will help us explore the texts (librettos) of vaudevilles, which are dramatic productions that include periodic musical numbers, which might be songs, choruses, instrumental dances, and other sonic events. \n '
#     'Many of these musical moments reuse some preexisting (and often well-known)  melody or tune.  These are variously called “melodie”, or “air”, and identified with a short title that refers in some way to an opera or collection of melodies from which it was drawn.  The titles might include the names of works, or other characters in those original works. '
#     '\n In the context of the plays, these tunes become the vehicle for newly composed lyrics, which are normally rhymed, and which normally follow the poetic scansion and structure of the original lyrics.  Rhyme, versification and structure are thus of interest to us. \n Your task is to identify all the musical situations in each play, reporting the following details in a structured form:'
#     '-the act and scene in which the event takes place'
#     '-the character or characters who are singing (or otherwise making music),'
#     '-the dramatic situation (a love scene, a crowd scene)'
#     '-the name of the air or melodie'
#     '-the poetic text'
#     '-the rhyme scheme'
#     '-form of the poetic text, which might involve some refrain'
#     '-the end accent for each line (masculine or féminine)'
#     '-syllable count for each line'
#     '-any irregularities you notice'
#     'if you cannot find the answers for anything within the context provided, state "I dont know" or similar.\n\n'),
#     ("human", "Context:\n{context}\n\nQuestion: {question}")
# ])

prompt = ChatPromptTemplate.from_messages([
    ("system", 'You are an expert in French musical theatre and opera of the 19th century.  You will help us explore the texts (librettos) of vaudevilles, which are dramatic productions that include periodic musical numbers, which might be songs, choruses, instrumental dances, and other sonic events. \n '
    'Many of these musical moments reuse some preexisting (and often well-known)  melody or tune.  These are variously called “melodie”, or “air”, and identified with a short title that refers in some way to an opera or collection of melodies from which it was drawn.  The titles might include the names of works, or other characters in those original works. '
    '\n In the context of the plays, these tunes become the vehicle for newly composed lyrics, which are normally rhymed, and which normally follow the poetic scansion and structure of the original lyrics.  Rhyme, versification and structure are thus of interest to us. \n Keep the following attributes in mind as you answer questions based on the texts provided:'
    '-the act and scene in which the event takes place'
    '-the character or characters who are singing (or otherwise making music),'
    '-the dramatic situation (a love scene, a crowd scene)'
    '-the name of the air or melodie'
    '-the poetic text'
    '-the rhyme scheme'
    '-form of the poetic text, which might involve some refrain'
    '-the end accent for each line (masculine or féminine)'
    '-syllable count for each line'
    '-any irregularities you notice'
    'If you cannot find the answers for anything within the context provided, state "I dont know" or similar - do not make assumptions beyond what is actually in the context.\n\n'),
    ("human", "Context:\n{context}\n\nQuestion: {question}")
])

class State(TypedDict):
    question: str
    filter: Optional[dict]
    context: List[Document]
    answer: str

def apply_filter(state: State):
    if "filter" in state and state["filter"]:
        state["filter"] = {k: v for k, v in state["filter"].items() if v is not None}
    else:
        state["filter"] = None
    return state

def retrieve(state: State):
    filter_dict = state["filter"] if state.get("filter") else None
    retrieved_docs = vector_store.similarity_search(state["question"], k=15, filter=filter_dict)
    return {"context": retrieved_docs}

def generate(state: State):
    docs_content = ""
    for doc in state["context"]:
        docs_content += f"The following text is a chunk from document titled: {doc.metadata["Work"]}. There are a total of {doc.metadata["total_pages"]} pages in this play. \n"
        docs_content += doc.page_content + "\n\n"
    message = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(message)
    return {"answer": response.content}

# Reestablishing langgraph
from langgraph.graph import START, StateGraph

graph_builder = StateGraph(State).add_sequence([apply_filter, retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [34]:
docs[1].metadata

{'source': 'Clairville_Thiboust_Le_quart_du_monde_(1855).pdf',
 'file_path': 'C:\\Users\\charl\\Documents\\VSCode\\Vaudeville\\Files\\PDFs\\Clairville_Thiboust_Le_quart_du_monde_(1855).pdf',
 'page': 0,
 'total_pages': 10,
 'Author': 'Lambert-Thiboust (1826-1867)',
 'Creator': 'Bibliothèque nationale de France',
 'Title': 'Le Quart de monde, ou le Danger d\'une particulière pleine de malice pour un individu vraiment impressionnable (parodie du "Demi-monde"), étude réaliste mêlée de couplets et d\'effets de style, par MM. Clairville et Lambert ...',
 'row': 3,
 'Year': '1855',
 'Work': 'Le Quart de Monde',
 'Theatre': 'Théâtre des Variétés'}

In [29]:
for i, doc in enumerate(docs):
    print(f"{i}: {doc.metadata['source']}")

0: Clairville_Garet_Le_Palais_de_Chrysocale_(1855).pdf
1: Clairville_Thiboust_Le_quart_du_monde_(1855).pdf
2: Coraly_ou_la_soeur_et_le_Frere.pdf
3: D_Aranda_ou_les_grandes_passions_comédi_p_19.pdf
4: Dupin_Dumanoir_La_toque_bleue_(1834).pdf
5: Duval_Les_artistes_par_occasion_(1807).pdf
6: L_apprenti_ou_l_art_de_faire_une_maître.pdf
7: L_isle_des_noirs_ou_Les_deux_ingénues.pdf
8: La_Dette_d_honneur.pdf
9: La_favorite_comedie_vaudeville_en_1_acte.pdf
10: La_grisette_et_l_héritière.pdf
11: Labiche_Rue_de_l'Homme_armé_(1849).pdf
12: Le_Baril_d_olives_comédie-vaudeville.pdf
13: Le_Mari_par_Intérim_Comédie_vaudeville.pdf
14: Le_Singe_et_l_Adjoint_folie_vaudeville.pdf
15: Les_Modistes_tableau-vaudeville_2.pdf
16: Les_ensorcelés_ou_les_amans_ignorans.pdf
17: Les_maris_sans_femmes.pdf
18: Ohé_Les_p_tits_agneaux_revue.pdf
19: Paris_voleur.pdf
20: Revel_=_Le_Fifre_du_roi_de_Prusse,_ou_les_Prisonniers_de_Spandau_(1818).pdf
21: Sans_tambour_ni_trompette_with_lyas_as_on_noffense_pas_une_b

In [None]:
def analyze_play(i: int):

    if i < 0 or i >= len(docs):
        raise ValueError("Index out of range for the documents list.")
    
    # Apply the graph to the state
    message = prompt.invoke({"question": "Analyze this play according to the system prompt.", "context": docs[i].page_content})
    return llm.invoke(message).content


## Testing

In [14]:
result = graph.invoke({"question": "Tell me about the elements of Un Docteur en Herbe"})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: Based on the provided text from "Un Docteur en herbe," here is a structured summary of the musical moments and other relevant details:

### Act I

**Scene I**
- **Characters**: Derbigny, Isidore, Lambert
- **Dramatic Situation**: Preparations for an exam; father is urging his son to wake up.
- **Musical Element**: None noted in this scene.

**Scene II**
- **Characters**: Isidore, Derbigny, Lambert
- **Dramatic Situation**: Conversation about Isidore's preparation and attire.
- **Musical Element**: None noted in this scene.

**Scene III**
- **Characters**: Isidore, Lambert
- **Dramatic Situation**: Discussion about Isidore's lack of interest in law and preference for painting.
- **Musical Element**: None noted in this scene.

---

### Act II

**Scene I**
- **Characters**: Isidore, Lambert, Delapierre Meulière
- **Dramatic Situation**: Isidore elated about developments with Pauline.
- **Musical Element**:
  - **Name of the Air**: "Ne vois-tu pas, jeune imprudent"
  - **Poetic Tex

In [37]:
result = graph.invoke({"question": "Tell me about the elements of Un Docteur en Herbe",
                       "filter": {"Work": "Un Docteur en herbe"}})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: "Un Docteur en Herbe" is a 19th-century French vaudeville in two acts. Here are the elements based on the text provided:

**Act and Scene**: The play consists of two acts. Specific scenes mentioned in the provided text include Act I Scenes 1, 2, 3, 12, 13, and Act II Scene 15.

**Characters/Singing**:
1. **Derbigny** - Portrayed as an older character (50 years old) interacting with Isidore and Lambert.
2. **Isidore** - Derbigny's son, a law student, who is portrayed as naive and somewhat scatterbrained.
3. **Lambert** - A young doctor, involved in conversation with Derbigny and Isidore, and is characterized by a focus on balance between work and pleasure.
4. **Delapierre Meulière** - Refers to Lambert as a friend; there's some dramatic tension with Isidore involving his daughter.
5. **Pauline** - Derbigny’s daughter, appears in interactions with Isidore and Delapierre Meulière.
6. **Paméla** - A grisette, involved in scenes with Isidore and Lambert.

**Dramatic Situation**: 
- 

In [39]:
result = graph.invoke({"question": "Compare Un Docteur en Herbe to Les Artistes par occasion.",
                       "filter": {"$or": [{"Work": "Un Docteur en herbe"},{"Work": "Les Artistes par occasion"}]}})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: The two plays, "Un Docteur en Herbe" by Duvert et Lausanne and "Les Artistes par Occasion" by Alexandre Duval, offer an insightful glimpse into 19th-century French vaudeville. Below is a comparative analysis based on the available text excerpts.

**Act and Scene Structure:**
- "Un Docteur en Herbe" is structured in at least two acts with multiple scenes, involving a diverse set of characters within different locations like a hotel in Paris and a house in Briare. The excerpts suggest a rich narrative with tight interactions among characters in different settings.
- "Les Artistes par Occasion" is a single-act play set in a garden in Tivoli, near Rome, offering a continuous flow without the segmentation into multiple acts as seen in "Un Docteur en Herbe."

**Characters and Roles:**
- "Un Docteur en Herbe" features characters such as Derbigny, Isidore, Lambert, Pauline, Pamela, and others, engaging in multi-layered relationships, often revolving around family dynamics, profession, 

In [40]:
result = graph.invoke({"question": "Compare the elements of Un Docteur en Herbe to Les Artistes par occasion.",
                       "filter": {"$or": [{"Work": "Un Docteur en herbe"},{"Work": "Les Artistes par occasion"}]}})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: When comparing the elements of "Un Docteur en Herbe" to "Les Artistes par occasion," several aspects are noticeable:

1. **Dramatic Structure:**
   - **Un Docteur en Herbe**: Structured as a comedy-vaudeville in two acts, involving multiple characters and detailed scenes with numerous interactions.
   - **Les Artistes par occasion**: A comedy with elements of ariettes (singing parts), presented in one act, typically showing a lighter and straightforward storyline.

2. **Setting:**
   - **Un Docteur en Herbe**: Its setting transitions from an inn in Paris to the house of Derbigny in Briare, focusing on indoor, familial, and academic environments. 
   - **Les Artistes par occasion**: Set in a picturesque garden in Tivoli, near Rome, offering an outdoor setting associated with art and leisure.

3. **Characters:**
   - **Un Docteur en Herbe**: Features characters like Derbigny, Isidore, Lambert, and Pauline, focusing on familial relationships and professional futures with a comedic

In [8]:
result = graph.invoke({"question": "How do the poetic elements of Un Docteur en Herbe compare to other works?"})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: The poetic elements of "Un Docteur en Herbe" can be compared to other 19th-century French vaudevilles and operettas in various ways, particularly in terms of rhyme scheme, versification, structure, and thematic elements. Here are some points of comparison:

1. **Rhyme Scheme and Meter:**
   - In "Un Docteur en Herbe," the use of regular rhyme schemes (often alternating) is typical of the genre, which emphasizes musicality and the ease of memorization for the audience. For example, the song "Moifumer?paslemoinsdumonde" often follows an ABAB rhyme scheme.
   - This approach mirrors that of other contemporaneous works such as Jacques Offenbach’s operettas, which also employed accessible and catchy verse structures, enhancing their appeal to a wide audience.

2. **Syllable Count and Scansion:**
   - The texts in "Un Docteur en Herbe" typically feature octo-syllabic or decasyllabic lines, creating a rhythmic flow that suits the comedic and often lighthearted nature of the narrative.

In [11]:
result = graph.invoke({"question": "What airs / melodies are present in Les Modistes?", "filter": {"Work": "Les Modistes"}})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: "Les Modistes" is a comical operetta by Hervé, which premiered in 1872. It tells the story of a group of fashionable milliners and the romantic entanglements surrounding them. In examining the musical moments in "Les Modistes," here's a breakdown of the relevant details regarding the airs/mélodies present in the piece:

### Act 1

#### Scene 1
- **Character(s)**: Léon, a wealthy suitor, and the milliners
- **Dramatic Situation**: A lively introduction as Léon meets the milliners and admires their work
- **Name of the Air or Mélodie**: "Les petits mousses" (a well-known air from "Les Huguenots" by Meyerbeer)
- **Poetic Text**: Celebratory lyrics about fashion and beauty
- **Rhyme Scheme**: AABB
- **Form of the Poetic Text**: Strophic with a refrain
- **End Accent for Each Line**: Alternating feminine and masculine accents
- **Syllable Count**: Lines typically range from 8 to 12 syllables
- **Irregularities**: Some lines may extend beyond typical syllable patterns for added empha

In [12]:
result = graph.invoke({"question": "What types of airs / melodies are present in dramatic situations of romance?"})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

Answer: In the context of the 19th-century French vaudeville and opera, various types of airs or melodies are present in dramatic situations of romance. Below are some examples identified from the provided texts, detailing the specific dramatic situations where romance plays a crucial role. 

### Examples of Musical Situations in Romantic Contexts

1. **Title:** Air du Fleuve de la vie  
   - **Act & Scene:** Not specified in the excerpt (likely a romantic exchange).  
   - **Characters Singing:** Roland.  
   - **Dramatic Situation:** Expression of romantic feelings, addressing a love interest.  
   - **Melodie:** "Air du Fleuve de la vie."  
   - **Poetic Text:** 
     ```
     Saisis d'une frayeur nouvelle,  
     Je tremble, et ne lui parle point;  
     Qu'elle est belle!... et pourtant sur elle  
     Tu l'emportes en un seul point.  
     ```
   - **Rhyme Scheme:** AABB  
   - **Form:** Quatrains (4 lines).  
   - **End Accent:** 1 feminine, 3 masculine.  
   - **Syllable Count:

## Fixes

In [3]:
# Setting up chat model
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-4o", model_provider="openai")

# Setting up embeddings
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Reestablishing up our persist directory for Chroma
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where we stored our data before
)

# Reestablishing retrieval and generation functions
from langchain_core.documents import Document
from typing_extensions import List, TypedDict, Optional
from langchain_core.prompts import ChatPromptTemplate

# prompt = ChatPromptTemplate.from_messages([
#     ("system", 'You are an expert in French musical theatre and opera of the 19th century.  You will help us explore the texts (librettos) of vaudevilles, which are dramatic productions that include periodic musical numbers, which might be songs, choruses, instrumental dances, and other sonic events. \n '
#     'Many of these musical moments reuse some preexisting (and often well-known)  melody or tune.  These are variously called “melodie”, or “air”, and identified with a short title that refers in some way to an opera or collection of melodies from which it was drawn.  The titles might include the names of works, or other characters in those original works. '
#     '\n In the context of the plays, these tunes become the vehicle for newly composed lyrics, which are normally rhymed, and which normally follow the poetic scansion and structure of the original lyrics.  Rhyme, versification and structure are thus of interest to us. \n Your task is to identify all the musical situations in each play, reporting the following details in a structured form:'
#     '-the act and scene in which the event takes place'
#     '-the character or characters who are singing (or otherwise making music),'
#     '-the dramatic situation (a love scene, a crowd scene)'
#     '-the name of the air or melodie'
#     '-the poetic text'
#     '-the rhyme scheme'
#     '-form of the poetic text, which might involve some refrain'
#     '-the end accent for each line (masculine or féminine)'
#     '-syllable count for each line'
#     '-any irregularities you notice'
#     'if you cannot find the answers for anything within the context provided, state "I dont know" or similar.\n\n'),
#     ("human", "Context:\n{context}\n\nQuestion: {question}")
# ])

prompt = ChatPromptTemplate.from_messages([
    ("system", 'You are an expert in French musical theatre and opera of the 19th century.  You will help us explore the texts (librettos) of vaudevilles, which are dramatic productions that include periodic musical numbers, which might be songs, choruses, instrumental dances, and other sonic events. \n '
    'Many of these musical moments reuse some preexisting (and often well-known)  melody or tune.  These are variously called "melodie", or "air", and identified with a short title that refers in some way to an opera or collection of melodies from which it was drawn.  The titles might include the names of works, or other characters in those original works. '
    '\n In the context of the plays, these tunes become the vehicle for newly composed lyrics, which are normally rhymed, and which normally follow the poetic scansion and structure of the original lyrics.  Rhyme, versification and structure are thus of interest to us. If there is a mention of "air" or "melodie", look for the tune that the section is sung to, which will be labeled as "air" or "melodie" within the text.\n'
    'If you cannot find the answers for anything within the context provided, state "I dont know" or similar - do not make assumptions beyond what is in the context. Please always mention which play you are referencing in each section of your response, if multiple. The plays are labeled at the top of each source. If you can tell what act / scene and/or page a source is from, cite it too.\n\n'),
    ("human", "Context:\n{context}\n\nQuestion: {question}")
])

class State(TypedDict):
    question: str
    filter: Optional[dict]
    context: List[Document]
    answer: str

def apply_filter(state: State):
    if "filter" in state and state["filter"]:
        state["filter"] = {k: v for k, v in state["filter"].items() if v is not None}
    else:
        state["filter"] = None
    return state

def retrieve(state: State):
    filter_dict = state["filter"] if state.get("filter") else None
    retrieved_docs = vector_store.similarity_search(state["question"], k=15, filter=filter_dict)
    return {"context": retrieved_docs}


retriever = vector_store.as_retriever()

### Retrieval Grader


from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from pydantic import BaseModel, Field


# Data model
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )


# LLM with function call
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

retrieval_grader = grade_prompt | structured_llm_grader

def grade_documents(state):
    """
    Determines whether the retrieved documents are relevant to the question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates documents key with only filtered relevant documents
    """

    print("---CHECK DOCUMENT RELEVANCE TO QUESTION---")
    question = state["question"]
    documents = state["context"]

    # Score each doc
    filtered_docs = []
    for i,d in enumerate(documents):
        score = retrieval_grader.invoke(
            {"question": question, "document": d.page_content}
        )
        grade = score.binary_score
        if grade == "yes":
            print(f"---GRADE {i}: DOCUMENT RELEVANT---")
            filtered_docs.append(d)
        else:
            print(f"---GRADE {i}: DOCUMENT NOT RELEVANT---")
            continue
    return {"documents": filtered_docs, "question": question}

def decide_to_generate(state):
    """
    Determines whether to generate an answer, or re-generate a question.

    Args:
        state (dict): The current graph state

    Returns:
        str: Binary decision for next node to call
    """

    print("---ASSESS GRADED DOCUMENTS---")
    state["question"]
    filtered_documents = state["documents"]

    if not filtered_documents:
        # All documents have been filtered check_relevance
        # We will re-generate a new query
        print(
            "---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, TRANSFORM QUERY---"
        )
        return "transform_query"
    else:
        # We have relevant documents, so generate answer
        print("---DECISION: GENERATE---")
        return "generate"
def generate(state: State):
    docs_content = ""
    for doc in state["context"]:
        docs_content += f"The following text is a chunk from document titled: {doc.metadata["Work"]}. There are a total of {doc.metadata["total_pages"]} pages in this play. \n"
        docs_content += doc.page_content + "\n\n"
    message = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(message)
    return {"answer": response.content}

# Reestablishing langgraph
from langgraph.graph import START, StateGraph

from langgraph.graph import END, StateGraph, START

workflow = StateGraph(State)

# Define the nodes
workflow.add_node("retrieve", retrieve)  # retrieve
workflow.add_node("grade_documents", grade_documents)  # grade documents
workflow.add_node("generate", generate)  # generate

# Build graph
workflow.add_edge(START, "retrieve")
workflow.add_edge("retrieve", "grade_documents")
workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate
)
workflow.add_edge("generate", END)

# Compile
app = workflow.compile()

In [72]:
result = graph.invoke({"question": "What types of airs / melodies are present in dramatic situations of romance?"})

print(f'Answer: {result["answer"]}')
print("\n\nSources:")
for i, source in enumerate(result["context"]):
    print(f'Source {i+1}: {source.metadata["source"]}')

---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
Answer: In the provided excerpts from various plays, several types of airs or melodies are associated with dramatic situations of romance. Here are the notable examples:

### 1. **Coraly**
- **AIR du Fleuve de la vie**: This air is used by Roland to express his feelings about his beloved, indicating a romantic tension as he reflects on his emotions and the beauty of the woman he admires.

### 2. **Le Fifre du roi de Prusse, ou les Prisonniers de Spandau**
