# Test Project: Haverford College Concert Programs
Small Sample

In [1]:
# Setting up chat model
import getpass
import os

# Ask for the key interactively only when the environment does not already
# provide one, so the secret never has to be typed into the notebook itself.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

# Chat model used later by the answer-generation step.
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

In [2]:
# Setting up embeddings
import getpass
import os

# Ask for the key interactively only when the environment does not already
# provide one (lets this cell run on its own).
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

# Embedding model handed to the vector store as its embedding function.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [3]:
# Setting up chroma
from langchain_chroma import Chroma

# Chroma collection used for storage and similarity search; `embeddings` is
# passed as its embedding function and the data is kept in a local directory.
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

## Indexing

### Loading Files

In [4]:
# Setting up csv loading

from langchain_community.document_loaders.csv_loader import CSVLoader

def loadCSV(filepath: str) -> list:
    """Load the CSV at ``filepath`` with ``CSVLoader``.

    The ``Filename`` column is given to the loader as ``source_column`` and the
    ``Category``, ``Year`` and ``Term`` columns as ``metadata_columns``.

    Returns:
        Whatever ``CSVLoader.load()`` returns for that file.
    """
    extra_columns = ["Category", "Year", "Term"]
    return CSVLoader(
        file_path=filepath,
        source_column="Filename",
        metadata_columns=extra_columns,
    ).load()


In [None]:
# Load CSV

csv_metadata = loadCSV("HC Concert Programs/Files/programs_metadata.csv")
# Spot-check the result: source of the first entry, category of the second.
print(csv_metadata[0].metadata['source'])
print(csv_metadata[1].metadata['Category'])

5. Orchestra Program Fall 2014.pdf
Orchestra


In [None]:
# Setting up pdf loading

from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders import PDFPlumberLoader

# async def loadPDF(filepath: str) -> list:
#     loader = PyPDFLoader(filepath)
#     pages = []
#     async for page in loader.alazy_load():
#         pages.append(page)   
#     return pages

async def loadPDF(filepath: str) -> list:
    """Asynchronously read the PDF at ``filepath`` with ``PDFPlumberLoader``.

    Returns:
        A list of every item yielded by the loader's ``alazy_load()``, in the
        order it was yielded.
    """
    return [page async for page in PDFPlumberLoader(filepath).alazy_load()]


def get_files_from_directory(directory_path: str) -> list[str]:
    """Return the paths of the regular files directly inside ``directory_path``.

    Sub-directories are skipped and the directory is not searched recursively.
    Each returned path is ``directory_path`` joined with the file name.

    The result is sorted. ``os.listdir`` returns entries in arbitrary order,
    and later cells line these files up with other data by position, so the
    order must be the same on every run and on every machine.

    Args:
        directory_path: Directory to list.

    Returns:
        Sorted list of file paths; empty if the directory holds no files.

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. the directory is missing).
    """
    candidates = (
        os.path.join(directory_path, entry) for entry in os.listdir(directory_path)
    )
    return sorted(path for path in candidates if os.path.isfile(path))


# Relative path of the folder that holds the PDFs, and the files found in it.
directory_path:str = "HC Concert Programs/Files/PDFs"
files: list[str] = get_files_from_directory(directory_path)

In [7]:
# Load PDFs w/out metadata
# loaded_PDFs gets one entry per path in `files`; each entry is the page list
# returned by loadPDF for that file, in the same order as `files`.
loaded_PDFs: list = []
for file in files:
    # Top-level `await`: only valid where the kernel supports it (as in a notebook).
    pages = await loadPDF(file)
    loaded_PDFs.append(pages)

In [8]:
# Print the concatenated page text of every loaded PDF
for doc_index, pdf_pages in enumerate(loaded_PDFs):
    print(f"Document {doc_index} \n")
    # Pages are joined with no separator, in page order.
    full_text = "".join(pdf_page.page_content for pdf_page in pdf_pages)
    print(full_text + "\n \n")

Document 0 

Department of Music at Haverford College presents
Haverford-­‐Bryn Mawr Orchestra
FALL CONCERT
Heidi Jacob, conductor
Friday, November 21, 2014
8:00 p.m.
HAVERFORD COLLEGE – ROBERTS HALL, MARSHALL AUDITORIUM

Haverford-­‐Bryn Mawr College Orchestra
Fall Concert
Friday, November 21, 2014
Heidi Jacob, Conductor
Robert Hillinck, Conductor
Program
Overture to Idomeneo, K. 366 Wolfgang Amadeus Mozart
Robert Hillinck, Conductor (1756 – 1791)
Sinfonia in B flat major, Op. 18, No. 2 (Overture to Lucio Silla) Johann Christian Bach
Allegro assai (1735 – 1782)
Andante
Presto
Music for Solo Trumpet, Solo Percussion and Orchestra Charles Cacioppo
Scott Robinson, Percussion (b. 1983)
Charles Cacioppo, Trumpet
INTERMISSION
Romeo and Juliet Overture-­‐Fantasy in B minor (1880) Pyotr Ilyich Tchaikovsky
(1840 – 1893)
Robert (BJ) Hillinick: is a Senior at Haverford College pursuing a History major and a Music minor. In
addition to playing flute in the Bi-­‐Co Orchestra, he sings in the Chamb

In [None]:
# Cleaning the sources to match with the CSV source names
# Keep only the file name of each page's source path. os.path.basename is used
# instead of stripping a hard-coded "HC Concert Programs/Files/PDFs/" prefix:
# that prefix silently fails to match when os.path.join uses a different
# separator or when the PDF directory changes. Re-running this cell is harmless.
for source in loaded_PDFs:
    for page in source:
        page.metadata['source'] = os.path.basename(page.metadata['source'])
        print(page.metadata['source'])

5. Orchestra Program Fall 2014.pdf
5. Orchestra Program Fall 2014.pdf
5. Orchestra Program Fall 2014.pdf
5. Orchestra Program Fall 2014.pdf
5. Orchestra Program Fall 2014.pdf
5. Orchestra Program Fall 2014.pdf
5. Orchestra Program Fall 2014.pdf
5. Philly Orch Winds Program.pdf
5. The Crossing Program.pdf
5. The Crossing Program.pdf
5. The Crossing Program.pdf
5. The Crossing Program.pdf
5. The Crossing Program.pdf
5. The Crossing Program.pdf
5. The Crossing Program.pdf
5. The Crossing Program.pdf
5. The Crossing Program.pdf
5. The Crossing Program.pdf
5. The Crossing Program.pdf
6. Chamber Orchestra First Editions Program Fall 2018.pdf
6. Chamber Orchestra First Editions Program Fall 2018.pdf
6. Chorale Program Fall 2012.pdf
6. Chorale Program Fall 2012.pdf
6. Chorale Program Fall 2012.pdf
6. Chorale Program Fall 2012.pdf
6. Chorale Program Fall 2012.pdf
6. Chorale Program Fall 2012.pdf
6. Chorale Program Fall 2012.pdf
6. Chorale Program Fall 2012.pdf
6. Chorale Program Fall 2014.pdf
6

#### Creating new Documents with CSV metadata + PDF content

In [10]:
# Creating new documents with PDF content and CSV metadata
from langchain.schema import Document
def create_documents_from_pdf_and_csv(pdf_pages: list, csv_metadata: list) -> list[Document]:
    """Merge each PDF's page text with its CSV metadata row into one Document.

    Args:
        pdf_pages: One list of page documents per PDF. Every page must expose
            ``page_content`` and ``metadata['source']``.
        csv_metadata: CSV row documents whose ``metadata`` holds ``source``,
            ``Category``, ``Year`` and ``Term``.

    Returns:
        One Document per PDF, in the order of ``pdf_pages``. Its content is the
        page texts concatenated in page order with no separator; its metadata
        holds ``source``, ``Category``, ``Year`` and ``Term``.

    Raises:
        ValueError: if a PDF has no pages, if its pages do not all share one
            ``source``, or if no CSV row has that ``source``.
    """
    # First CSV row seen for each source name. Used when list positions do not
    # line up, so the pairing no longer depends on directory listing order.
    metadata_by_source: dict = {}
    for row in csv_metadata:
        metadata_by_source.setdefault(row.metadata['source'], row.metadata)

    documents = []
    for index, pages in enumerate(pdf_pages):
        if not pages:
            raise ValueError(
                f"PDF at index {index} has no pages, so its source cannot be determined"
            )

        source = pages[0].metadata['source']
        if any(page.metadata['source'] != source for page in pages):
            raise ValueError(
                f"Pages of PDF at index {index} do not all share the source {source!r}"
            )

        # Prefer the row at the same position when it matches, so inputs that
        # were already aligned produce exactly the same result as before.
        if index < len(csv_metadata) and csv_metadata[index].metadata['source'] == source:
            metadata = csv_metadata[index].metadata
        else:
            metadata = metadata_by_source.get(source)
        if metadata is None:
            raise ValueError(
                f"Source mismatch between PDF and CSV metadata: no CSV row for {source!r}"
            )

        documents.append(
            Document(
                page_content="".join(page.page_content for page in pages),
                metadata={
                    "source": source,
                    "Category": metadata['Category'],
                    "Year": metadata['Year'],
                    "Term": metadata['Term'],
                },
            )
        )
    return documents

In [11]:
docs = create_documents_from_pdf_and_csv(loaded_PDFs, csv_metadata)
# Count the documents already built instead of building every one a second time.
print(f"Created {len(docs)} documents from PDF and CSV metadata.")
print(docs[0])

Created 21 documents from PDF and CSV metadata.
page_content='Department of Music at Haverford College presents
Haverford-­‐Bryn Mawr Orchestra
FALL CONCERT
Heidi Jacob, conductor
Friday, November 21, 2014
8:00 p.m.
HAVERFORD COLLEGE – ROBERTS HALL, MARSHALL AUDITORIUM

Haverford-­‐Bryn Mawr College Orchestra
Fall Concert
Friday, November 21, 2014
Heidi Jacob, Conductor
Robert Hillinck, Conductor
Program
Overture to Idomeneo, K. 366 Wolfgang Amadeus Mozart
Robert Hillinck, Conductor (1756 – 1791)
Sinfonia in B flat major, Op. 18, No. 2 (Overture to Lucio Silla) Johann Christian Bach
Allegro assai (1735 – 1782)
Andante
Presto
Music for Solo Trumpet, Solo Percussion and Orchestra Charles Cacioppo
Scott Robinson, Percussion (b. 1983)
Charles Cacioppo, Trumpet
INTERMISSION
Romeo and Juliet Overture-­‐Fantasy in B minor (1880) Pyotr Ilyich Tchaikovsky
(1840 – 1893)
Robert (BJ) Hillinick: is a Senior at Haverford College pursuing a History major and a Music minor. In
addition to playing flut

### Chunking Files

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split every merged document into overlapping character chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)

# Report the actual number of input documents rather than a hard-coded 21,
# so the message stays correct when the PDF folder changes.
print(f"Split {len(docs)} PDFs into {len(all_splits)} sub-documents.")

Split 21 PDFs into 372 sub-documents.


### Storing Documents

In [13]:
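# Re-running this cell after re-creating the variables embeds every chunk again: it adds duplicate documents to the persisted collection and incurs another embedding charge, so it is left commented out.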


# document_ids = vector_store.add_documents(documents=all_splits) 
# print(document_ids[:3])

## Retrieval and Generation

In [None]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict

from langchain_core.prompts import ChatPromptTemplate

# Message templates for the answer step: the system message restricts the model
# to the supplied context, the human message carries {context} and {question}.
SYSTEM_INSTRUCTIONS = "You are a helpful assistant. Use only the information provided in the context below to answer the question. If the answer is not in the context, say 'I don't know' or 'The information is not available.'"
HUMAN_TEMPLATE = "Context:\n{context}\n\nQuestion: {question}"

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_INSTRUCTIONS),
        ("human", HUMAN_TEMPLATE),
    ]
)

class State(TypedDict):
    """State dictionary read and updated by ``retrieve`` and ``generate``."""
    question: str  # read by retrieve() and generate()
    context: List[Document]  # returned by retrieve(), read by generate()
    answer: str  # returned by generate()

def retrieve(state: State):
    """Fetch the stored chunks most similar to the question.

    Runs ``vector_store.similarity_search`` on ``state["question"]`` with
    ``k=4`` and returns the result under the ``"context"`` key.
    """
    question = state["question"]
    return {"context": vector_store.similarity_search(question, k=4)}

def generate(state: State):
    """Answer the question from the retrieved context.

    Joins the ``page_content`` of every item in ``state["context"]`` with blank
    lines, fills ``prompt`` with that text and ``state["question"]``, invokes
    ``llm`` on the result and returns the response's ``content`` under the
    ``"answer"`` key.
    """
    context_text = "\n\n".join(chunk.page_content for chunk in state["context"])
    filled_prompt = prompt.invoke(
        {"question": state["question"], "context": context_text}
    )
    return {"answer": llm.invoke(filled_prompt).content}


In [15]:
from langgraph.graph import START, StateGraph

# Wire the pipeline: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [22]:
# Show the compiled graph as ASCII art to confirm the node order.
print(graph.get_graph().draw_ascii())

+-----------+  
| __start__ |  
+-----------+  
      *        
      *        
      *        
+----------+   
| retrieve |   
+----------+   
      *        
      *        
      *        
+----------+   
| generate |   
+----------+   
      *        
      *        
      *        
 +---------+   
 | __end__ |   
 +---------+   


## Testing

In [None]:
# result = graph.invoke({"question": "Who played trumpet for the orchestra in 2019?"})

# print(f'Context: {result["context"]}\n\n')
# print(f'Answer: {result["answer"]}')

Context: [Document(id='f2c8b48a-c77a-4800-b8cf-c0efb9746d06', metadata={'Category': 'Orchestra', 'Year': '2014', 'start_index': 3401, 'Term': 'Fall', 'source': '5. Orchestra Program Fall 2014.pdf'}, page_content='VIOLIN Kendall Chambers ’18 HORN\nYue Yang, Co-\xad‐Concertmistress Kate Hutchison, Post Bac, BMC Abigail Healy ’15, Principal\nDora von Trentini, Co-\xad‐Concertmistress Katherine Allen ’17\nEsther Mildenhall, Co-\xad‐Associate CELLO Jonathan Cookmeyer ’17\nConcertmistress Samuel Walter Principal ’17 Kristina Gannon, Guest\nHelen Jung, Co-\xad‐Associate Concertmistress Sean Woodruff, Associate Principal\nKatherine Lee, Principal Second Violin Natalie Martin ’15, Assistant Principal TRUMPET\nOlivia DuSold, Associate Principal Olivia Rauss ’15 Kyle Albagli ’16, Principal\nMalia Wenny, Co-\xad‐Principal Second Xuenan Ni ’16 Jeffrey McGeehan ’17\nJennifer Jolivert, Associate Principal Chris Nagele ’16\nNicole Westerduin, Assistant Principal Sedi Agawu ’17 TROMBONE\nVanessa Felso 

In [None]:
# result = graph.invoke({"question": "How did the Haverford College Orchestra trumpet section change over the years?"})

# print(f'Context: {result["context"]}\n\n')
# print(f'Answer: {result["answer"]}')

Context: [Document(id='535848d2-7165-4fde-824a-93fdb9e9f265', metadata={'Term': 'Fall', 'Year': '2016', 'source': '5. Philly Orch Winds Program.pdf', 'start_index': 0, 'Category': 'Orchestra'}, page_content='The Department of Music at Haverford College hosts\nPhiladelphia Orchestra Audience Appreciation Concert\nTuesday, October 4, 2016\n7:30 PM\nHaverford College – Roberts Hall, Marshall Auditorium\nPerformers\nJonathan Blumenfeld HC ’78, oboe\nDavid Cramer, flute\nSam Caviezel, clarinet\nAngela Anderson Smith, bassoon\nJeffrey Lang, horn\nProgram\nKleine Kammermusik Op. 24, No. 2 (1922) Paul Hindemith\nI. Lustig. Mäßig schnell Viertel (1895 – 1963)\nII. Walzer: Durchweg sehr leise\nIII. Ruhig und einfach\nIV. Schnelle Viertel\nV. Sehr lebhaft\nPetite Offrande Musicale (1943) Nino Rota\n(1911 – 1979)\nPastorale for Wind Quintet, Op. 21 Vincent Persichetti\n(1915 – 1987)\nSuite for Winds No. 1, Op. 57 (1910) Charles -Édouard Lefebvre\nI. Moderato (1843 – 1917)\nII. Allegretto scherzand

In [None]:
# result = graph.invoke({"question": "What type of music does the Haverford-Bryn Mawr College Orchestra tend to play?"})

# print(f'Context: {result["context"]}\n\n')
# print(f'Answer: {result["answer"]}')

Context: [Document(id='615f666a-6652-41f6-ac08-fa6bf4bff46f', metadata={'start_index': 0, 'Category': 'Orchestra', 'source': '7. Orchestra Program Fall 2018.pdf', 'Term': 'Fall', 'Year': '2018'}, page_content='The Department of Music at Haverford College presents\nThe Haverford-Bryn Mawr College Orchestra\nHeidi Jacob, conductor\nFALL CONCERT\nFriday, November 16, 2018\n8:00 p.m.\nHaverford College – Roberts Hall, Marshall Auditorium'), Document(id='8f77066e-4f40-4cd9-b4c6-5db341fba6f5', metadata={'start_index': 0, 'source': '5. Orchestra Program Fall 2014.pdf', 'Term': 'Fall', 'Year': '2014', 'Category': 'Orchestra'}, page_content='Department of Music at Haverford College presents\nHaverford-\xad‐Bryn Mawr Orchestra\nFALL CONCERT\nHeidi Jacob, conductor\nFriday, November 21, 2014\n8:00 p.m.\nHAVERFORD COLLEGE – ROBERTS HALL, MARSHALL AUDITORIUM'), Document(id='535848d2-7165-4fde-824a-93fdb9e9f265', metadata={'Category': 'Orchestra', 'source': '5. Philly Orch Winds Program.pdf', 'Term'

In [None]:
# result = graph.invoke({"question": "Who has been featured in the most overall performances?"})

# print(f'Context: {result["context"]}\n\n')
# print(f'Answer: {result["answer"]}')

Context: [Document(id='95950e97-6555-4128-9589-6e2a92fd6062', metadata={'source': '7. Borromeo String Quartet Program.pdf', 'Category': 'Chamber', 'Term': 'Fall', 'Year': '2016', 'start_index': 16124}, page_content='Mexico, the Bermuda Festival of the Performing Arts, and the Terra di Siena Chamber Music Festival\nin Tuscany. The season welcomes multiple performances with clarinetist Richard Stoltzman, and\nspecial collaborations with the Bill T. Jones Dance Company, the Chicago Chamber Musicians,\nand also with cellist Antonio Lysy in a special multimedia production, Te Amo, Argentina.\nRecent highlights include a two-week residency at Suntory Hall in Tokyo to perform the complete\nBeethoven String Quartets, a cycle of Dvorak quartets at the Isabella Stewart Gardner Museum in\nBoston, the complete Bartok quartet cycle at the Curtis Institute of Music, performances at the\nInternational MIMO Festival in Brazil, the Morgan Library in New York, the Freer Gallery in\nWashington, D.C., and

In [None]:
# result = graph.invoke({"question": "Who is Heidi Jacob?"})

# print(f'Context: {result["context"]}\n\n')
# print(f'Answer: {result["answer"]}')

Context: [Document(id='8f77066e-4f40-4cd9-b4c6-5db341fba6f5', metadata={'start_index': 0, 'Term': 'Fall', 'source': '5. Orchestra Program Fall 2014.pdf', 'Category': 'Orchestra', 'Year': '2014'}, page_content='Department of Music at Haverford College presents\nHaverford-\xad‐Bryn Mawr Orchestra\nFALL CONCERT\nHeidi Jacob, conductor\nFriday, November 21, 2014\n8:00 p.m.\nHAVERFORD COLLEGE – ROBERTS HALL, MARSHALL AUDITORIUM'), Document(id='615f666a-6652-41f6-ac08-fa6bf4bff46f', metadata={'Category': 'Orchestra', 'source': '7. Orchestra Program Fall 2018.pdf', 'Term': 'Fall', 'start_index': 0, 'Year': '2018'}, page_content='The Department of Music at Haverford College presents\nThe Haverford-Bryn Mawr College Orchestra\nHeidi Jacob, conductor\nFALL CONCERT\nFriday, November 16, 2018\n8:00 p.m.\nHaverford College – Roberts Hall, Marshall Auditorium'), Document(id='c0adf302-f603-420a-b380-5a93df1525c2', metadata={'Year': '2012', 'Category': 'Cboral', 'source': '6. Chorale Program Fall 2012

In [None]:
# result = graph.invoke({"question": "Tell me about Andrew Cornell."})

# print(f'Context: {result["context"]}\n\n')
# print(f'Answer: {result["answer"]}')

Context: [Document(id='c80e1d70-3e6e-49b4-9559-7b0f32bd147b', metadata={'Term': 'Fall', 'Year': '2015', 'source': '7. Binchois Consort Program.pdf', 'start_index': 53210, 'Category': 'Choral'}, page_content='Oxford, where he began his lifelong exploration of early repertoires.\nHe has sung as a lay clerk at Westminster Cathedral, and with most of the early music vocal ensembles in the UK, in\naddition to his long-standing work with the Belgian Huelgas Ensemble. Concert performances and recordings have taken him\nthroughout Europe (Austria, Belgium, Croatia, Cyprus, Czech Republic, Denmark, Estonia, Finland, France, Germany,\nGreece, Holland, Iceland, Italy, Lithuania, Poland, Portugal, Norway, Spain, Sweden, Switzerland) as well as to Israel,\nRussia, Canada, Australia, Japan and most states of the USA.\nAndrew Kirkman, conductor\nAndrew Kirkman studied at the universities of Durham, London (King’s College) and Princeton, and has worked at the\nuniversities of Manchester, Wales, Oxford

In [None]:
# result = graph.invoke({"question": "Tell me about the haverford-bryn mawr college orchestra trumpet player Andrew Cornell."})

# print(f'Context: {result["context"]}\n\n')
# print(f'Answer: {result["answer"]}')

Context: [Document(id='3ce2e8d2-1b3e-4fbe-9795-11d72a8659c9', metadata={'Year': '2019', 'Category': 'Orchestra', 'start_index': 782, 'source': '7. Orchestra Program Fall 2019.pdf', 'Term': 'Fall'}, page_content="3. Anitras dans (Anitra's Dance)\nHAVERFORD BRYN MAWR COLLEGE ORCHESTRA\nDiane Moore, HC ’20, Co-Concertmistress\nMeg Bowen, HC ’23, Co-Concertmistress\nLogan Chin, HC ’23, Co-Concertmaster\nJackie Toben, BMC ’22, Associate Principal First Violin\nEmilia Zegers Matthews, HC ’23, Principal Second Violin\nDevi Namboodiri, HC ’21, Co-Associate Second Violin\nKatherine Hahn, BMC ’23, Co-Associate Principal Second\nCodie Collins, HC ’21, Assistant Principal Second\nVIOLIN Erica Fenton, HC ’22 BASSOON\nDiane Moore, HC ’20 Jenna Loh, BMC ’23 Julie Clement, Guest\nEsther Kim, BMC ’20 Rosalie Tarsala, BMC ’23 Olivia Cleri, Guest\nLauren De La Ossa, HC ’20\nLeila Siegel, BMC ’20 STRING BASS HORN\nMarisa LaBarca, HC ’21 Hunter Logan, HC ’22 Anna Thompson, BMC ’22, Co-\nDevi Namboodiri, HC