# Dev Notebook

In [4]:
# Directory where the vector store's embeddings index is persisted on disk.
EMBEDDINGS_DIRECTORY = "./vstore"

In [None]:
import json
from langchain.vectorstores import FAISS
from langchain_nomic.embeddings import NomicEmbeddings
from langchain.schema import Document

In [6]:
# Location of the extracted item records (JSON).
# NOTE(review): this is a machine-specific absolute path; repoint it at your
# local copy of the data (ideally a path relative to the project) before running.
DATA_PATH = "/Users/ryan/PycharmProjects/pythonProject/College/projects/rag_omeka/data/extracted_data.json"

# Decode explicitly as UTF-8 rather than relying on the platform default
# encoding: the records contain non-ASCII characters (e.g. curly quotes in the
# citations) that would otherwise be mis-decoded or raise on some systems.
with open(DATA_PATH, encoding="utf-8") as file:
    data = json.load(file)

In [7]:
# Quick size check: print every item's title next to the number of
# whitespace-separated words in its description (0 when there is none).
for item in data:
    title = item.get('Title', 'N/A')
    desc = item.get('Description', '')
    word_count = len(desc.split())
    print(f"Title: {title}, Description token length: {word_count}")

Title: Untitled III, Description token length: 49
Title: Gathering of the Clans, Description token length: 39
Title: Pink Cone, Description token length: 82
Title: Sunliners #7, Description token length: 76
Title: Van Ness, Santa Monica, Melrose, Description token length: 78
Title: Melrose, Market, Description token length: 78
Title: Untitled, Description token length: 30
Title: Desolate, Description token length: 75
Title: Ear 1968, Description token length: 55
Title: Palo Alto, Description token length: 57
Title: Untitled, Description token length: 35
Title: Untitled, Description token length: 35
Title: Manhattan, Description token length: 60
Title: Untitled, Description token length: 37
Title: The Nut Trees, Description token length: 74
Title: Child and Star, Description token length: 43
Title: Diver (Masthead Diver), Description token length: 47
Title: Untitled, Description token length: 33
Title: Pulsar, Description token length: 47
Title: Ten Winter Tools: Snips, Description toke

In [11]:
# Build one Document per record in `data`.

# (label, fallback) pairs rendered as "Label: value" lines, in this order, to
# form the text that gets embedded. The label doubles as the record key.
_CONTENT_FIELDS = (
    ("Title", "Unknown"),
    ("Creator", "Unknown"),
    ("Date", "Unknown"),
    ("Medium", "Unknown"),
    ("Format", "Unknown"),
    ("Subject", "Unknown"),
    ("Description", "No description provided."),
)

# (metadata key, record key) pairs, in the order they appear in the metadata dict.
_METADATA_FIELDS = (
    ("id", "Identifier"),
    ("title", "Title"),
    ("creator", "Creator"),
    ("date", "Date"),
    ("medium", "Medium"),
    ("format", "Format"),
    ("subject", "Subject"),
    ("donor", "Donor"),
    ("citation", "Citation"),
    ("tags", "Tags"),
    ("collection_link", "Collection Link"),
)

docs = []
for art in data:
    # Readable multi-line summary used as the embedded text (page_content).
    page_content = "\n".join(
        f"{label}: {art.get(label, fallback)}"
        for label, fallback in _CONTENT_FIELDS
    )

    # Structured fields carried alongside the text. Missing keys become None,
    # except "tags", which falls back to a fresh empty list for each record.
    metadata = {
        meta_key: art.get(record_key, [] if meta_key == "tags" else None)
        for meta_key, record_key in _METADATA_FIELDS
    }

    docs.append(Document(page_content=page_content, metadata=metadata))

In [None]:
# Embed every document with the locally-run Nomic model and index the vectors in FAISS.
embedding_model = NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local")
vectorstore = FAISS.from_documents(
    documents=docs,
    embedding=embedding_model
)
# Persist the FAISS index to disk
vectorstore.save_local(EMBEDDINGS_DIRECTORY)

# Create retriever.
# The number of results has to be passed through `search_kwargs`; a bare `k=3`
# keyword is not a search setting, so the retriever would fall back to its
# default top-k instead of returning 3 documents.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

Embedding texts: 100%|██████████| 99/99 [00:03<00:00, 28.40inputs/s]


In [14]:
# Smoke-test the retriever with a broad query. `invoke` is the supported entry
# point; `get_relevant_documents` is deprecated, and the other query in this
# notebook already uses `invoke`.
retriever.invoke("find photos")

Embedding texts: 100%|██████████| 1/1 [00:00<00:00, 18.34inputs/s]


[Document(metadata={'id': '2019.13.38', 'title': None, 'creator': 'Felice Beato', 'date': '1860s', 'medium': 'Albumen photoprint', 'format': '9 1/4" x 11 1/2"', 'subject': 'Graphic Arts-Photos', 'donor': 'Miami University Art Museum Purchase', 'citation': 'Felice Beato, “[Untitled],”Richard and Carole Cocks Art Museum at Miami University, accessed May 23, 2025,https://miamiuniversityartmuseum.omeka.net/items/show/20613.', 'tags': [], 'collection_link': None}, page_content='Title: Unknown\nCreator: Felice Beato\nDate: 1860s\nMedium: Albumen photoprint\nFormat: 9 1/4" x 11 1/2"\nSubject: Graphic Arts-Photos\nDescription: Looking under an arch to another building, presumably a temple'),
 Document(metadata={'id': '2019.23.11', 'title': 'Searching for the Civil Rights Workers', 'creator': 'Steve Schapiro', 'date': '1964', 'medium': 'Silver Gelatin print', 'format': 'Paper Size: 15 3/4" x 19 11/16" (40 x 50 cm)', 'subject': 'Graphic Arts-photos', 'donor': 'Partial Gift of Stephen Schapiro an

In [16]:
# Second retrieval check. The intent is to get 3 documents back — confirm by
# checking the length of the result against the retriever's configured top-k.
religious_art_query = "Find me religious art century"
retriever.invoke(religious_art_query)

Embedding texts: 100%|██████████| 1/1 [00:00<00:00, 25.05inputs/s]


[Document(id='b15c8f05-7eef-4e3a-9706-fa9a30c6dd01', metadata={'id': '2019.13.38', 'title': None, 'creator': 'Felice Beato', 'date': '1860s', 'medium': 'Albumen photoprint', 'format': '9 1/4" x 11 1/2"', 'subject': 'Graphic Arts-Photos', 'donor': 'Miami University Art Museum Purchase', 'citation': 'Felice Beato, “[Untitled],”Richard and Carole Cocks Art Museum at Miami University, accessed May 23, 2025,https://miamiuniversityartmuseum.omeka.net/items/show/20613.', 'tags': [], 'collection_link': None}, page_content='Title: Unknown\nCreator: Felice Beato\nDate: 1860s\nMedium: Albumen photoprint\nFormat: 9 1/4" x 11 1/2"\nSubject: Graphic Arts-Photos\nDescription: Looking under an arch to another building, presumably a temple'),
 Document(id='b8478d8b-3660-4760-a8d5-aa3a34146826', metadata={'id': '2019.13.35', 'title': 'Temples at Kamakura', 'creator': 'Felice Beato', 'date': '1860s', 'medium': 'Albumen photoprint', 'format': '9 3/8" x 11 1/2"', 'subject': 'Graphic Arts-Photos', 'donor': 