In [None]:
from bs4 import BeautifulSoup
import requests

In [53]:
# Listing page of the museum's Omeka site; item links are harvested from it.
browse = "https://miamiuniversityartmuseum.omeka.net" + "/items/browse"

In [54]:
# Download the browse page. The timeout keeps the cell from hanging
# indefinitely if the server stops responding.
response = requests.get(browse, timeout=30)
if response.status_code != 200:
    # Raise instead of calling exit(): inside Jupyter, exit() asks the kernel
    # to shut down, which throws away all notebook state instead of just
    # stopping this cell.
    raise RuntimeError(
        f"Failed to retrieve data from the URL. Status code: {response.status_code}"
    )

# Parse the listing HTML so the item links can be pulled out of it.
soup = BeautifulSoup(response.text, 'html.parser')

In [55]:
# Collect the relative URLs of the item detail pages linked from the browse page.
# A listing may link the same item more than once (for example from both a
# thumbnail and a title), so repeated hrefs are dropped while keeping
# first-seen order; otherwise each repeat would be downloaded and saved again.
item_hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
link_on_pages = list(dict.fromkeys(
    href for href in item_hrefs
    if href and href.startswith('/items/show/')
))

In [56]:
# Download and parse every item detail page found on the browse page.
soups = []

for link in link_on_pages:
    # Hrefs on the listing are site-relative, so prepend the host.
    full_link = "https://miamiuniversityartmuseum.omeka.net" + link
    # Distinct names keep the browse-page `response` / `soup` from being
    # overwritten by the last item fetched here.
    item_response = requests.get(full_link, timeout=30)
    if item_response.status_code != 200:
        # Raise instead of exit(): in Jupyter, exit() shuts the kernel down.
        raise RuntimeError(
            f"Failed to retrieve data from the URL. Status code: {item_response.status_code}"
        )

    soups.append(BeautifulSoup(item_response.text, 'html.parser'))

In [57]:
def extract_data_from_soup(soup):
    """Pull the displayed metadata of one item page into a dict.

    Args:
        soup: Parsed item page (a BeautifulSoup document, or any object with
            compatible ``find`` / ``find_all`` methods).

    Returns:
        dict: Always contains ``'Tags'`` (a possibly empty list of tag
        strings). Each label from the field table below is present only when
        the page has that field's container div *and* an ``element-text`` div
        inside it. ``'Collection Link'`` is present only when the
        ``collection`` div contains an anchor with an ``href`` attribute.
    """
    # Container div id on the page -> key used in the returned dict.
    fields = {
        'dublin-core-title': 'Title',
        'dublin-core-identifier': 'Identifier',
        'dublin-core-subject': 'Subject',
        'dublin-core-description': 'Description',
        'dublin-core-creator': 'Creator',
        'dublin-core-format': 'Format',
        'dublin-core-date': 'Date',
        'dublin-core-medium': 'Medium',
        'physical-object-item-type-metadata-donor': 'Donor',
        'item-citation': 'Citation'
    }

    data = {}
    for field_id, field_name in fields.items():
        element = soup.find('div', {'id': field_id})
        if element is None:
            continue
        text_element = element.find('div', {'class': 'element-text'})
        # A container without a text div is skipped; calling get_text() on the
        # None returned by find() would raise AttributeError.
        if text_element is None:
            continue
        data[field_name] = text_element.get_text(strip=True)

    # Every anchor marked rel="tag" contributes one tag string.
    data['Tags'] = [tag.get_text(strip=True) for tag in soup.find_all('a', {'rel': 'tag'})]

    # Image URL extraction is currently disabled:
    # image_element = soup.find('div', {'id': 'item-images'})
    # if image_element:
    #     data['Image URL'] = image_element.find('a')['href']

    # The collection link is optional; tolerate a missing anchor or href
    # instead of failing on None['href'] / a missing attribute.
    collection_element = soup.find('div', {'id': 'collection'})
    if collection_element is not None:
        collection_anchor = collection_element.find('a')
        href = collection_anchor.get('href') if collection_anchor is not None else None
        if href is not None:
            data['Collection Link'] = href

    return data


In [73]:
import json
import os

# Initialize a list to store all extracted data
all_data = []

# Extract one metadata dict per parsed item page
for soup in soups:
    data = extract_data_from_soup(soup)
    all_data.append(data)

# open(..., "w") does not create missing parent directories, so make sure
# "data/" exists before writing; on a fresh checkout it may not.
os.makedirs("data", exist_ok=True)

# Save the extracted data to a JSON file
with open("data/extracted_data.json", "w", encoding="utf-8") as json_file:
    json.dump(all_data, json_file, ensure_ascii=False, indent=4)

print("Data saved to data/extracted_data.json")

Data saved to data/extracted_data.json


In [59]:
import os
import json

# Ensure the "data" directory exists
os.makedirs("data", exist_ok=True)

# Save every extracted record. `all_data` is the full list; `data` is only the
# last item's dict left over from the extraction loop, and dumping it would
# overwrite the file with a single record.
with open("data/extracted_data.json", "w", encoding="utf-8") as json_file:
    json.dump(all_data, json_file, ensure_ascii=False, indent=4)

print("Data saved to data/extracted_data.json")

Data saved to data/extracted_data.json


# LLM

In [60]:
from langchain_ollama import OllamaLLM
from langchain_ollama.embeddings import OllamaEmbeddings

# One Ollama model name shared by the text generator and the embedder.
ollama_model_name = "llama3"
model = OllamaLLM(model=ollama_model_name)
embeddings = OllamaEmbeddings(model=ollama_model_name)


In [61]:
# Smoke test: send a plain question (no scraped data involved) to confirm the
# model is reachable and answering.
smoke_question = "what is the capital of france"
response = model.invoke(smoke_question)
print(response)

The capital of France is Paris.


In [62]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load the saved PDF and split it into documents.
pdf_path = "data/omeka_pdf.pdf"
loader = PyMuPDFLoader(pdf_path)
pages = loader.load_and_split()
pages

[Document(metadata={'producer': 'Skia/PDF m135', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', 'creationdate': '2025-04-10T04:19:39+00:00', 'source': 'data/omeka_pdf.pdf', 'file_path': 'data/omeka_pdf.pdf', 'total_pages': 3, 'format': 'PDF 1.4', 'title': 'Pink Cone · Richard and Carole Cocks Art Museum at Miami University', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-04-10T04:19:39+00:00', 'trapped': '', 'modDate': "D:20250410041939+00'00'", 'creationDate': "D:20250410041939+00'00'", 'page': 0}, page_content='Skip to main content\nRichard and Carole Cocks Art Museum at Miami University\nMenu\nBrowse Exhibits\nBrowse Collections\nBrowse Items\nArt Museum Website\n \nSearch using this query type:\nKeyword\nBoolean\nExact match\nSearch only these record types:\n Item\n File\n Collection\n Simple Page\n Exhibit\n Exhibit Page\nAdvanced Search (Items only)\nPink Cone\nTitle\nPink Cone\nId

In [63]:
from langchain.prompts import PromptTemplate
# Prompt skeleton with two slots: the supporting context and the user's question.
template = (
    "\n"
    "You are a helpful assistant. You will be provided with a question and some context.\n"
    "Please answer the question based on the context.\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "\n"
)
prompt = PromptTemplate.from_template(template)

# Render once with placeholder values to check the final prompt text.
sample_prompt = prompt.format(
    context="This is some context ",
    question="What is the capital of France?",
)
print(sample_prompt)



You are a helpful assistant. You will be provided with a question and some context.
Please answer the question based on the context.
Context: This is some context 
Question: What is the capital of France?




In [64]:
# Compose the prompt template and the LLM into one runnable: the chain takes
# the template's variables ("context", "question") as a dict.
chain = prompt | model

In [65]:
# Sanity check of the prompt -> model chain with a hand-written context.
demo_inputs = {
    "context": "The name I was given was Ryan SIngh",
    "question": "What is my name?",
}
chain.invoke(demo_inputs)


'According to the context, your name is Ryan Singh!'

In [66]:
from langchain_community.vectorstores import DocArrayInMemorySearch
# Embed the PDF documents and index them in an in-memory vector store.
vectorstore = DocArrayInMemorySearch.from_documents(documents=pages, embedding=embeddings)

In [67]:
# Expose the store as a retriever and probe it with a one-word query.
retriever = vectorstore.as_retriever()
probe_query = "Art"
retriever.invoke(probe_query)

[Document(metadata={'producer': 'Skia/PDF m135', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', 'creationdate': '2025-04-10T04:19:39+00:00', 'source': 'data/omeka_pdf.pdf', 'file_path': 'data/omeka_pdf.pdf', 'total_pages': 3, 'format': 'PDF 1.4', 'title': 'Pink Cone · Richard and Carole Cocks Art Museum at Miami University', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-04-10T04:19:39+00:00', 'trapped': '', 'modDate': "D:20250410041939+00'00'", 'creationDate': "D:20250410041939+00'00'", 'page': 2}, page_content="Collection\nGifts of Jeffrey L. Horrell '75 and Rodney F. Rose\nTags\n20th Century, African Oceanic and New World Cultures, African Oceanic and New World Cultures-North\nAmerica, American, Cone, etching, Gampi paper chine colle, Graphic Arts, Graphic Arts-Prints, Hard\nground etching, Ice Cream, Line, Line etching, Lines, North American, Paper, Pink, Print, United States,\nWayne

In [68]:
from operator import itemgetter

# Retrieval-augmented chain: look up supporting documents, fill the prompt,
# then ask the model.
chain = (
    {
        # Search the vector store with the user's *question*. Feeding the
        # "context" input to the retriever (an empty string at the call site)
        # makes retrieval independent of what is actually being asked.
        # Callers may still pass a "context" key; it is simply ignored.
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | model
)


In [69]:
# Ask the retrieval chain a question about the indexed PDF.
rag_inputs = {
    "context": "",
    "question": "What is the name of the painting?",
}
chain.invoke(rag_inputs)

'According to the context, the name of the artwork is "Pink Cone". It\'s a print by Wayne Thiebaud (1920-2021), done through process of hard ground etching printed in pink on gampi paper chine colle.'

# JSON Doc Loading

In [70]:
    
# Baseline: ask about the JSON without supplying it, to see how the bare model
# answers when it has not been given the scraped data.
baseline_question = "What is the title of the first item in the JSON data?"
response = model.invoke(baseline_question)
print(response)

I'm happy to help! However, I don't see any JSON data provided. Please share the JSON data, and I'll be happy to assist you in identifying the title of the first item.


In [71]:
# Read back the scraped records that were written to disk earlier.
with open("data/extracted_data.json", "r", encoding="utf-8") as json_file:
    json_data = json.loads(json_file.read())

In [72]:
from langchain.schema import Document
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_ollama.embeddings import OllamaEmbeddings

# Initialize embeddings
embeddings = OllamaEmbeddings(model="llama3")

# Build the documents from the JSON that was just loaded, not from the stale
# loop variable `data`, which only ever holds a single item.
# The saved file may contain a list of records or one bare record; normalize
# to a list so both shapes are handled.
records = json_data if isinstance(json_data, list) else [json_data]

# One document per item. Not every item has a description, so fall back to the
# title and skip records that have neither instead of raising KeyError.
documents = []
for record in records:
    text = record.get("Description") or record.get("Title")
    if not text:
        continue
    documents.append(Document(page_content=text, metadata=record))

if not documents:
    raise ValueError("No records with a Description or Title to index")

# Create a vector store
vectorstore = DocArrayInMemorySearch.from_documents(
    documents=documents,
    embedding=embeddings,
)

# Example: Retrieve relevant documents
retriever = vectorstore.as_retriever()