# Query Wikiart

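A retrieval-augmented generation (RAG) pipeline over the `jlbaker361/wikiart-subjects` dataset, using AWS Bedrock (Claude 3.5 Sonnet for chat, Titan for embeddings) and a persistent Chroma vector store.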

In [1]:
import os
from typing import List, Optional

import boto3
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain_aws.chat_models.bedrock import ChatBedrock
from langchain_aws.embeddings.bedrock import BedrockEmbeddings
from langchain_chroma import Chroma
from langchain_core.embeddings import Embeddings

load_dotenv()

True

In [2]:
def load_art_dataset(dataset_name: str, subset_size: Optional[int] = None):
    """
    Load the dataset and prepare it for processing

    Args:
        dataset_name: Identifier passed straight to ``load_dataset``.
        subset_size: Keep only the first ``subset_size`` rows of the
            ``train`` split. ``None`` (or ``0``) keeps every row.

    Returns:
        A ``(dataset, df)`` tuple: the object returned by ``load_dataset``
        and a DataFrame of its ``train`` split with an added
        ``combined_text`` column.
    """
    dataset = load_dataset(dataset_name)
    df = dataset['train'].to_pandas()

    if subset_size:
        # Copy so the column added below is written to an independent frame
        # rather than to a slice of the full one (avoids pandas'
        # SettingWithCopyWarning).
        df = df.head(subset_size).copy()

    # Create combined text field for embeddings.
    # Sanitize the `style` field, e.g. art-nouveau-modern > art nouveau modern
    df['combined_text'] = [
        f"Description: {text}\nStyle: {style.replace('-', ' ')}"
        for text, style in zip(df['text'], df['style'])
    ]

    return dataset, df


def setup_aws_client(aws_access_key_id: str, aws_secret_access_key: str, region_name: str = 'us-east-1'):
    """
    Build a Bedrock runtime client from explicit AWS credentials

    Args:
        aws_access_key_id: Access key id handed to ``boto3.Session``.
        aws_secret_access_key: Secret access key handed to ``boto3.Session``.
        region_name: AWS region for the session (defaults to ``us-east-1``).

    Returns:
        The ``bedrock-runtime`` client created from that session.
    """
    session_settings = {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
        "region_name": region_name,
    }
    return boto3.Session(**session_settings).client('bedrock-runtime')


def setup_llm_and_embeddings(client):
    """
    Create the Bedrock chat model and embedding model used by the pipeline

    Args:
        client: Bedrock runtime client shared by both models.

    Returns:
        ``(llm, embeddings)``: a ``ChatBedrock`` instance and a
        ``BedrockEmbeddings`` instance.
    """
    chat_model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"
    embedding_model_id = "amazon.titan-embed-text-v1"

    # Chat model: Claude 3.5 Sonnet on Bedrock, sampled at temperature 0.8
    chat_settings = {"temperature": 0.8}
    llm = ChatBedrock(client=client, model_id=chat_model_id, model_kwargs=chat_settings)

    # Embedding model: Amazon Titan text embeddings
    embeddings = BedrockEmbeddings(client=client, model_id=embedding_model_id)

    return llm, embeddings


def create_vectorstore(
    df: pd.DataFrame,
    embeddings: Embeddings,
    persist_directory: str = "./chroma",
    batch_size: int = 1000,
) -> Chroma:
    """
    Create and populate the vector store

    Each row of ``df`` is stored under its positional index (as a string id),
    so a record can be mapped back to the same row position later. Rows whose
    id is already present in the persisted store are skipped, which makes
    re-running this function cheap.

    Args:
        df: Frame with ``combined_text`` (text to embed) and ``style`` columns.
        embeddings: Embedding function handed to Chroma.
        persist_directory: Directory where Chroma persists its data.
        batch_size: Maximum number of records sent per ``add_texts`` call.
            Chroma limits how many records one write may contain, and
            batching also means that records written before a failure are
            kept and skipped on the next run.

    Returns:
        The Chroma vector store, containing every row of ``df``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    vectorstore = Chroma(
        embedding_function=embeddings,
        persist_directory=persist_directory,
    )

    # include=[] asks only for ids; documents and embeddings are not needed
    # for the de-duplication check.
    current_ids = set(vectorstore.get(include=[])["ids"])
    print(f"Number of records in vectorstore: {len(current_ids)}")

    new_records = [
        (str(position), text, style)
        for position, (text, style) in enumerate(zip(df['combined_text'], df['style']))
        if str(position) not in current_ids
    ]

    if not new_records:
        print("🎉 No new records to add")
        return vectorstore

    print(f"📀 Adding {len(new_records)} new records...")
    for start in range(0, len(new_records), batch_size):
        batch = new_records[start:start + batch_size]
        vectorstore.add_texts(
            texts=[text for _, text, _ in batch],
            metadatas=[{'id': record_id, 'style': style} for record_id, _, style in batch],
            ids=[record_id for record_id, _, _ in batch],
        )

    return vectorstore


def setup_qa_chain(llm, vectorstore):
    """
    Build the conversational retrieval chain used to answer queries

    Args:
        llm: Chat model that generates the answers.
        vectorstore: Store whose retriever supplies the context documents.

    Returns:
        A ``ConversationalRetrievalChain`` that retrieves 5 documents per
        query and includes them in its response.
    """
    top_k_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=top_k_retriever,
        return_source_documents=True,
    )


def query_artwork(query: str, qa_chain, dataset, chat_history: Optional[List] = None):
    """
    Query the artwork database and return results with images

    Args:
        query: Natural-language request from the user.
        qa_chain: Object whose ``invoke`` accepts ``question`` and
            ``chat_history`` and returns a mapping with ``answer`` and
            ``source_documents``.
        dataset: Mapping whose ``'train'`` entry can be indexed by integer row
            position and yields rows with ``'image'`` and ``'style'``.
        chat_history: Previous conversation turns; a fresh empty list is used
            when omitted.

    Returns:
        A dict with the chain's ``answer``, the ``images`` found for the
        retrieved documents as ``(image, style)`` tuples, and the raw
        ``source_documents``.
    """
    if chat_history is None:
        chat_history = []

    enhanced_query = f"""
    Find artworks matching this query: {query}
    Focus on the style and description of the artworks.
    """

    response = qa_chain.invoke({
        "question": enhanced_query,
        "chat_history": chat_history
    })

    # Get images for the retrieved documents
    retrieved_images = []
    for doc in response["source_documents"]:
        raw_id = doc.metadata.get('id')
        try:
            # Parsing the id happens inside the try so that a document with a
            # missing or malformed id is skipped like any other load failure.
            # The row is fetched once and used for both fields.
            row = dataset['train'][int(raw_id)]
            retrieved_images.append((row['image'], row['style']))
        except Exception as e:
            # Deliberately best-effort: one bad record must not discard the
            # answer or the images that did load.
            print(f"Error loading image {raw_id}: {e}")

    return {
        "answer": response["answer"],
        "images": retrieved_images,
        "source_documents": response["source_documents"]
    }


def display_results(result):
    """
    Print the answer and open every retrieved image

    Args:
        result: Mapping with an ``answer`` value and an ``images`` list of
            ``(image, style)`` pairs; each image must provide ``show()``.
    """
    answer_text = result["answer"]
    print("Answer:", answer_text)

    for entry in result["images"]:
        # The style half of the pair is not displayed.
        picture, _unused_style = entry
        picture.show()


In [3]:
# 1. Load the dataset
# `dataset` is kept alongside `df` because the query step looks images up in
# dataset['train'] by row position.
wikiart_dataset = "jlbaker361/wikiart-subjects"
dataset, df = load_art_dataset(wikiart_dataset, subset_size=500)  # Use small subset for testing

In [4]:
# 2. Set up AWS credentials and client
# Credentials are read from the environment (load_dotenv() ran in the first
# cell); os.getenv returns None for a variable that is not set.
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    
client = setup_aws_client(aws_access_key_id, aws_secret_access_key)

In [5]:
# 3. Set up the language model and embeddings
# Both models are created on top of the same Bedrock runtime client.
llm, embeddings = setup_llm_and_embeddings(client)

In [6]:
# 4. Create the vector store
# Rows whose id is already in the persisted store under ./chroma are skipped,
# so re-running this cell only embeds records that are missing.
vectorstore = create_vectorstore(df, embeddings)

Number of records in vectorstore: 0
📀 Adding 500 new records...


In [7]:
# 5. Set up the QA chain
# The chain retrieves from `vectorstore` and returns its source documents,
# which the query cells below use to look up images.
qa_chain = setup_qa_chain(llm, vectorstore)

## Queries

### 1) Featuring women

In [8]:
# No chat_history is passed, so the query runs with an empty conversation history.
query = "What is some art that features women?"
result = query_artwork(query, qa_chain, dataset)
display_results(result)

Answer: Based on the given context, here are some artworks that feature women, focusing on their style and description:

1. Style: Baroque
   Description: A painting of a woman and a man

2. Style: Northern Renaissance
   Description: A painting of a woman surrounded by a group of people

3. Style: Art Nouveau Modern
   Description: A painting of a woman laying on the ground

4. Style: Expressionism
   Description: A painting of a woman standing next to a man

5. Style: High Renaissance
   Description: A painting of a woman with a veil

These artworks span different artistic periods and styles, from Renaissance to Modern art, and feature women in various scenarios and compositions. The descriptions provide a brief glimpse into the subject matter of each painting, showing women in different contexts - alone, with others, or in specific poses or settings.


### 2) Religious art

In [9]:
# No chat_history is passed, so the query runs with an empty conversation history.
query = "I'd like to see some religious art"
result = query_artwork(query, qa_chain, dataset)
display_results(result)

Answer: Based on the given context, here are some artworks that match your query for religious art, focusing on their style and description:

1. Style: Art Nouveau Modern
   Description: A painting of Jesus holding a scroll

2. Style: Expressionism
   Description: A painting of a church in the middle of a town

3. Style: Northern Renaissance
   Description: A drawing of a crowd in a church

4. Style: Northern Renaissance
   Description: The Adoration of the Cross, featuring multiple people (the description lists "person" multiple times)

5. Style: Northern Renaissance
   Description: A statue of an angel holding a book

These artworks all have religious themes or subjects, ranging from depictions of Jesus and angels to church settings and religious events like the Adoration of the Cross. The styles vary from Art Nouveau Modern to Expressionism, with a particular emphasis on Northern Renaissance style for three of the five artworks mentioned.


### 3) Renaissance art

In [10]:
# No chat_history is passed, so the query runs with an empty conversation history.
query = "Paintings from the renaissance"
result = query_artwork(query, qa_chain, dataset)
display_results(result)

Answer: Based on the provided context, I can identify several paintings from the Renaissance period that match your query:

1. A painting described as "a painting of a man in a robe and hat" in the Early Renaissance style.

2. A painting depicting "two women in renaissance dress" in the Northern Renaissance style.

3. A painting showing "a woman and two men in a room" in the Early Renaissance style.

4. Two paintings described simply as "the painting of person" - one in the High Renaissance style and another in the Early Renaissance style.

These artworks showcase different aspects of Renaissance art, including portraiture and scenes with multiple figures. They also represent various sub-periods and regional styles within the Renaissance, such as Early Renaissance, High Renaissance, and Northern Renaissance. The descriptions, though brief, give us a sense of the clothing and settings typical of Renaissance paintings.


### 4) Expressionist art featuring nature

In [11]:
# Combines a style constraint (expressionism) with a subject constraint
# (nature); the query text is both embedded for retrieval and sent to the LLM.
query = "I'd like to see some expressionist art that features nature, like animals or landscapes."
result = query_artwork(query, qa_chain, dataset)
display_results(result)

Answer: Based on the given context, I can suggest a few artworks that match your query for expressionist art featuring nature:

1. "A drawing of a mountain scene with a tree and mountains in the background"
   Style: Expressionism
   This artwork features a natural landscape with mountains and trees, fitting your request for nature-themed expressionist art.

2. "A painting of people walking through a forest"
   Style: Expressionism
   This piece includes a forest setting, which aligns with your interest in natural landscapes in expressionist art.

3. "A painting of a man and a dog in a field"
   Style: Expressionism
   This artwork features both an animal (a dog) and a natural setting (a field), matching your request for expressionist art with nature elements.

These three artworks are described as being in the expressionist style and feature various aspects of nature, including landscapes, forests, animals, and outdoor scenes. They should provide a good representation of expressionist