In [1]:
import pandas as pd
import numpy as np
import requests
import os
import json  


In [2]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file


# Project Description

### 1.- Defining Variables

The cell below contains code that searches for an artist using different resources of the Harvard Art Museum API - 

In [3]:
key  = os.environ['HARVARD_API_KEY']
artist = "Cezanne"



The cell below will use the title to find an object with the artist name

In [4]:
r = requests.get(f'https://api.harvardartmuseums.org/object?person={artist}&apikey={key}')
data = r.json()

In [5]:
# Extract the info and records
info = data['info']
records = data['records']

In [15]:
for i in records:
    print(i)
    


{'copyright': None, 'contextualtextcount': 0, 'creditline': 'Harvard Art Museums/Fogg Museum, Bequest of Mary Gershinowitz', 'accesslevel': 1, 'dateoflastpageview': '2023-11-13', 'classificationid': 21, 'division': 'European and American Art', 'markscount': 11, 'publicationcount': 2, 'totaluniquepageviews': 474, 'contact': 'am_europeanamerican@harvard.edu', 'colorcount': 3, 'rank': 124223, 'id': 97993, 'state': None, 'verificationleveldescription': 'Best. Object is extensively researched, well described and information is vetted', 'period': None, 'images': [{'date': '2018-08-20', 'copyright': 'President and Fellows of Harvard College', 'imageid': 491094, 'idsid': 457568629, 'format': 'image/jpeg', 'description': None, 'technique': 'Make:Hasselblad;Model:Hasselblad H5D-50c MS;Orientation:1;Software:Adobe Photoshop CS6 (Macintosh);', 'renditionnumber': '768158', 'displayorder': 1, 'baseimageurl': 'https://nrs.harvard.edu/urn-3:HUAM:768158', 'alttext': None, 'width': 2550, 'publiccaption'

## Results with Provenance

The code below will provide you with a dataframe (like an excel file with the artist information)

In [6]:
artist = "Cezanne"

In [7]:
def results_provenance(url, df=pd.DataFrame()):
    r = requests.get(url)
    data = r.json()
    info = data['info']
    records = data['records']
    
    # Extract relevant data from each record
    record_data = []
    for record in records:
        record_data.append({
            'title': record['title'],
            'classification': record['classification'],
            'century': record['century'],
            'provenance': record['provenance']
        })
    
    # Append record data to DataFrame
    df = pd.concat([df, pd.DataFrame(record_data)], ignore_index=True)
    
    # Recursively call pagination function for next page, if it exists
    if 'next' in info:
        return results_provenance(info['next'], df)
    else:
        return df

In [8]:
url = f'https://api.harvardartmuseums.org/object?person={artist}&apikey={key}'
df = results_provenance(url)

In [9]:
len(df)

32

## Results with no Provenance

In [10]:
def results(url, df=pd.DataFrame()):
    r = requests.get(url)
    data = r.json()
    info = data['info']
    records = data['records']
    
    # Extract relevant data from each record
    record_data = []
    for record in records:
        record_data.append({
            'title': record['title'],
            'classification': record['classification'],
            'century': record['century']
        })
    
    # Append record data to DataFrame
    df = pd.concat([df, pd.DataFrame(record_data)], ignore_index=True)
    
    # Recursively call pagination function for next page, if it exists
    if 'next' in info:
        return results(info['next'], df)
    else:
        return df

In [11]:
url = f'https://api.harvardartmuseums.org/object?person={artist}&apikey={key}'
results(url)

Unnamed: 0,title,classification,century
0,Tree Trunks,Drawings,19th century
1,Forest Interior,Drawings,19th century
2,House Among Trees,Drawings,19th century
3,Portrait of a Man (Emile Zola?); verso: Study ...,Drawings,19th century
4,Study of Trees,Paintings,19th century
5,Still Life with Game Birds,Paintings,19th century
6,Jules Peyron,Paintings,19th century
7,Plaster Cupid,Paintings,19th century
8,Small Houses in Pontoise,Paintings,19th century
9,The Small Bathers,Prints,19th century


## Download Images 

The function below will download all images for a particular artist, now it could also be that is not only the artist, so we have to do some API research, but that is your job, you need to know what you want to show

You simply have to modify the artist

In [21]:
artist = "Cezanne"

In [22]:

def download_artist_paintings(artist,key):
    # Set up API endpoint and parameters
    endpoint = "https://api.harvardartmuseums.org/object"
    params = {
        "apikey": key,
        "person": artist,
        "classification": "Paintings",
    }

    # Send API request and extract relevant data from JSON response
    response = requests.get(endpoint, params=params)
    if response.status_code == 200:
        data = response.json()["records"]
    else:
        print(f"Error: {response.status_code}")
        data = []

    # Create directory for images if it doesn't exist
    directory_name = artist.lower().replace(' ', '_') + '_paintings'
    if not os.path.exists(directory_name):
        os.makedirs(directory_name)

    # Download and save images
    for record in data:
        object_number = record["objectnumber"]
        image_url = record["primaryimageurl"]
        if image_url:
            response = requests.get(image_url)
            if response.status_code == 200:
                image_data = response.content
                with open(f"{directory_name}/{object_number}.jpg", "wb") as f:
                    f.write(image_data)
                    print(f"Saved image for {object_number}")
            else:
                print(f"Error downloading image for {object_number}")
        else:
            print(f"No image found for {object_number}")

In [23]:
download_artist_paintings(artist,key)

Saved image for 1998.305
Saved image for 1976.70
Saved image for 1961.144
Saved image for 1964.72
Saved image for 1934.28
Saved image for 1951.46


In [1]:
## Models Prompts and Parsers

In [60]:
import openai
openai.api_key = os.environ['OPENAI_API_KEY']

In [61]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{"role":"user", "content":prompt}]
    response = openai.ChatCompletion.create(
        model = model,
        messages=messages,
        temperature = 0,
        
    )
    return response.choices[0].message["content"]

In [27]:
get_completion("What is 1+1")

'1+1 equals 2.'

## LangChain

In [14]:
from langchain.chat_models import ChatOpenAI

### Model

In [28]:
# To control the randomness and creativity of the generated
# text by an LLM, use temperature = 0.0
chat = ChatOpenAI(temperature=0.0)
chat

ChatOpenAI(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, client=<class 'openai.api_resources.chat_completion.ChatCompletion'>, model_name='gpt-3.5-turbo', temperature=0.0, model_kwargs={}, openai_api_key='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', openai_api_base='', openai_organization='', openai_proxy='', request_timeout=None, max_retries=6, streaming=False, n=1, max_tokens=None, tiktoken_model_name=None)

## Prompt Template

In [30]:
customer_email = """
Arrr, I be fuming that me blender lid \
flew off and splattered me kitchen walls \
with smoothie! And to make matters worse,\
the warranty don't cover the cost of \
cleaning up me kitchen. I need yer help \
right now, matey!
"""

In [31]:
style = """American English \
in a calm and respectful tone
"""

In [32]:
prompt = f"""Translate the text \
that is delimited by triple backticks 
into a style that is {style}.
text: ```{customer_email}```
"""

print(prompt)

Translate the text that is delimited by triple backticks 
into a style that is American English in a calm and respectful tone
.
text: ```
Arrr, I be fuming that me blender lid flew off and splattered me kitchen walls with smoothie! And to make matters worse,the warranty don't cover the cost of cleaning up me kitchen. I need yer help right now, matey!
```



In [33]:
template_string = """Translate the text \
that is delimited by triple backticks \
into a style that is {style}. \
text: ```{text}```
"""

In [35]:
from langchain.prompts import ChatPromptTemplate

prompt_template = ChatPromptTemplate.from_template(template_string)


In [36]:
prompt_template.messages[0].prompt

PromptTemplate(input_variables=['style', 'text'], output_parser=None, partial_variables={}, template='Translate the text that is delimited by triple backticks into a style that is {style}. text: ```{text}```\n', template_format='f-string', validate_template=True)

In [37]:
prompt_template.messages[0].prompt.input_variables

['style', 'text']

In [38]:
customer_style = """American English \
in a calm and respectful tone
"""

In [39]:
customer_email = """
Arrr, I be fuming that me blender lid \
flew off and splattered me kitchen walls \
with smoothie! And to make matters worse, \
the warranty don't cover the cost of \
cleaning up me kitchen. I need yer help \
right now, matey!
"""

In [40]:
customer_messages = prompt_template.format_messages(
                    style=customer_style,
                    text=customer_email)

In [41]:
print(type(customer_messages))
print(type(customer_messages[0]))

<class 'list'>
<class 'langchain.schema.messages.HumanMessage'>


In [42]:
print(customer_messages[0])

content="Translate the text that is delimited by triple backticks into a style that is American English in a calm and respectful tone\n. text: ```\nArrr, I be fuming that me blender lid flew off and splattered me kitchen walls with smoothie! And to make matters worse, the warranty don't cover the cost of cleaning up me kitchen. I need yer help right now, matey!\n```\n" additional_kwargs={} example=False


In [43]:
# Call the LLM to translate to the style of the customer message
customer_response = chat(customer_messages)

In [44]:
print(customer_response.content)

I'm really frustrated that my blender lid flew off and made a mess of my kitchen walls with smoothie! And to make things even worse, the warranty doesn't cover the cost of cleaning up my kitchen. I could really use your help right now, my friend!


## Retrieval Augmented Generation

In [10]:
import os
import openai
import sys
sys.path.append('../..')
import panel as pn  # GUI
pn.extension()

In [4]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("C://Users/Francisco.Colina/Documents/Code/ArtPrompt/documents/forged1.pdf")
pages = loader.load()

In [5]:
len(pages)

8

In [6]:
page = pages[0]

In [16]:
print(page.page_content[0:1000])

Detecting Fraud And Forgery In Papers And Documents
The art of detecting forgery or fraud, in checks, drafts, documents,
seals, writing materials, or in the characters themselves is a study
that has attracted handwriting experts since its study was taken up.
There are almost infallible rules for the work and in this chapter is
given several new methods of research that will prove of the utmost
value to the public.
It is not an uncommon occurrence that wills and other public documents
are changed by the insertion of extra or substituted pages, thereby
changing the character of the instrument. Where this is suspected
careful inspection of the paper should be made--first, as to its shade
of color and fiber, under a microscope; second, as to its ruling;
third, as to its water-mark; fourth, as to any indications that the
sheets have been separated since their original attachment; fifth, as
to the writing--whether or not it bears the harmonious character of
the continuous writing, with the s

## Document Splitting

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [19]:
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [30]:
docs = text_splitter.split_documents(pages)

In [21]:
len(docs)

15

In [22]:
len(pages)

8

## Token Splitting

In [36]:
from langchain.text_splitter import TokenTextSplitter

In [37]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

In [38]:
docs = text_splitter.split_documents(pages)

In [39]:
docs[0]

Document(page_content='Detecting Fraud And Forgery In Papers And Documents', metadata={'source': 'C://Users/Francisco.Colina/Documents/Code/ArtPrompt/documents/forged1.pdf', 'page': 0})

In [40]:
pages[0].metadata

{'source': 'C://Users/Francisco.Colina/Documents/Code/ArtPrompt/documents/forged1.pdf',
 'page': 0}

In [41]:
# Load PDF
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader("C://Users/Francisco.Colina/Documents/Code/ArtPrompt/documents/forged1.pdf"),
    PyPDFLoader("C://Users/Francisco.Colina/Documents/Code/ArtPrompt/documents/congress.pdf"),
    PyPDFLoader("C://Users/Francisco.Colina/Documents/Code/ArtPrompt/documents/dictionary.pdf"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [43]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [44]:
splits = text_splitter.split_documents(docs)

In [45]:
len(splits)

852