In [None]:
import pandas as pd
import numpy as np
import requests
import os
import json  


In [12]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file


# Project Description

### 1.- Defining Variables

The cell below contains code that searches for an artist using different resources of the Harvard Art Museum API - 

In [49]:
key  = os.environ['HARVARD_API_KEY']
artist = "Cezanne"

The cell below will use the title to find an object with the artist name

In [50]:
r = requests.get(f'https://api.harvardartmuseums.org/object?person={artist}&apikey={key}')
data = r.json()

In [51]:
# Extract the info and records
info = data['info']
records = data['records']

## Results with Provenance

The code below will provide you with a dataframe (like an excel file with the artist information)

In [52]:
artist = "Cezanne"

In [53]:
def results_provenance(url, df=pd.DataFrame()):
    r = requests.get(url)
    data = r.json()
    info = data['info']
    records = data['records']
    
    # Extract relevant data from each record
    record_data = []
    for record in records:
        record_data.append({
            'title': record['title'],
            'classification': record['classification'],
            'century': record['century'],
            'provenance': record['provenance']
        })
    
    # Append record data to DataFrame
    df = pd.concat([df, pd.DataFrame(record_data)], ignore_index=True)
    
    # Recursively call pagination function for next page, if it exists
    if 'next' in info:
        return results_provenance(info['next'], df)
    else:
        return df

In [55]:
url = f'https://api.harvardartmuseums.org/object?person={artist}&apikey={key}'
df = results_provenance(url)

In [56]:
len(df)

32

## Results with no Provenance

In [57]:
def results(url, df=pd.DataFrame()):
    r = requests.get(url)
    data = r.json()
    info = data['info']
    records = data['records']
    
    # Extract relevant data from each record
    record_data = []
    for record in records:
        record_data.append({
            'title': record['title'],
            'classification': record['classification'],
            'century': record['century']
        })
    
    # Append record data to DataFrame
    df = pd.concat([df, pd.DataFrame(record_data)], ignore_index=True)
    
    # Recursively call pagination function for next page, if it exists
    if 'next' in info:
        return results(info['next'], df)
    else:
        return df

In [58]:
url = f'https://api.harvardartmuseums.org/object?person={artist}&apikey={key}'
results(url)

Unnamed: 0,title,classification,century
0,"Rocky Landscape, after Cézanne",Drawings,20th century
1,Tree Trunks,Drawings,19th century
2,Forest Interior,Drawings,19th century
3,House Among Trees,Drawings,19th century
4,Portrait of a Man (Emile Zola?); verso: Study ...,Drawings,19th century
5,Study of Trees,Paintings,19th century
6,Still Life with Game Birds,Paintings,19th century
7,Jules Peyron,Paintings,19th century
8,Plaster Cupid,Paintings,19th century
9,Small Houses in Pontoise,Paintings,19th century


## Download Images 

The function below will download all images for a particular artist, now it could also be that is not only the artist, so we have to do some API research, but that is your job, you need to know what you want to show

You simply have to modify the artist

In [21]:
artist = "Cezanne"

In [22]:

def download_artist_paintings(artist,key):
    # Set up API endpoint and parameters
    endpoint = "https://api.harvardartmuseums.org/object"
    params = {
        "apikey": key,
        "person": artist,
        "classification": "Paintings",
    }

    # Send API request and extract relevant data from JSON response
    response = requests.get(endpoint, params=params)
    if response.status_code == 200:
        data = response.json()["records"]
    else:
        print(f"Error: {response.status_code}")
        data = []

    # Create directory for images if it doesn't exist
    directory_name = artist.lower().replace(' ', '_') + '_paintings'
    if not os.path.exists(directory_name):
        os.makedirs(directory_name)

    # Download and save images
    for record in data:
        object_number = record["objectnumber"]
        image_url = record["primaryimageurl"]
        if image_url:
            response = requests.get(image_url)
            if response.status_code == 200:
                image_data = response.content
                with open(f"{directory_name}/{object_number}.jpg", "wb") as f:
                    f.write(image_data)
                    print(f"Saved image for {object_number}")
            else:
                print(f"Error downloading image for {object_number}")
        else:
            print(f"No image found for {object_number}")

In [23]:
download_artist_paintings(artist,key)

Saved image for 1998.305
Saved image for 1976.70
Saved image for 1961.144
Saved image for 1964.72
Saved image for 1934.28
Saved image for 1951.46


In [1]:
## Models Prompts and Parsers

In [3]:
!pip install openai
!pip install os
!pip install dotenv
!pip install langchain
!pip install chroma
!pip install tiktoken

Collecting openai
  Using cached openai-0.27.8-py3-none-any.whl (73 kB)
Collecting tqdm (from openai)
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting aiohttp (from openai)
  Using cached aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
Collecting multidict<7.0,>=4.5 (from aiohttp->openai)
  Using cached multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
Collecting async-timeout<5.0,>=4.0.0a3 (from aiohttp->openai)
  Using cached async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->openai)
  Using cached yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (268 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->openai)
  Using cached frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (225 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->openai)
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB

In [4]:
import openai
import os
openai.api_key = os.environ['OPENAI_API_KEY']

In [5]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{"role":"user", "content":prompt}]
    response = openai.ChatCompletion.create(
        model = model,
        messages=messages,
        temperature = 0,
        
    )
    return response.choices[0].message["content"]

In [6]:
get_completion("What is 1+1")

'1+1 equals 2.'

## LangChain

In [9]:
!pip install langchain

Collecting langchain
  Using cached langchain-0.0.253-py3-none-any.whl (1.4 MB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Using cached SQLAlchemy-2.0.19-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Using cached dataclasses_json-0.5.14-py3-none-any.whl (26 kB)
Collecting langsmith<0.1.0,>=0.0.11 (from langchain)
  Using cached langsmith-0.0.19-py3-none-any.whl (31 kB)
Collecting numexpr<3.0.0,>=2.8.4 (from langchain)
  Using cached numexpr-2.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (381 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain)
  Using cached openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
Collecting pydantic<2,>=1 (from langchain)
  Using cached pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.6.0,>=0.5.7->langchain)
  Using cached marshmallow-3.2

In [10]:
from langchain.chat_models import ChatOpenAI

### Model

In [11]:
# To control the randomness and creativity of the generated
# text by an LLM, use temperature = 0.0
chat = ChatOpenAI(temperature=0.0)
chat

ChatOpenAI(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, client=<class 'openai.api_resources.chat_completion.ChatCompletion'>, model_name='gpt-3.5-turbo', temperature=0.0, model_kwargs={}, openai_api_key='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', openai_api_base='', openai_organization='', openai_proxy='', request_timeout=None, max_retries=6, streaming=False, n=1, max_tokens=None, tiktoken_model_name=None)

## Prompt Template

In [12]:
customer_email = """
Arrr, I be fuming that me blender lid \
flew off and splattered me kitchen walls \
with smoothie! And to make matters worse,\
the warranty don't cover the cost of \
cleaning up me kitchen. I need yer help \
right now, matey!
"""

In [13]:
style = """American English \
in a calm and respectful tone
"""

In [14]:
prompt = f"""Translate the text \
that is delimited by triple backticks 
into a style that is {style}.
text: ```{customer_email}```
"""

print(prompt)

Translate the text that is delimited by triple backticks 
into a style that is American English in a calm and respectful tone
.
text: ```
Arrr, I be fuming that me blender lid flew off and splattered me kitchen walls with smoothie! And to make matters worse,the warranty don't cover the cost of cleaning up me kitchen. I need yer help right now, matey!
```



In [15]:
template_string = """Translate the text \
that is delimited by triple backticks \
into a style that is {style}. \
text: ```{text}```
"""

In [16]:
from langchain.prompts import ChatPromptTemplate

prompt_template = ChatPromptTemplate.from_template(template_string)


In [17]:
prompt_template.messages[0].prompt

PromptTemplate(input_variables=['style', 'text'], output_parser=None, partial_variables={}, template='Translate the text that is delimited by triple backticks into a style that is {style}. text: ```{text}```\n', template_format='f-string', validate_template=True)

In [18]:
prompt_template.messages[0].prompt.input_variables

['style', 'text']

In [19]:
customer_style = """American English \
in a calm and respectful tone
"""

In [20]:
customer_email = """
Arrr, I be fuming that me blender lid \
flew off and splattered me kitchen walls \
with smoothie! And to make matters worse, \
the warranty don't cover the cost of \
cleaning up me kitchen. I need yer help \
right now, matey!
"""

In [21]:
customer_messages = prompt_template.format_messages(
                    style=customer_style,
                    text=customer_email)

In [22]:
print(type(customer_messages))
print(type(customer_messages[0]))

<class 'list'>
<class 'langchain.schema.messages.HumanMessage'>


In [23]:
print(customer_messages[0])

content="Translate the text that is delimited by triple backticks into a style that is American English in a calm and respectful tone\n. text: ```\nArrr, I be fuming that me blender lid flew off and splattered me kitchen walls with smoothie! And to make matters worse, the warranty don't cover the cost of cleaning up me kitchen. I need yer help right now, matey!\n```\n" additional_kwargs={} example=False


In [24]:
# Call the LLM to translate to the style of the customer message
customer_response = chat(customer_messages)

In [25]:
print(customer_response.content)

I'm really frustrated that my blender lid flew off and made a mess of my kitchen walls with smoothie! And to make things even worse, the warranty doesn't cover the cost of cleaning up my kitchen. I could really use your help right now, my friend!


## Retrieval Augmented Generation

In [27]:
!pip install pypdf

Collecting pypdf
  Using cached pypdf-3.14.0-py3-none-any.whl (269 kB)
Installing collected packages: pypdf
Successfully installed pypdf-3.14.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [28]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("documents/forged1.pdf")
pages = loader.load()

In [29]:
len(pages)

8

In [30]:
page = pages[0]

In [31]:
print(page.page_content[0:1000])

Detecting Fraud And Forgery In Papers And Documents
The art of detecting forgery or fraud, in checks, drafts, documents,
seals, writing materials, or in the characters themselves is a study
that has attracted handwriting experts since its study was taken up.
There are almost infallible rules for the work and in this chapter is
given several new methods of research that will prove of the utmost
value to the public.
It is not an uncommon occurrence that wills and other public documents
are changed by the insertion of extra or substituted pages, thereby
changing the character of the instrument. Where this is suspected
careful inspection of the paper should be made--first, as to its shade
of color and fiber, under a microscope; second, as to its ruling;
third, as to its water-mark; fourth, as to any indications that the
sheets have been separated since their original attachment; fifth, as
to the writing--whether or not it bears the harmonious character of
the continuous writing, with the s

## Document Splitting

In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [33]:
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [34]:
docs = text_splitter.split_documents(pages)

In [35]:
len(docs)

15

In [36]:
len(pages)

8

## Token Splitting

In [37]:
from langchain.text_splitter import TokenTextSplitter

In [38]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

In [39]:
docs = text_splitter.split_documents(pages)

In [40]:
docs[0]

Document(page_content='Detecting Fraud And Forgery In Papers And Documents', metadata={'source': 'documents/forged1.pdf', 'page': 0})

In [41]:
pages[0].metadata

{'source': 'documents/forged1.pdf', 'page': 0}

In [42]:
# Load PDF
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader("documents/forged1.pdf"),
    PyPDFLoader("documents/congress.pdf"),
    PyPDFLoader("documents/dictionary.pdf"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [43]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [44]:
splits = text_splitter.split_documents(docs)

In [45]:
len(splits)

852

## Vectors an Embeddings

In [50]:
!pip install chromadb

Collecting chromadb
  Using cached chromadb-0.4.5-py3-none-any.whl (402 kB)
Collecting chroma-hnswlib==0.7.2 (from chromadb)
  Using cached chroma_hnswlib-0.7.2-cp310-cp310-linux_x86_64.whl
Collecting fastapi<0.100.0,>=0.95.2 (from chromadb)
  Using cached fastapi-0.99.1-py3-none-any.whl (58 kB)
Collecting uvicorn[standard]>=0.18.3 (from chromadb)
  Using cached uvicorn-0.23.2-py3-none-any.whl (59 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Using cached posthog-3.0.1-py2.py3-none-any.whl (37 kB)
Collecting pulsar-client>=3.1.0 (from chromadb)
  Using cached pulsar_client-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Using cached onnxruntime-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
Collecting tokenizers>=0.13.2 (from chromadb)
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting pypika>=0.48.9 (from chromadb)
  Usin

In [51]:
from langchain.vectorstores import Chroma

In [52]:
persist_directory = 'docs/chroma/'

In [53]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [54]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

RuntimeError: [91mYour system has an unsupported version of sqlite3. Chroma requires sqlite3 >= 3.35.0.[0m
[94mPlease visit https://docs.trychroma.com/troubleshooting#sqlite to learn how to upgrade.[0m