**Import Packages**

In [88]:
import os
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)
import PyPDF2
import pandas as pd
import textwrap
import pinecone
from tqdm.auto import tqdm
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from dotenv import load_dotenv
import streamlit as st

**Set up the OpenAI API key and chat model**

In [89]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail early with a readable message when the key is missing. Assigning the
# result of os.getenv() straight back into os.environ raises an opaque
# "str expected, not NoneType" TypeError when the variable is unset, and is a
# no-op when it is set, so the round-trip is replaced by an explicit check.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it or add it to your .env file."
    )

# Chat model used for every completion call in this notebook.
chat = ChatOpenAI(
    openai_api_key=openai_api_key,
    model='gpt-3.5-turbo'
)

In [66]:
# OpenAIEmbeddings is already imported in the notebook's import cell, so the
# duplicate import that used to live here has been dropped.

# Name of the OpenAI embedding model used to vectorise both documents and queries.
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

embed_model = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

**Load the PDF and extract it to a text file**

In [64]:
pdf_path = 'MORBO.pdf'

# Read every page while the PDF handle is still open. A page for which
# extract_text() returns a falsy value contributes an empty string.
with open(pdf_path, 'rb') as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    pdf_text = ''.join(
        (page.extract_text() or '') for page in reader.pages
    )

# Persist the raw text so later cells can reload it without re-parsing the PDF.
with open('extracted_text.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(pdf_text)

print(f"Extracted text length: {len(pdf_text)} characters")


Extracted text length: 89062 characters


**Set up Pinecone as the vector database**

In [23]:
# Import the Pinecone *client* under an alias. The import cell at the top of
# the notebook binds the bare name `Pinecone` to langchain's vector-store
# class, and the vector-store cell further down relies on that binding.
# Importing the client as plain `Pinecone` here would shadow it and break
# that cell on a top-to-bottom run.
from pinecone import Pinecone as PineconeClient

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.getenv("PINECONE_API_KEY")

# configure client
pc = PineconeClient(api_key=api_key)

In [65]:
from pinecone import ServerlessSpec

# Deployment target passed to create_index() when the index has to be created.
spec = ServerlessSpec(cloud="aws", region="us-east-1")

**Create an index to store the vectors**

In [25]:
import time

index_name = 'llama-2-rag'

# Collect the names of all indexes the client currently reports.
existing_indexes = list(
    info["name"] for info in pc.list_indexes()
)

# First run only: create the index, then poll once per second until it
# reports itself as ready.
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Obtain a handle to the index and pause briefly before querying its stats.
index = pc.Index(index_name)
time.sleep(1)

# Displayed as the cell output.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

**Split the extracted text into chunks and load them into a DataFrame**

In [67]:
# Reload the text written by the extraction cell.
with open('extracted_text.txt', 'r', encoding='utf-8') as file:
    pdf_text = file.read()

# Maximum number of characters per chunk.
chunk_size = 1000
chunks = textwrap.wrap(pdf_text, width=chunk_size)

n_chunks = len(chunks)

# One row per chunk. The doi/source/title columns hold placeholder values;
# replace them with real metadata if it becomes available.
data = pd.DataFrame({
    'text': chunks,
    'doi': ['doi_value'] * n_chunks,
    'chunk-id': range(n_chunks),  # sequential chunk ids
    'source': ['source_value'] * n_chunks,
    'title': ['title_value'] * n_chunks,
})

print(data.head())


                                                text        doi  chunk-id  \
0  Multi-Objective Bayesian Optimization over Hig...  doi_value         0   
1  coordinated strategy. We show that MORBO signi...  doi_value         1   
2  maximizing the number of common gauge parts [K...  doi_value         2   
3  general, and sample-efﬁcient approach for “bla...  doi_value         3   
4  groundwater remediation [Akhtar and Shoemaker,...  doi_value         4   

         source        title  
0  source_value  title_value  
1  source_value  title_value  
2  source_value  title_value  
3  source_value  title_value  
4  source_value  title_value  


**Process text and add to Pinecone**

In [69]:
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Number of chunks embedded and upserted per request.
batch_size = 100

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i + batch_size)
    # Get batch of data
    batch = data.iloc[i:i_end]
    # Build the "<doi>-<chunk-id>" ids straight from the two columns instead
    # of materialising every row with iterrows().
    ids = [
        f"{doi}-{chunk_id}"
        for doi, chunk_id in zip(batch['doi'], batch['chunk-id'])
    ]
    # Get text to embed
    texts = batch['text'].tolist()
    # Embed text
    embeds = embed_model.embed_documents(texts)
    # Metadata stored alongside each vector: one dict per row with the
    # 'text', 'source' and 'title' fields.
    metadata = batch[['text', 'source', 'title']].to_dict('records')
    # Pass a concrete list rather than a one-shot zip iterator so the payload
    # can be inspected, measured, or retried safely.
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

  0%|          | 0/1 [00:00<?, ?it/s]

In [76]:
# Import langchain's vector-store class under an explicit alias. The bare name
# `Pinecone` is also used by the Pinecone client import earlier in the
# notebook, so which class it refers to depends on cell execution order.
# The alias makes this cell unambiguous.
from langchain.vectorstores import Pinecone as PineconeVectorStore

text_field = "text"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = PineconeVectorStore(
    index, embed_model.embed_query, text_field
)



**Prompting**

In [80]:
query = "What is MORBO?"

# Fetch the three stored chunks most similar to the query; the list is the
# cell's displayed output.
top_matches = vectorstore.similarity_search(query, k=3)
top_matches

[Document(page_content='from most existing methods. The signiﬁcance of MORBO is that it is the ﬁrst multi- objective BO method that scales to hundreds of tunable parameters and thousands of evaluations, a setting where practitioners have previously had to fall back on alterna- tive methods with much lower sample-efﬁciency, such as NSGA-II. Our comprehensive evaluation demonstrates that MORBO yields order-of-magnitude savings in terms of time and resources compared to state-of-the-art methods on chal- lenging high-dimensional multi-objective problems. 2 BACKGROUND 2.1 PRELIMINARIES 2.1.1 Multi-Objective Optimization In multi-objective optimization (MOO), the goal is to max- imize (without loss of generality) a vector-valued objec- tive functionf(x) = [f(1)(x);:::;f(M)(x)]2RM, where M\x152while satisfying black-box constraints g(x)\x1502 RVwhereV\x150,x2X\x1a Rd, andXis a compact set. Usually, there is no single solution x\x03that simultaneously maximizes all Mobjectives and satisﬁes all

In [81]:
def augment_prompt(query: str, k: int = 3) -> str:
    """Build a retrieval-augmented prompt for ``query``.

    Looks up the ``k`` most similar documents in the module-level
    ``vectorstore`` and embeds their ``page_content`` in a prompt that asks
    the model to answer ``query`` using those contexts.

    Args:
        query: The user's question; used both for retrieval and in the prompt.
        k: Number of documents to retrieve. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        The augmented prompt text.
    """
    # get the top-k results from the knowledge base
    results = vectorstore.similarity_search(query, k=k)
    # get the text from the results, one context per line
    source_knowledge = "\n".join(doc.page_content for doc in results)
    # The four-space indentation in front of "Contexts:", the first context
    # and "Query:" is kept on purpose so the prompt text is unchanged from
    # the original triple-quoted template.
    return (
        "Using the contexts below, answer the query.\n"
        "\n"
        "    Contexts:\n"
        f"    {source_knowledge}\n"
        "\n"
        f"    Query: {query}"
    )

In [82]:
# Show the exact prompt text that will be sent to the chat model.
augmented_text = augment_prompt(query)
print(augmented_text)

Using the contexts below, answer the query.

    Contexts:
    from most existing methods. The signiﬁcance of MORBO is that it is the ﬁrst multi- objective BO method that scales to hundreds of tunable parameters and thousands of evaluations, a setting where practitioners have previously had to fall back on alterna- tive methods with much lower sample-efﬁciency, such as NSGA-II. Our comprehensive evaluation demonstrates that MORBO yields order-of-magnitude savings in terms of time and resources compared to state-of-the-art methods on chal- lenging high-dimensional multi-objective problems. 2 BACKGROUND 2.1 PRELIMINARIES 2.1.1 Multi-Objective Optimization In multi-objective optimization (MOO), the goal is to max- imize (without loss of generality) a vector-valued objec- tive functionf(x) = [f(1)(x);:::;f(M)(x)]2RM, where M2while satisfying black-box constraints g(x)02 RVwhereV0,x2X Rd, andXis a compact set. Usually, there is no single solution xthat simultaneously maximizes all Mobj

In [83]:
# Seed the conversation: a system instruction followed by a short example
# exchange. Later cells append further prompts to this list.
messages = [SystemMessage(content="You are a helpful assistant.")]
messages.extend([
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content="I'd like to understand about Entropy in Machine Learning. give latex code"),
])

In [84]:
# Wrap the retrieval-augmented prompt as a human turn.
rag_prompt_text = augment_prompt(query)
prompt = HumanMessage(content=rag_prompt_text)

# Extend the shared history in place; later cells build on this list.
# NOTE: re-running this cell appends the prompt again.
messages += [prompt]

res = chat(messages)

print(res.content)

MORBO is a multi-objective Bayesian optimization method that is significant because it is the first method that can scale to handling hundreds of tunable parameters and thousands of evaluations. It outperforms existing methods in terms of time and resources, providing order-of-magnitude savings on challenging high-dimensional multi-objective problems. MORBO incorporates a collaborative multi-trust-region approach with scalable local modeling, allowing it to tackle high-dimensional problems and high-throughput settings effectively. It significantly advances the state-of-the-art in sample efficiency and provides practitioners with substantial improvements over existing methods such as evolutionary algorithms.


In [85]:
# Follow-up question, asked on top of the existing history without mutating it.
prompt = HumanMessage(content="what is bo?")

followup_messages = [*messages, prompt]
res = chat(followup_messages)
print(res.content)

In the context provided, BO stands for Bayesian Optimization. It is a technique used in optimization problems to find the optimal set of parameters for a given objective function while minimizing the number of evaluations needed. Bayesian Optimization uses probabilistic models to approximate the objective function and guide the search for the best set of parameters.
