In [1]:

import os
import openai
import pinecone
import pandas as pd
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [None]:
# Verify the OpenAI key is available before any model is constructed.
# Assigning None into os.environ raises an opaque TypeError, so a missing key
# is reported here with an explicit, actionable message instead.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your environment or .env file."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key
print("OPENAI_API_KEY has been set!")

In [None]:
# Chat model handle used for every completion in this notebook. The non-zero
# temperature means answers may differ between runs.
llm = ChatOpenAI(temperature=0.7, model_name="gpt-4")

In [None]:
# Prompt skeleton with two named placeholders: {context} and {question}.
prompt_template = """
Answer the following QUESTION based on the CONTEXT given. If you do not know the answer and the CONTEXT doesn't contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

# Hand-written context passage for the first, retrieval-free demonstration.
context = """
Managed Spot Training can be used with all instances supported in Amazon SageMaker. Managed Spot Training is supported in all AWS Regions where Amazon SageMaker is currently available.
"""

query = "Which instances can I use with Managed Spot Training in SageMaker?"

# Fill both placeholders to obtain the final prompt text.
text_input = prompt_template.format_map({"context": context, "question": query})

In [None]:
# Send the filled-in prompt to the chat model and show its reply.
result = llm.predict(text_input)
print(result)

In [None]:
# Sanity check: a question the supplied context does not cover, to see whether
# the model follows the "I don't know" instruction in the prompt.
unanswerable_question = "What color is the sea?"

# Fill the template with str.format, matching how the first prompt is built.
# Chained str.replace would also rewrite a literal "{question}" that happened
# to appear inside the context text.
text_input = prompt_template.format(context=context, question=unanswerable_question)

out = llm.predict(text_input)

print(out)

In [None]:
# Load the knowledge base. The file is read without a header row, so the
# column names are supplied explicitly.
df_knowledge = pd.read_csv(
    "data_pinecone.csv",
    names=["Question", "Answer"],
    header=None,
)
df_knowledge.head()

In [None]:
# Remove the question column and keep the rest of the frame.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell re-runnable: a second execution no longer raises KeyError
# for the already-removed "Question" column.
df_knowledge = df_knowledge.drop(columns=["Question"], errors="ignore")
df_knowledge.head()

In [None]:
# Read the Pinecone credentials from the environment and open the client
# session with them.
api_key = os.environ.get('PINECONE_API_KEY')
env = os.environ.get('PINECONE_ENV')

pinecone.init(api_key=api_key, environment=env)

In [None]:
# Show the indexes visible to this Pinecone account/environment.
pinecone.list_indexes()

In [None]:
# Hand the API key to the openai client module, read from the environment.
openai.api_key = os.getenv('OPENAI_API_KEY')

def get_embeddings(texts, model="text-embedding-ada-002"):
    """Embed one or more texts with the OpenAI embeddings endpoint.

    Args:
        texts: A single string or an iterable of strings to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A list with one embedding vector per input text, in input order.
        A single string yields a one-element list; an empty iterable yields
        an empty list without calling the API.
    """
    # Normalise a lone string so the request and the result are always
    # list-shaped.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to embed: skip the network round-trip entirely.
    if not texts:
        return []

    responses = openai.Embedding.create(input=texts, model=model)

    # Each returned item carries the position of its input; sort on it so the
    # vectors line up with `texts` regardless of the response ordering.
    ordered = sorted(responses['data'], key=lambda item: item['index'])
    return [item['embedding'] for item in ordered]

text = "Your text here"
# get_embeddings returns a list of vectors (one per input), so take the first
# entry; otherwise len() would report the number of inputs (1) rather than
# the length of the embedding vector.
embedding_vector = get_embeddings(text)[0]
print(len(embedding_vector))


In [None]:
from tqdm.auto import tqdm

# Target index and ingestion settings.
index_name = "freetruth"
vector_limit = 1000
batch_size = 1

index = pinecone.Index(index_name)
# Only the first `vector_limit` rows are embedded and uploaded.
answers = df_knowledge[:vector_limit]
n_answers = len(answers)

for i in tqdm(range(0, n_answers, batch_size)):
    # Clamp the final batch to the end of the data.
    i_end = min(i + batch_size, n_answers)

    batch = answers["Answer"][i:i_end]
    texts = batch.tolist()

    # The raw answer text is stored as metadata next to its vector; ids are
    # the row positions rendered as strings.
    metadatas = [{'text': text} for text in batch]
    ids = list(map(str, range(i, i_end)))

    embeddings = get_embeddings(texts)

    # One (id, vector, metadata) triple per row in the batch.
    records = zip(ids, embeddings, metadatas)
    index.upsert(vectors=records)

In [None]:
# Display the index statistics as a quick check after the upload loop.
index.describe_index_stats()

In [None]:
# Question to answer from retrieved context.
question = 'Which instances can I use with Managed Spot Training in SageMaker?'

# get_embeddings returns a list of vectors; take the single entry for the
# one question string.
query_vec = get_embeddings(question)[0]

# Ask the index for the top 5 matches and include their stored metadata.
res = index.query(query_vec, top_k=5, include_metadata=True)

# Bare expression so the notebook renders the raw query response.
res

In [None]:
# Pull the stored 'text' metadata out of every returned match.
contexts = [hit.metadata['text'] for hit in res.matches]

In [None]:
from typing import List, Optional

# Notebook-level defaults for context assembly.
max_section_len = 1000
separator = "\n"


def construct_context(
    contexts: List[str],
    max_len: Optional[int] = None,
    sep: Optional[str] = None,
) -> str:
    """Join the leading context sections that fit within a character budget.

    Sections are taken in the given order, stripped of surrounding
    whitespace, and joined with ``sep``. Selection stops at the first section
    that would push the joined text past ``max_len`` characters, so a first
    section that is too long on its own yields an empty string. A summary of
    the selection is printed.

    Args:
        contexts: Candidate sections, in the order they should be considered.
        max_len: Maximum length of the joined text in characters. Defaults to
            the module-level ``max_section_len``.
        sep: String placed between sections. Defaults to the module-level
            ``separator``.

    Returns:
        The selected sections joined by ``sep``.
    """
    # Resolve defaults at call time so edits to the notebook-level settings
    # take effect without redefining the function.
    if max_len is None:
        max_len = max_section_len
    if sep is None:
        sep = separator

    chosen_sections = []
    chosen_sections_len = 0

    for text in contexts:
        text = text.strip()
        # A separator is only emitted between sections, so the first section
        # costs just its own length; the cost uses the real separator length.
        cost = len(text) + (len(sep) if chosen_sections else 0)
        if chosen_sections_len + cost > max_len:
            break
        chosen_sections.append(text)
        chosen_sections_len += cost

    concatenated_doc = sep.join(chosen_sections)
    print(
        f"With maximum sequence length {max_len}, selected top {len(chosen_sections)} document sections: \n{concatenated_doc}"
    )
    return concatenated_doc

In [None]:
# Collapse the retrieved passages into one context string for the prompt.
context_str = construct_context(contexts=contexts)

In [None]:
# Build the final prompt from the retrieved context. str.format is used,
# matching how the first prompt is built; chained str.replace would also
# rewrite a literal "{question}" occurring inside the retrieved text.
text_input = prompt_template.format(context=context_str, question=question)

out = llm.predict(text_input)
print(out)
