In [23]:
import os
import openai
import pinecone
import pandas as pd
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
load_dotenv()


True

In [24]:
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
print("OPENAI_API_KEY has been set!")

OPENAI_API_KEY has been set!


In [25]:
llm = ChatOpenAI(temperature=0.7, model_name="gpt-4")

In [26]:
query = "Which instances can I use with Managed Spot Training in SageMaker?"

context = """
Managed Spot Training can be used with all instances supported in Amazon SageMaker. Managed Spot Training is supported in all AWS Regions where Amazon SageMaker is currently available.
"""
prompt_template = """
Answer the following QUESTION based on the CONTEXT given. If you do not know the answer and the CONTEXT doesn't contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

text_input = prompt_template.format(context=context, question=query)

In [27]:
result = llm.predict(text_input)
print(result)

You can use all instances supported in Amazon SageMaker with Managed Spot Training.


In [28]:
unanswerable_question = "What color is the sea?"

text_input = prompt_template.replace("{context}", context).replace("{question}", unanswerable_question)

out = llm.predict(text_input)

print(out)

I don't know.


In [29]:
df_knowledge = pd.read_csv("data_pinecone.csv", header=None, names=["Question", "Answer"])
df_knowledge.head()

Unnamed: 0,Question,Answer
0,What is Amazon SageMaker?,Amazon SageMaker is a fully managed service to...
1,In which Regions is Amazon SageMaker available...,For a list of the supported Amazon SageMaker A...
2,What is the service availability of Amazon Sag...,Amazon SageMaker is designed for high availabi...
3,How does Amazon SageMaker secure my code?,Amazon SageMaker stores code in ML storage vol...
4,What security measures does Amazon SageMaker h...,Amazon SageMaker ensures that ML model artifac...


In [30]:
df_knowledge.drop(["Question"], axis=1, inplace=True)
df_knowledge.head()

Unnamed: 0,Answer
0,Amazon SageMaker is a fully managed service to...
1,For a list of the supported Amazon SageMaker A...
2,Amazon SageMaker is designed for high availabi...
3,Amazon SageMaker stores code in ML storage vol...
4,Amazon SageMaker ensures that ML model artifac...


In [31]:
api_key = os.getenv('PINECONE_API_KEY')
env = os.getenv('PINECONE_ENV')

pinecone.init(
    api_key=api_key,
    environment=env
)

In [32]:
pinecone.list_indexes()

['freetruth']

In [33]:
openai.api_key = os.getenv('OPENAI_API_KEY')

def get_embeddings(texts, model="text-embedding-ada-002"):
    # Get embeddings for each text in the list
    responses = openai.Embedding.create(input=texts, model=model)

    # Extract embedding vectors from the response
    embeddings = [response['embedding'] for response in responses['data']]

    return embeddings

text = "Your text here"
embedding_vector = get_embeddings(text)
print(len(embedding_vector))


1


In [34]:
from tqdm.auto import tqdm

index_name = "freetruth"

batch_size = 1
vector_limit = 1000

answers = df_knowledge[:vector_limit]
index = pinecone.Index(index_name)

for i in tqdm(range(0, len(answers), batch_size)):

    i_end = min(i+batch_size, len(answers))

    ids = [str(x) for x in range(i, i_end)]

    metadatas = [{'text': text} for text in answers["Answer"][i:i_end]]
    texts = answers["Answer"][i:i_end].tolist()
    embeddings = get_embeddings(texts)
    
    records = zip(ids, embeddings, metadatas)
    
    index.upsert(vectors=records)

100%|██████████| 154/154 [01:23<00:00,  1.84it/s]


In [35]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00154,
 'namespaces': {'': {'vector_count': 154}},
 'total_vector_count': 154}

In [36]:
question = 'Which instances can I use with Managed Spot Training in SageMaker?'

query_vec = get_embeddings(question)[0]

res = index.query(query_vec, top_k=5, include_metadata=True)

res

{'matches': [{'id': '90',
              'metadata': {'text': 'Managed Spot Training can be used with all '
                                   'instances supported in Amazon '
                                   'SageMaker.\r\n'},
              'score': 0.930336177,
              'values': []},
             {'id': '87',
              'metadata': {'text': 'Managed Spot Training uses Amazon EC2 Spot '
                                   'instances for training, and these '
                                   'instances can be pre-empted when AWS needs '
                                   'capacity. As a result, Managed Spot '
                                   'Training jobs can run in small increments '
                                   'as and when capacity becomes available. '
                                   'The training jobs need not be restarted '
                                   'from scratch when there is an '
                                   'interruption, as Amazon SageMake

In [37]:
contexts = [match.metadata['text'] for match in res.matches]

In [38]:
from typing import List

max_section_len = 1000
separator = "\n"

def construct_context(contexts: List[str]) -> str:
    chosen_sections = []
    chosen_sections_len = 0

    for text in contexts:
        text = text.strip()
        # Add contexts until we run out of space.
        chosen_sections_len += len(text) + 2
        if chosen_sections_len > max_section_len:
            break
        chosen_sections.append(text)
    concatenated_doc = separator.join(chosen_sections)
    print(
        f"With maximum sequence length {max_section_len}, selected top {len(chosen_sections)} document sections: \n{concatenated_doc}"
    )
    return concatenated_doc

In [39]:
context_str = construct_context(contexts=contexts)

With maximum sequence length 1000, selected top 2 document sections: 
Managed Spot Training can be used with all instances supported in Amazon SageMaker.
Managed Spot Training uses Amazon EC2 Spot instances for training, and these instances can be pre-empted when AWS needs capacity. As a result, Managed Spot Training jobs can run in small increments as and when capacity becomes available. The training jobs need not be restarted from scratch when there is an interruption, as Amazon SageMaker can resume the training jobs using the latest model checkpoint. The built-in frameworks and the built-in computer vision algorithms with SageMaker enable periodic checkpoints, and you can enable checkpoints with custom models.


In [41]:
text_input = prompt_template.replace("{context}", context_str).replace("{question}", question)

out = llm.predict(text_input)
print(out)


You can use all instances that are supported in Amazon SageMaker with Managed Spot Training.
