In [None]:
!pip install langchain pypdf opensearch-py==2.2.0 -q

## Create embedding class

In [None]:
import json
import os
from getpass import getpass
from typing import List
from urllib.parse import quote

from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler

In [None]:
class EmbeddingsEndpoint(SagemakerEndpointEmbeddings):
    """SageMaker embeddings client that sends texts to the endpoint in small batches."""

    def embed_documents(self, texts: List[str], chunk_size: int = 5) -> List[List[float]]:
        """Embed ``texts`` by calling the endpoint once per batch of texts.

        Args:
            texts: Strings to embed.
            chunk_size: Maximum number of texts sent in a single request.

        Returns:
            The results of ``self._embedding_func`` for every batch, concatenated
            in input order. An empty list when ``texts`` is empty.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            # A zero step would make range() fail and a negative one would
            # silently skip every text, so reject both up front.
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

        results = []
        # Slicing clamps at the end of the list, so the last (or only) batch may
        # simply be shorter than chunk_size; empty input yields no iterations.
        for start in range(0, len(texts), chunk_size):
            results.extend(self._embedding_func(texts[start : start + chunk_size]))
        return results


class EmbeddingsHandler(EmbeddingsContentHandler):
    """Serialize embedding requests to JSON and parse the endpoint's JSON replies."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: List[str], model_kwargs=None) -> bytes:
        """Build the UTF-8 encoded JSON request body.

        Args:
            prompt: Value sent under the ``"text_inputs"`` key. Presumably the
                batch of texts to embed, since ``EmbeddingsEndpoint`` hands
                list slices to the embedding function; confirm against the
                base class.
            model_kwargs: Optional extra top-level keys merged into the payload.
                A ``"text_inputs"`` entry here overrides ``prompt``.

        Returns:
            The JSON document encoded as UTF-8 bytes.
        """
        # None replaces the former shared mutable ``{}`` default.
        payload = {"text_inputs": prompt, **(model_kwargs or {})}
        return json.dumps(payload).encode("utf-8")

    def transform_output(self, output: bytes) -> List[List[float]]:
        """Return the value stored under ``"embedding"`` in the JSON response.

        NOTE(review): ``output`` is annotated as ``bytes`` but ``.read()`` is
        called on it, so a file-like response body is expected; confirm.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        return response_json["embedding"]


# Handler instance that (de)serializes the payloads of the embedding endpoint.
embeddings_content_handler = EmbeddingsHandler()

# Client bound to the text-embedding SageMaker endpoint in us-east-1.
embeddings = EmbeddingsEndpoint(
    content_handler=embeddings_content_handler,
    endpoint_name="jumpstart-dft-hf-textembedding-all-minilm-l6-v2",
    region_name="us-east-1",
)

## Load PDF, split into documents and embed

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.text_splitter import CharacterTextSplitter

In [None]:
# Credentials come from the environment (the password falls back to an
# interactive prompt) so that no secret is stored in the notebook itself.
OPENSEARCH_USERNAME = os.environ.get("OPENSEARCH_USERNAME", "workshop")
OPENSEARCH_PASSWORD = os.environ.get("OPENSEARCH_PASSWORD") or getpass("OpenSearch password: ")
OPENSEARCH_CLUSTER = "search-workshop-lnxmjdzypjzwv6l73uhm2twlbi"
# Percent-encode the credentials so characters such as "@", ":" or "/" cannot
# corrupt the URL they are embedded in.
OPENSEARCH_DOMAIN = (
    f"https://{quote(OPENSEARCH_USERNAME, safe='')}:{quote(OPENSEARCH_PASSWORD, safe='')}"
    f"@{OPENSEARCH_CLUSTER}.us-east-1.es.amazonaws.com"
)
OPENSEARCH_INDEX = "documents-martinig"  # pick/create your own index name

In [None]:
# Name of the PDF to index and its location under ./documents/.
file = "cidadededeus.pdf"
pdf_path = f"./documents/{file}"

In [None]:
# Loader bound to the PDF path chosen above; nothing is read until .load() is called
# (presumably -- confirm against the PyPDFLoader documentation).
loader = PyPDFLoader(pdf_path)

In [None]:
# Read the PDF into a list of documents.
documents = loader.load()
# Preview only the first few entries: echoing the whole list floods the notebook output.
documents[:3]

In [None]:
# Embed the loaded documents with the SageMaker embeddings client and store
# them in the configured OpenSearch index (lucene engine).
docsearch = OpenSearchVectorSearch.from_documents(
    documents=documents,
    embedding=embeddings,
    opensearch_url=OPENSEARCH_DOMAIN,
    index_name=OPENSEARCH_INDEX,
    engine="lucene",
)

In [None]:
# Fetch the 3 nearest documents for the question, keeping only entries whose
# metadata.source matches this PDF.
result_docs = docsearch.similarity_search(
    "qual o nome do filme",
    k=3,
    search_type="approximate_search",
    efficient_filter={
        "bool": {
            "filter": {
                "term": {"metadata.source": "cidadededeus.pdf"},
            },
        },
    },
)
result_docs

## Create LLM class

In [None]:
from langchain.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint
from langchain.chains.question_answering import load_qa_chain

In [None]:
# Generation settings forwarded to the LLM endpoint as its "parameters".
llm_parameters = dict(
    max_new_tokens=2048,
    top_p=0.1,
    temperature=0.7,
)

class LLMHandler(LLMContentHandler):
    """Wrap prompts as a chat request and unwrap the generated reply."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: dict) -> bytes:
        """Serialize ``prompt`` as a one-dialog chat payload, UTF-8 encoded.

        The dialog holds a fixed system turn followed by the user turn, and a
        copy of ``model_kwargs`` is sent under ``"parameters"``.
        """
        # System instruction (in Portuguese): answer from the context, admit
        # when the answer is unknown, and reply in Portuguese.
        system_turn = {
            "role": "system",
            "content": "Você irá responder a pergunta baseado no contexto e se você não sabe diga que não sabe. Responda em português.",
        }
        user_turn = {"role": "user", "content": prompt}
        request = {
            "inputs": [[system_turn, user_turn]],
            "parameters": dict(model_kwargs),
        }
        return json.dumps(request).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Return the generated content of the first entry in the JSON response."""
        body = output.read().decode("utf-8")
        first_result = json.loads(body)[0]
        return first_result["generation"]["content"]


# Handler instance that (de)serializes the payloads of the LLM endpoint.
llm_content_handler = LLMHandler()

# Client bound to the Llama 2 chat SageMaker endpoint in us-east-1.
sm_llm = SagemakerEndpoint(
    endpoint_name="jumpstart-dft-meta-textgeneration-llama-2-7b-f",
    region_name="us-east-1",
    content_handler=llm_content_handler,
    model_kwargs=llm_parameters,
    # Custom attribute declaring acceptance of the model's EULA.
    endpoint_kwargs={"CustomAttributes": "accept_eula=true"},
)

In [None]:
# Question-answering chain of type "stuff" on top of the SageMaker LLM client.
llm_chain = load_qa_chain(llm=sm_llm, chain_type="stuff")

In [None]:
# Ask the question over the retrieved documents; the returned value is shown as the cell output.
llm_chain.run(input_documents=result_docs, question="Qual o título do filme?")

In [None]:
# Same question, this time calling the chain with a dict and keeping the result,
# which is later indexed by "output_text" and "input_documents".
answer = llm_chain({"input_documents": result_docs, "question": "Qual o título do filme?"})

In [None]:
# Display the full result returned by the chain.
answer

In [None]:
resposta = answer["output_text"]
# Cite every document that was handed to the chain, not just the first one.
# PyPDFLoader stores 0-based page indices, so add 1 to get the page number a
# reader would look up. Documents without a "page" entry are skipped.
numeros_de_pagina = sorted(
    {doc.metadata["page"] + 1 for doc in answer["input_documents"] if "page" in doc.metadata}
)
# Falls back to "desconhecida" (unknown) when no page information is available.
paginas = ", ".join(map(str, numeros_de_pagina)) or "desconhecida"
print(f"{resposta} \n A fonte dessa resposta é pg. {paginas}")

In [None]:
from langchain.chains.summarize import load_summarize_chain

In [None]:
# Summarization chain using the "map_reduce" strategy on the same SageMaker LLM client.
chain = load_summarize_chain(llm=sm_llm, chain_type="map_reduce")

In [None]:
# Summarize every document loaded from the PDF (not only the search hits).
# NOTE(review): presumably issues many endpoint calls for a long PDF -- confirm cost.
summary = chain.run(documents)

In [None]:
# Display the generated summary.
summary