In [13]:
import os
import queue
import threading
from pprint import pprint

import pinecone
import openai
import streamlit as st
import tiktoken
from defaults import MODEL, GITHUB_BASE_URL, ROOT_DIR
from dotenv import load_dotenv
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from IPython.display import Markdown
from langchain.document_loaders import UnstructuredHTMLLoader



# Load environment variables from a .env file. OPENAI_API_KEY is read below;
# PINECONE_API_KEY is read from the environment when Pinecone is initialised.
load_dotenv()

# Hand the OpenAI key to the `openai` client. `os.environ.get` yields None
# (instead of raising) when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")


In [14]:
# Configure the Pinecone client. Indexing `os.environ` raises KeyError when
# PINECONE_API_KEY is not set; the environment name is hard-coded here.
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment="us-west1-gcp-free")

In [38]:
# Vector store used by every retrieval cell below: the "hpe-sec" Pinecone index
# and namespace, with queries embedded through OpenAIEmbeddings.
# NOTE(review): the `pinecone.create_index("hpe-sec", ...)` call appears in a
# later cell, so on a fresh top-to-bottom run this cell presumably requires the
# index to exist already - confirm.
DB = Pinecone(
        index=pinecone.Index("hpe-sec"),
        embedding_function=OpenAIEmbeddings().embed_query,
        text_key="text",
        namespace="hpe-sec",
    )


In [None]:
# Smoke test: display the first result returned for a generic query.
DB.similarity_search("HPE SEC filings")[0]

# Chunking Data

In [26]:
# NOTE(review): leftover interactive source lookup (IPython `??`); remove before sharing.
??UnstructuredHTMLLoader

In [27]:
import pathlib

In [31]:
# Scratch check of `Path.stem` (the file name without its final suffix).
# NOTE(review): hard-coded absolute path from a local machine; only the string
# is parsed here, the file itself is not opened.
pathlib.Path("/Users/garrett/28775_28776_logs.txt").stem

'28775_28776_logs'

In [32]:
# NOTE(review): leftover interactive source lookup (IPython `??`); remove before sharing.
??OpenAIEmbeddings

In [33]:
# Embed a sample sentence so the returned vector can be inspected.
text = "This is a test query."
query_result = OpenAIEmbeddings().embed_query(text)

In [36]:
# Length of the embedding vector; the recorded value matches the `dimension`
# passed to `pinecone.create_index`.
len(query_result)

1536

In [37]:
# Create the "hpe-sec" index only when it does not exist yet, so this cell can
# be re-run (Restart & Run All) without failing on an already-existing index.
# 1536 is the embedding length measured with `len(query_result)`.
if "hpe-sec" not in pinecone.list_indexes():
    pinecone.create_index("hpe-sec", dimension=1536)

In [24]:
# Load the HPE 10-K HTML filing and split it with chunk_size=1000 and no overlap.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
loader = UnstructuredHTMLLoader("HPE_10-K_000162828018015054.html")
split_docs = loader.load_and_split(splitter)
# `docs` is a fresh list holding exactly the chunks of this one filing.
docs = list(split_docs)

In [25]:
# Peek at the first chunk in `docs`.
docs[0]

Document(page_content="Table of Contents\n\nUNITED STATES\n\nSECURITIES AND EXCHANGE COMMISSION\n\nWashington, D.C. 20549\n\nFORM 10-K\n\n(Mark One)\n\nANNUAL REPORT PURSUANT TO SECTION\xa013 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF\xa01934\n\nFor the fiscal year ended October\xa031, 2018\n\nOr\n\nTRANSITION REPORT PURSUANT TO SECTION\xa013 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF\xa01934\n\nFor the transition period from\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0to\n\nCommission file number\xa0001-37483\n\nHEWLETT PACKARD ENTERPRISE COMPANY\n\n(Exact name of registrant as specified in its charter)\n\nDelaware\n\n47-3298624\n\n(State or other jurisdiction ofincorporation or organization)\n\n(I.R.S. employeridentification no.)\n\n3000 Hanover Street, Palo Alto, California\n\n94304\n\n(Address of principal executive offices)\n\n(Zip code)\n\nRegistrant's telephone number, including area code:\n\n(650)\xa0687-5817\n\nSecurities registered pursuant to Section\xa012(b) of the Act:\

In [19]:
# NOTE(review): `data` is not assigned in any visible cell, so this cell relies
# on leftover kernel state and will presumably raise NameError on a fresh run.
data

[Document(page_content='Table of Contents\n\nUNITED STATES\n\nSECURITIES AND EXCHANGE COMMISSION\n\nWashington, D.C. 20549\n\nFORM 10-K\n\n(Mark One)\n\nANNUAL REPORT PURSUANT TO SECTION\xa013 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF\xa01934\n\nFor the fiscal year ended October\xa031, 2018\n\nOr\n\nTRANSITION REPORT PURSUANT TO SECTION\xa013 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF\xa01934\n\nFor the transition period from\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0to\n\nCommission file number\xa0001-37483\n\nHEWLETT PACKARD ENTERPRISE COMPANY\n\n(Exact name of registrant as specified in its charter)\n\nDelaware\n\n47-3298624\n\n(State or other jurisdiction ofincorporation or organization)\n\n(I.R.S. employeridentification no.)\n\n3000 Hanover Street, Palo Alto, California\n\n94304\n\n(Address of principal executive offices)\n\n(Zip code)\n\nRegistrant\'s telephone number, including area code:\n\n(650)\xa0687-5817\n\nSecurities registered pursuant to Section\xa012(b) of the Act

In [2]:
# Extensions (lower-case, without the leading dot) that are never loaded as text.
_SKIPPED_EXTENSIONS = frozenset(
    {
        "png",
        "jpg",
        "ico",
        "svg",
        "bin",
        "woff2",
        "woff",
        "h5",
        "pkl",
        "path",
        "index",
        "ttf",
        "gif",
        "jpeg",
        "otf",
        "zip",
        "idx",
        "pt",
        "pth",
        "meta",
        "eot",
        "map",
    }
)


def load_docs(root_dir, github_base_url=None, splitter=None, skip_extensions=None):
    """Walk ``root_dir`` and load every non-skipped file as split text documents.

    Args:
        root_dir: Directory that is walked recursively with ``os.walk``.
        github_base_url: Optional URL prefix. When given, each document gets a
            ``metadata["url"]`` made of this prefix plus the file path with the
            leading ``root_dir`` characters removed.
        splitter: Passed unchanged to ``TextLoader.load_and_split``.
        skip_extensions: Optional iterable of extensions (with or without a
            leading dot) to skip instead of the built-in default list.

    Returns:
        A flat list with the split documents of every file that loaded
        successfully.

    Files that fail to load (for example because they are not valid UTF-8) are
    reported on stdout and skipped; they never abort the walk.
    """
    if skip_extensions is None:
        skipped = _SKIPPED_EXTENSIONS
    else:
        skipped = frozenset(ext.lower().lstrip(".") for ext in skip_extensions)

    docs = []
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for file in filenames:
            # The "extension" is the text after the last dot, compared
            # case-insensitively so e.g. "LOGO.PNG" is skipped like "logo.png".
            if file.split(".")[-1].lower() in skipped:
                continue
            path = os.path.join(dirpath, file)
            try:
                loader = TextLoader(path, encoding="utf-8")
                split_docs = loader.load_and_split(splitter)
                if github_base_url is not None:
                    # Same URL for every chunk of the file; URLs always use "/"
                    # even when the OS path separator differs.
                    url = github_base_url + path[len(root_dir) :].replace(
                        os.sep, "/"
                    )
                    for doc in split_docs:
                        doc.metadata["url"] = url
                docs.extend(split_docs)
            except Exception as e:
                # Deliberate best effort: one unreadable file must not stop the walk.
                print(
                    f"Exception {e} raised for file {file} but ignored; continuing."
                )
    return docs

In [3]:
# Load and chunk the files under ROOT_DIR, tagging each chunk with a URL built
# from GITHUB_BASE_URL.
# NOTE(review): this rebinds `docs`, replacing the list built from the HPE
# filing in the earlier cell.
docs = load_docs(
    ROOT_DIR,
    GITHUB_BASE_URL,
    RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0),
)


Exception 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte raised for file determined-keras-model but ignored; continuing.
Exception 'utf-8' codec can't decode byte 0x86 in position 3: invalid start byte raised for file saved_model.pb but ignored; continuing.
Exception 'utf-8' codec can't decode byte 0x86 in position 3: invalid start byte raised for file saved_model.pb but ignored; continuing.
Exception 'utf-8' codec can't decode byte 0xb7 in position 1: invalid start byte raised for file determined-keras-model-weights.data-00000-of-00001 but ignored; continuing.
Exception 'utf-8' codec can't decode byte 0x83 in position 18: invalid start byte raised for file determined-keras-model-weights.data-00000-of-00001 but ignored; continuing.
Exception 'utf-8' codec can't decode byte 0xb7 in position 1: invalid start byte raised for file determined-keras-model-weights.data-00000-of-00001 but ignored; continuing.
Exception 'utf-8' codec can't decode byte 0xfd in position 10

In [4]:
# Spot-check one chunk (content plus `source` / `url` metadata).
pprint(docs[100])

Document(page_content='self.clip_grads_fn = None\n        if self.cfg.optimizer_config.grad_clip is not None:\n            self.clip_grads_fn = lambda x: torch.nn.utils.clip_grad_norm_(\n                x,\n                self.cfg.optimizer_config.grad_clip.max_norm,\n                self.cfg.optimizer_config.grad_clip.norm_type,\n            )\n\n        # mmdet sets loggers in the package that interrupt with Determined logging.\n        # We reset the root logger after mmdet models are initialized.\n        set_logger(bool(self.context.get_experiment_config().get("debug", False)))\n\n    def build_mmdet_config(self) -> mmcv.Config:\n        """\n        Apply overrides to the mmdet config according to the following experiment config fields:\n        - data.file_client_args\n        - hyperparameters.merge_config\n        - hyperparameters.override_mmdet_config.', metadata={'source': 'determined/model_hub/model_hub/mmdetection/_trial.py', 'url': 'https://github.com/determined-ai/dete

In [5]:
# Total number of chunks produced by `load_docs`.
len(docs)

29406

Uploading vectorized docs (the DeepLake upload in the next cell is commented out and does not run):

In [6]:
# embeddings = OpenAIEmbeddings()
# deep_lake_path = os.environ.get("DEEPLAKE_DATASET_PATH")
# db = DeepLake(dataset_path=deep_lake_path, embedding_function=embeddings)
# db.add_documents(docs)


# Vector DB

The `DB` object is the vector database: the LangChain `Pinecone` vector store created near the top of this notebook.

In [7]:
# Retrieve the ten nearest chunks for a question about `report_completed`.
# NOTE(review): the recorded output lists Determined source files, while `DB`
# is defined against the "hpe-sec" index - the output may come from an earlier
# kernel state; confirm.
pprint(DB.similarity_search("what does `report_completed` do?", k=10))

[Document(page_content='def report_completed(self, searcher_metric: Any) -> None:\n        """\n        ``report_completed()`` is the final step of a train-validate-report cycle.', metadata={'source': 'determined/harness/determined/core/_searcher.py', 'url': 'https://github.com/determined-ai/determined/blob/main/harness/determined/core/_searcher.py'}),
 Document(page_content='``report_completed()`` requires the value of the metric you are searching over.  This value\n        is typically the output of the "validate" step of the train-validate-report cycle.\n        In most cases `searcher_metric` should be a `float` but custom search methods\n        may use any json-serializable type as searcher metric.\n        """\n        if not self._is_chief:\n            raise RuntimeError("you must only call op.report_completed() from the chief worker")\n        if self._completed:\n            raise RuntimeError("you may only call op.report_completed() once")\n        self._completed = True\n 

# Prompt Engineering

Step 1: Retrieve the documents most relevant to the query.

In [8]:
# The user question that is carried through the rest of the notebook.
query = "What does `report_validation_metrics` do? What is the signature of this method?  Can I call `report_validation_metrics` twice on the same step?"
pprint(query)

('What does `report_validation_metrics` do? What is the signature of this '
 'method?  Can I call `report_validation_metrics` twice on the same step?')


In [10]:
def get_context_from_query(query: str, k: int = 5) -> str:
    """Return the ``k`` most similar stored chunks formatted as one context string.

    Each chunk is rendered as a header line naming its source file (and its
    GitHub URL when the chunk has one) followed by the chunk text; chunks are
    separated by a blank line.

    Args:
        query: Text passed to ``DB.similarity_search``.
        k: Number of chunks to retrieve.

    Returns:
        The joined context string; empty when the search returns no chunks.

    Raises:
        KeyError: If a returned chunk has no ``"source"`` metadata entry.
    """
    docs = DB.similarity_search(query, k)

    sections = []
    for d in docs:
        header = f'From file {d.metadata["source"]}'
        # "url" is optional: it is only attached when documents were loaded
        # with a GitHub base URL, so do not fail on chunks that lack it.
        url = d.metadata.get("url")
        if url is not None:
            header += f" (github url {url})"
        sections.append(header + ":\n" + str(d.page_content))

    return "\n\n".join(sections)

In [11]:
# Build the retrieval context for `query` (default number of chunks) and show it.
context = get_context_from_query(query)
pprint(context)

('From file determined/docs/training/apis-howto/api-core-ug.rst (github url '
 'https://github.com/determined-ai/determined/blob/main/docs/training/apis-howto/api-core-ug.rst):\n'
 '.. literalinclude:: ../../../examples/tutorials/core_api/1_metrics.py\n'
 '      :language: python\n'
 '      :pyobject: main\n'
 '\n'
 '   The ``report_validation_metrics()`` call typically happens after the '
 'validation step, however,\n'
 '   actual validation is not demonstrated by this example.\n'
 '\n'
 '#. Create a ``1_metrics.yaml`` file with an ``entrypoint`` invoking the new '
 '``1_metrics.py`` file.\n'
 '   You can copy the ``0_start.yaml`` configuration file and change the first '
 'couple of lines:\n'
 '\n'
 '   .. literalinclude:: ../../../examples/tutorials/core_api/1_metrics.yaml\n'
 '      :language: yaml\n'
 '      :lines: 1-2\n'
 '\n'
 '#. Run the code using the command:\n'
 '\n'
 '   .. code:: bash\n'
 '\n'
 '      det e create 1_metrics.yaml . -f\n'
 '\n'
 '#. You can now navigate to 

In [12]:
def get_system_prompt_with_context(context: str) -> str:
    """Return the system prompt with ``context`` filled into its placeholder.

    The prompt consists of fixed instructions about answering questions on the
    Determined AI GitHub repo, an example Markdown code fence, and finally the
    supplied ``context`` after the ``Context:`` label.
    """
    # Every prompt line after the first carries this eight-space indent.
    pad = " " * 8
    prompt_lines = (
        "Given the following context and code, answer the following question about the Determined AI Github Repo (url: http://www.github.com/determined-ai/determined/). Do not use outside context, and do not assume the user can see the provided context. Try to be as detailed as possible and reference the components that you are looking at. Keep in mind that these are only code snippets, and more snippets may be added during the conversation.",
        pad
        + "Do not generate code, only reference the exact code snippets that you have been provided with. If you are going to write code, make sure to specify the language of the code and write the result in markdown. For example, if you were writing Python, you would write the following:",
        "",
        pad + "```python",
        pad + "<python code goes here>",
        pad + "```",
        pad,
        pad + "Now, here is the relevant context: ",
        "",
        pad + "Context: {context}",
        pad,
    )
    return "\n".join(prompt_lines).format(context=context)


def get_context_from_prompt(prompt: str, k: int = 2) -> str:
    """Return the ``k`` most similar stored chunks for ``prompt`` as one string.

    Same output format as ``get_context_from_query`` but with a smaller default
    ``k``: each chunk is a header line naming its source file (and its GitHub
    URL when the chunk has one) followed by the chunk text, with chunks
    separated by a blank line.

    Args:
        prompt: Text passed to ``DB.similarity_search``.
        k: Number of chunks to retrieve.

    Returns:
        The joined context string; empty when the search returns no chunks.

    Raises:
        KeyError: If a returned chunk has no ``"source"`` metadata entry.
    """
    docs = DB.similarity_search(prompt, k)

    sections = []
    for d in docs:
        header = f'From file {d.metadata["source"]}'
        # "url" is optional: it is only attached when documents were loaded
        # with a GitHub base URL, so do not fail on chunks that lack it.
        url = d.metadata.get("url")
        if url is not None:
            header += f" (github url {url})"
        sections.append(header + ":\n" + str(d.page_content))

    return "\n\n".join(sections)


In [13]:
# Combine the fixed instructions with the retrieved context and show the result.
system_prompt = get_system_prompt_with_context(context)
pprint(system_prompt)

('Given the following context and code, answer the following question about '
 'the Determined AI Github Repo (url: '
 'http://www.github.com/determined-ai/determined/). Do not use outside '
 'context, and do not assume the user can see the provided context. Try to be '
 'as detailed as possible and reference the components that you are looking '
 'at. Keep in mind that these are only code snippets, and more snippets may be '
 'added during the conversation.\n'
 '        Do not generate code, only reference the exact code snippets that '
 'you have been provided with. If you are going to write code, make sure to '
 'specify the language of the code and write the result in markdown. For '
 'example, if you were writing Python, you would write the following:\n'
 '\n'
 '        ```python\n'
 '        <python code goes here>\n'
 '        ```\n'
 '        \n'
 '        Now, here is the relevant context: \n'
 '\n'
 '        Context: From file '
 'determined/docs/training/apis-howto/api-core-

# Getting Answers, Finally

The chat model takes a list of typed messages: `SystemMessage` (instructions and context), `HumanMessage` (the user's question), and `AIMessage` (a model reply).

In [14]:
# Message list for the chat model: instructions plus context first, then the question.
full_prompt = [SystemMessage(content=system_prompt), HumanMessage(content=query)] 

In [15]:
# Chat model client; the model name comes from `defaults.MODEL`.
llm = ChatOpenAI(
            model_name=MODEL,
            temperature=0.7,
        )

In [16]:
# Send the messages to the chat model and keep only the reply's `content`.
answer = llm(full_prompt).content

In [17]:
# Render the reply as Markdown in the cell output.
display(Markdown(answer))

`report_validation_metrics` is a method that reports validation metrics to the system. The signature of this method is:

```python
def report_validation_metrics(self, steps_completed: int, metrics: Dict[str, Any]) -> None:
```

`steps_completed` is an integer parameter that indicates the number of training steps completed when the validation metrics are reported. `metrics` is a dictionary of validation metrics to report.

It is not recommended to call `report_validation_metrics` twice on the same step. If you do so, the system may interpret the second call as overwriting the first rather than reporting additional information.