## Imports

In [50]:
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [192]:
try:
    import pymupdf as fitz  # available with v1.24.3
except ImportError:
    import fitz

from fitz import Document as FitzDocument
import tensorflow_hub as hub
import numpy as np
from sklearn.neighbors import NearestNeighbors
from openai import OpenAI
from IPython.display import display, Markdown, JSON

import json
import os
import re
from pprint import pprint
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

# from llama_index.readers.file import PyMuPDFReader

from dotenv import load_dotenv
load_dotenv()

True

## Load data

In [58]:
! cp ../backend/data/*.pdf ./pdfs/

In [59]:
pdf_path="./pdfs/Outerbounds-ML-Jan30-Reduced.pdf"

In [60]:
doc = fitz.open(pdf_path)
assert doc.is_pdf

In [61]:
print(f"Number of pages: {doc.page_count}")
print(f"Metadata: ", end='')
pprint(doc.metadata)

Number of pages: 18
Metadata: {'author': '',
 'creationDate': "D:20240130113640-08'00'",
 'creator': 'Acrobat Pro 23.8.20470',
 'encryption': None,
 'format': 'PDF 1.7',
 'keywords': '',
 'modDate': "D:20240130113744-08'00'",
 'producer': 'Acrobat Pro 23.8.20470',
 'subject': '',
 'title': '',
 'trapped': ''}


In [62]:
pprint(doc.get_toc())

[[1, 'Cover', 1],
 [1, '1', 2],
 [1, '2', 3],
 [1, '3', 4],
 [1, '4', 5],
 [1, '5', 6],
 [1, '6', 7],
 [1, '7', 8],
 [1, '8', 9],
 [1, '9', 10],
 [1, '10', 11],
 [1, '11', 12],
 [1, '12', 13],
 [1, '13', 14],
 [1, '14', 15],
 [1, '15', 16],
 [1, '16', 17],
 [1, 'Back', 18]]


## Convert to text

In [158]:
def preprocess(text):
    """Flatten ``text`` onto a single line.

    Newlines are turned into spaces and every run of whitespace is then
    squeezed down to one space. Leading and trailing whitespace is squeezed
    as well, but it is not stripped.
    """
    without_newlines = text.replace('\n', ' ')
    return re.sub(r'\s+', ' ', without_newlines)


def pdf_to_text(
    path,
    start_page=1,
    end_page=None
):
    """Extract the plain text of a range of pages from a PDF.

    Args:
        path: Location of the PDF, passed straight to ``fitz.open``.
        start_page: 1-based number of the first page to read.
        end_page: 1-based number of the last page to read (inclusive).
            ``None`` means "through the last page of the document".

    Returns:
        A list with one dict per page, in page order:
        ``{'content': <whitespace-normalised text>, 'page': <1-based page number>}``.
    """
    doc = fitz.open(path)
    # try/finally guarantees the document handle is released even when a page
    # fails to load or text extraction raises part-way through the range.
    try:
        if end_page is None:
            end_page = doc.page_count
        text_list = []
        # Pages are addressed 0-based by load_page, hence the -1 on the start;
        # the exclusive upper bound makes `end_page` itself inclusive.
        for i in range(start_page - 1, end_page):
            text = preprocess(doc.load_page(i).get_text("text"))
            text_list.append({'content': text, 'page': i + 1})
        return text_list
    finally:
        doc.close()


def text_to_chunks(
    texts,
    word_length=150,
    start_page=1
):
    """Split per-page text records into chunks of about ``word_length`` words.

    Each page's content is split on single spaces and cut into consecutive
    groups of ``word_length`` words. A trailing group that is shorter than
    ``word_length`` is not emitted on its own page; it is prepended to the
    next page's words so chunks can span page breaks. Only the final page may
    therefore produce a short chunk.

    Args:
        texts: Iterable of dicts with a ``'content'`` string and a ``'page'``
            number, as produced by ``pdf_to_text``.
        word_length: Maximum number of words per chunk.
        start_page: Kept for backward compatibility. The page label is now
            taken from each record's own ``'page'`` value, which equals the
            previous ``idx + start_page`` whenever the records are consecutive
            pages beginning at ``start_page``.

    Returns:
        A list of ``(chunk_text, page)`` tuples, where ``chunk_text`` has the
        form ``[Page no. N] "..."`` and ``page`` is the page on which the chunk
        was completed (carried-over words from earlier pages are attributed to
        that page too).
    """
    text_toks = [
        (t['content'].split(' '), t['page'])
        for t in texts
    ]
    chunks = []
    last_idx = len(text_toks) - 1

    for idx in range(len(text_toks)):
        # Read the entry at iteration time: the previous page may have
        # prepended its leftover words to it.
        words, page = text_toks[idx]
        for i in range(0, len(words), word_length):
            chunk = words[i : i + word_length]
            if len(chunk) < word_length and idx != last_idx:
                # Short tail on a non-final page: carry it over to the next page.
                next_words, next_page = text_toks[idx + 1]
                text_toks[idx + 1] = (chunk + next_words, next_page)
                continue
            chunk = ' '.join(chunk).strip()
            # Label with the record's real page number so that the label and
            # the returned page can never disagree.
            chunk = f'[Page no. {page}]' + ' ' + '"' + chunk + '"'
            chunks.append((chunk, page))

    return chunks

In [159]:
# https://github.com/bhaskatripathi/pdfGPT/blob/main/api.py#L105
class SemanticSearch:
    """Nearest-neighbour text search over sentence embeddings.

    Texts are embedded with the encoder loaded in ``__init__`` and indexed
    with scikit-learn's ``NearestNeighbors``. Calling the instance with a
    query string returns the closest indexed texts (or their indices).
    """

    def __init__(self):
        """Load the sentence encoder; the index is built later by ``fit``."""
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed ``data`` and build the nearest-neighbour index.

        Args:
            data: Sequence of strings to index.
            batch: Number of texts sent to the encoder per call.
            n_neighbors: Number of results to return per query; capped at
                the number of indexed texts.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            raise ValueError("Cannot fit SemanticSearch on an empty collection of texts.")

        # Build everything in locals first so that a failure part-way through
        # cannot leave the instance with data and index out of sync.
        embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(embeddings))
        index = NearestNeighbors(n_neighbors=n_neighbors)
        index.fit(embeddings)

        self.data = data
        self.embeddings = embeddings
        self.nn = index
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the indexed texts closest to ``text``.

        Args:
            text: Query string.
            return_data: When true, return the matching texts; otherwise
                return their indices into the fitted data.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if not self.fitted:
            raise RuntimeError("SemanticSearch must be fitted with fit() before it can be queried.")

        inp_emb = self.use([text])
        # kneighbors returns one row per query; there is exactly one query here.
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        else:
            return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed ``texts`` in slices of ``batch`` and stack the results row-wise."""
        embeddings = [
            self.use(texts[i : (i + batch)])
            for i in range(0, len(texts), batch)
        ]
        return np.vstack(embeddings)

In [160]:
text_ls = pdf_to_text(pdf_path)

In [161]:
text_ls[:5]

[{'content': 'ML/January 2024 A developer-friendly platform for ML+AI systems ',
  'page': 1},
 {'content': 'Background 1 Outerbounds was spun off from Netflix in 2021. At Netflix, Outerbounds’ founders led ML and AI infrastructure, encoding the best practices of rapid ML/ AI development into an open-source library Metaflow, with a particular focus on human-centric, productivity- boosting developer experience. In addition to powering most ML/AI projects at Netflix today, Metaflow has become an industry-standard tool for production ML/AI systems, adopted by hundreds of leading companies. It powers a wide range of use cases from financial fraud detection and biotech to autonomous drones and custom large language models. Outerbounds builds on the foundation laid by Metaflow by offering it as a part of a fully managed, secure, cost- effective ML and AI platform. ',
  'page': 2},
 {'content': 'Scenario Continuously updating ML with structured data 2 Let’s take a look at a typical business-o

In [162]:
chunks = text_to_chunks(text_ls)

In [167]:
display(Markdown(chunks[0][0]))

[Page no. 3] "ML/January 2024 A developer-friendly platform for ML+AI systems  Background 1 Outerbounds was spun off from Netflix in 2021. At Netflix, Outerbounds’ founders led ML and AI infrastructure, encoding the best practices of rapid ML/ AI development into an open-source library Metaflow, with a particular focus on human-centric, productivity- boosting developer experience. In addition to powering most ML/AI projects at Netflix today, Metaflow has become an industry-standard tool for production ML/AI systems, adopted by hundreds of leading companies. It powers a wide range of use cases from financial fraud detection and biotech to autonomous drones and custom large language models. Outerbounds builds on the foundation laid by Metaflow by offering it as a part of a fully managed, secure, cost- effective ML and AI platform.  Scenario Continuously updating ML with structured data 2 Let’s take a look at a typical business-oriented ML system. The system ingests data from a"

In [163]:
recommender = SemanticSearch()

In [168]:
recommender.fit([c[0] for c in chunks])

In [169]:
question = 'What does Outerbounds do?'

In [170]:
topn_chunks = recommender(question)

In [184]:
topn_chunks

['[Page no. 8] "click - or more often, through a CI/CD system. Develop production-ready workflows quickly with open- source Metaflow that has been battle-hardened for years at Netflix and other leading companies. Deploy workflows with a single click and make them run automatically in stable, isolated execution environments, connected to other systems upstream and downstream. Build increasingly advanced systems incrementally by composing larger flows from individual components, dividing responsibilities across teams. Focus on operating your data, models, and applications with full visibility - Outerbounds keeps the foundational infrastructure running. start foreach_account process_account_date branch_step_pages foreach_one_step_page join_foreach_one_step_page branch_node_pages foreach_node_day_page create_history_view Runs Daily data 1 Daily data 2 Daily data 3 + Trigger + Trigger + Trigger Deployments Outerbounds APP 5.20 PM TrainingFlow/argo-3c245e6 succeeded Flow completed in 28m 2s 

In [182]:
chunks[13]

('[Page no. 15] "How does the managed Outerbounds platform differ from open-source Metaflow? Outerbounds Developer-friendly API Same open-source Metaflow Yes Yes Yes Yes Basic version in OSS Basic version in OSS Basic version in OSS Basic version in OSS Included Included Included Included Included Included Included Included Included Included Included Included w/ additional features Managed and optimized Managed and optimized Same open-source Metaflow Same open-source Metaflow Same open-source Metaflow Same open-source Metaflow No lock-in, build apps with open-source APIs Version and track everything Simple access to scalable compute Deploy to production with a single click Deploys securely in your cloud account Unlimited compute at no extra cost Secure data integrations Scalable compute backend Highly-available production orchestration Durable metadata Cloud workstations Comprehensive UI Multi-cloud compute Platform- and task-level performance metrics Cost tracking and optimization Aut

In [215]:
# The retrieved passages come first so the instructions that follow can refer
# to them as "the search results".
prompt = ""
prompt += 'search results:\n\n'
for c in topn_chunks:
    prompt += c + '\n\n'

# Serialise the question as a JSON string literal. Interpolating it raw would
# produce a malformed template as soon as the question contains a double
# quote, a backslash or a newline. `ensure_ascii=False` leaves ordinary
# non-ASCII text readable instead of turning it into \uXXXX escapes.
question_json = json.dumps(question, ensure_ascii=False)

# stolen: https://github.com/bhaskatripathi/pdfGPT/blob/main/api.py#L137C5-L146C6
prompt += (
    "Instructions: Only reply to the query based on the search results given. "
    "Cite each reference using [ Page Number ] notation "
    "(every result has this number at the beginning). "
    "Weave responses into a coherent and succinct paragraph. "
    "Citation should be done in the same words that it refers to in Markdown. "
    "Only include information found in the results and "
    "Only answer what is asked. The answer should be short and concise. "
    "Return a JSON object with the following format: \n\n"

    # Example of the expected response shape.
    "{\n"
    f'  "query": {question_json},\n'
    '  "answer": "Answer here."\n'
    "}\n\n"

    "Answer step-by-step. Include the page number in the most relevant citations. "

    # The prompt ends with the opening of the JSON object, left unfinished
    # after the "answer" key.
    "\n\n{\n"
    f'  "query": {question_json},\n'
    '  "answer":'
    "\n"
)

In [216]:
print(prompt)

search results:

[Page no. 8] "click - or more often, through a CI/CD system. Develop production-ready workflows quickly with open- source Metaflow that has been battle-hardened for years at Netflix and other leading companies. Deploy workflows with a single click and make them run automatically in stable, isolated execution environments, connected to other systems upstream and downstream. Build increasingly advanced systems incrementally by composing larger flows from individual components, dividing responsibilities across teams. Focus on operating your data, models, and applications with full visibility - Outerbounds keeps the foundational infrastructure running. start foreach_account process_account_date branch_step_pages foreach_one_step_page join_foreach_one_step_page branch_node_pages foreach_node_day_page create_history_view Runs Daily data 1 Daily data 2 Daily data 3 + Trigger + Trigger + Trigger Deployments Outerbounds APP 5.20 PM TrainingFlow/argo-3c245e6 succeeded Flow compl

In [217]:
# Conversation sent to the chat model: a system message that restricts the
# assistant to the retrieved material, then the assembled retrieval prompt as
# the user turn.
message_history = [
    dict(
        role="system",
        content=(
            "You are an elite professor specializing in machine learning. "
            "Discuss topics related to the search results, and no others."
        ),
    ),
    dict(role="user", content=prompt),
]

In [218]:
# The API key is read from the environment rather than hard-coded in the
# notebook.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Request a JSON-object response so the reply can be parsed with json.loads.
completion = client.chat.completions.create(
    messages=message_history,
    model="gpt-4o",
    response_format=dict(type="json_object"),
)

In [221]:
import json
json.loads(completion.choices[0].message.content)

{'query': 'What does Outerbounds do?',
 'answer': 'Outerbounds provides a developer-friendly platform for ML and AI systems, spun off from Netflix in 2021. It builds on the open-source library Metaflow, which is designed for rapid development and a human-centric developer experience. Outerbounds offers a fully managed, secure, and cost-effective ML and AI platform that includes features like scalable compute, secure data integrations, and highly-available production orchestration. The platform tracks, records, and versions all data, code, and models automatically, and it allows for easy deployment and operation of system variants for tasks like A/B testing. Additionally, it provides tools for visualizing custom metrics and KPIs with real-time dashboards and reports [Page no. 3, Page no. 8, Page no. 15].'}

## Fetch remote data

In [None]:
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "pdfs/llama2.pdf"

In [31]:
import subprocess
def download_and_open(url: str, path: str) -> FitzDocument:
    """Download ``url`` to ``path`` with wget and open the result with PyMuPDF.

    Args:
        url: Address of the file to download.
        path: Local file path to write to; its parent directory is created
            if it does not exist yet.

    Returns:
        The document opened by ``fitz.open(path)``.

    Raises:
        subprocess.CalledProcessError: If wget exits with a non-zero status.
        FileNotFoundError: If the ``wget`` executable is not available.
    """
    # Notebook-magic equivalent of the call below:
    # !wget --user-agent "Mozilla" "{url}" -O "{path}"
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Arguments are passed as a list (no shell), and check=True stops a failed
    # download from being silently handed to fitz.open as a partial or stale
    # file.
    subprocess.run(["wget", "--user-agent", "Mozilla", url, "-O", path], check=True)
    return fitz.open(path)

In [32]:
doc = download_and_open("https://arxiv.org/pdf/2307.09288.pdf", "pdfs/llama2.pdf")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
--2024-06-29 12:02:53--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 151.101.131.42, 151.101.3.42, 151.101.195.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.131.42|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://arxiv.org/pdf/2307.09288 [following]
--2024-06-29 12:02:53--  http://arxiv.org/pdf/2307.09288
Connecting to arxiv.org (arxiv.org)|151.101.131.42|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘pdfs/llama2.pdf’

     0K .......... .......... .......... .......... ..........  0% 3.56M 4s
    50K .......... .......... .......... .......... ..........  

In [34]:
type(doc)

pymupdf.Document

## Split and embed

In [10]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

In [11]:
from llama_index.core.node_parser import SentenceSplitter

In [12]:

text_parser = SentenceSplitter(
    chunk_size=1024,
    # separator=" ",
)

In [13]:
# NOTE(review): `documents` is not defined in any cell visible in this
# notebook; presumably it came from a since-removed loader cell (the
# PyMuPDFReader import at the top is commented out). Confirm and restore it,
# otherwise this cell fails on a fresh kernel.

# Split every document into chunks and remember, for each chunk, the index of
# the document it came from.
text_chunks = []
doc_idxs = []
# The loop variable is deliberately not called `doc`, so the PyMuPDF handle
# bound to `doc` by the download cell is not overwritten.
for doc_idx, source_document in enumerate(documents):
    cur_text_chunks = text_parser.split_text(source_document.text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [14]:
from llama_index.core.schema import TextNode

# Wrap each chunk in a TextNode and attach the metadata of the document the
# chunk was split from (`doc_idxs` runs parallel to `text_chunks`).
nodes = []
for chunk_text, source_idx in zip(text_chunks, doc_idxs):
    chunk_node = TextNode(text=chunk_text)
    chunk_node.metadata = documents[source_idx].metadata
    nodes.append(chunk_node)

In [15]:
# Embed every node; the text that is embedded is the node content rendered
# with metadata_mode="all".
for node in nodes:
    content_to_embed = node.get_content(metadata_mode="all")
    node.embedding = embed_model.get_text_embedding(content_to_embed)

## Load nodes into vector stores

In [17]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.duckdb import DuckDBVectorStore
from llama_index.core import StorageContext
from IPython.display import Markdown, display

vector_store = DuckDBVectorStore()
vector_store.add(nodes)

['6b9759b1-d9f7-497f-aaca-5d2ab7caf131',
 'a627933b-4163-42e2-ac37-fcd2456e08da',
 '43831fa7-8e73-43b3-b3df-6ca8188d8e11',
 '2d884c06-af36-4766-b607-79d8f268a273',
 'a3766d1a-1a20-45b8-96e5-d5151a0a196e',
 '7c3f3309-8284-4a9c-ab8b-1f63e2f174b0',
 '1210d171-7b44-4071-92d4-399f2f4262b3',
 '19bdd5e9-58f2-4499-9c78-27263bb62129',
 '5e22e169-37f2-4135-8136-0df46326852e',
 '09c78565-3901-4fe1-8688-238837eafc70',
 '53eb5d2a-6200-402b-8908-5280320eb720',
 'f9195249-88b0-4c3a-bab8-9f43158e0424',
 'e8905a70-ab74-4d49-9c90-9018753cbb65',
 '3000b8e6-9944-48ba-a28c-aed145ce6434',
 '80b8b29a-caad-43d4-bf48-b38654e53209',
 'ec60da24-79f8-40d6-a176-44a12498ee4c',
 'bf62e53c-949a-453f-8866-d974c942f97e',
 'cb76a6c5-b6ea-4e27-8c4d-8b83de257d39',
 '225ac8dc-190d-431d-a7f1-3fead2ac5d76',
 '5fbf5fdf-8a88-482f-b0fd-4ef792370eb8',
 'befcf16a-388d-42a0-9bda-f0214ccbf36f',
 '089d9dfa-eb6d-4c9f-9991-4cedb5fa096c',
 '83fb5f0a-4d29-4ce3-8ca8-e4b09c2fa637',
 'ec375fa3-5796-40e6-97b8-be0164944d90',
 'ae88c8f2-140c-

In [18]:
query_str = "Can you tell me about the key concepts for safety finetuning"

In [19]:
query_embedding = embed_model.get_query_embedding(query_str)

In [20]:
from llama_index.core.vector_stores import VectorStoreQuery
# Describe the lookup: the embedded question, the number of hits wanted, and
# the query mode.
query_mode = "default"
vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding,
    similarity_top_k=2,
    mode=query_mode,
)

In [22]:
query_result = vector_store.query(vector_store_query)
# print(query_result.nodes[0].get_content())

In [23]:
from llama_index.core.schema import NodeWithScore
from typing import Optional

# Pair each returned node with its similarity score; the score stays None
# when the store did not report similarities.
similarities = query_result.similarities
nodes_with_scores = [
    NodeWithScore(
        node=node,
        score=None if similarities is None else similarities[index],
    )
    for index, node in enumerate(query_result.nodes)
]

In [24]:
nodes_with_scores

[NodeWithScore(node=TextNode(id_='6f3cc4b6-ab87-4740-8047-13a8b866ba3a', embedding=None, metadata={'total_pages': 77, 'file_path': 'pdfs/llama2.pdf', 'source': '23'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='TruthfulQA ↑\nToxiGen ↓\nMPT\n7B\n29.13\n22.32\n30B\n35.25\n22.61\nFalcon\n7B\n25.95\n14.53\n40B\n40.39\n23.44\nLlama 1\n7B\n27.42\n23.00\n13B\n41.74\n23.08\n33B\n44.19\n22.57\n65B\n48.71\n21.77\nLlama 2\n7B\n33.29\n21.25\n13B\n41.86\n26.10\n34B\n43.45\n21.19\n70B\n50.18\n24.60\nTable 11: Evaluation of pretrained LLMs on automatic safety benchmarks. For TruthfulQA, we present the\npercentage of generations that are both truthful and informative (the higher the better). For ToxiGen, we\npresent the percentage of toxic generations (the smaller, the better).\nBenchmarks give a summary view of model capabilities and behaviors that allow us to understand general\npatterns in the model, but they do not provide a fully comprehensive view of t

In [26]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List


class VectorDBRetriever(BaseRetriever):
    """Retriever over a DuckDB vector store.

    Embeds the incoming query with the supplied embedding model, runs a
    similarity query against the supplied vector store, and returns the hits
    as ``NodeWithScore`` objects.
    """

    def __init__(
        self,
        vector_store: DuckDBVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Store the collaborators and query settings.

        Args:
            vector_store: Store that is queried for similar nodes.
            embed_model: Object providing ``get_query_embedding(str)``.
            query_mode: Mode forwarded to ``VectorStoreQuery``.
            similarity_top_k: Number of hits requested per query.
        """
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Return the nodes most similar to ``query_bundle.query_str``.

        Each node is paired with its similarity score, or ``None`` when the
        store does not report similarities.
        """
        # Use the collaborators injected through __init__, not the notebook
        # globals of the same name, so the instance honours its own arguments.
        query_embedding = self._embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = self._vector_store.query(vector_store_query)

        nodes_with_scores = []
        for index, node in enumerate(query_result.nodes):
            score: Optional[float] = None
            if query_result.similarities is not None:
                score = query_result.similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [27]:
retriever = VectorDBRetriever(
    vector_store, embed_model, query_mode="default", similarity_top_k=2
)

In [29]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.llama_cpp import LlamaCPP

# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'llama_index.llms.llama_cpp'", i.e. the
# LlamaCPP import at the top of the cell failed and nothing below ran in the
# saved session. Presumably the llama.cpp integration for llama_index has to
# be installed separately first - TODO confirm the package name.

# Quantised Llama-2 13B chat weights in GGUF format, hosted on Hugging Face.
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # You can pass in the URL to a model file (GGUF here) to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=100,
    context_window=3900,
    generate_kwargs={},
    # model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)

# Combine the custom retriever defined above with the local LLM into a query engine.
query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

ModuleNotFoundError: No module named 'llama_index.llms.llama_cpp'