## Dependencies

In [1]:
# Set TOKENIZERS_PARALLELISM=false for this kernel before any model code runs.
# NOTE(review): presumably this silences the Hugging Face tokenizers fork/parallelism warning — confirm.
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [8]:
import json
import os
import re
import subprocess
from pprint import pprint
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import requests

try:
    import pymupdf as fitz  # available with v1.24.3
except ImportError:
    import fitz

from dotenv import load_dotenv
from fitz import Document as FitzDocument
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors

load_dotenv()

  from tqdm.autonotebook import tqdm, trange


True

## Load data

In [9]:
# Local PDF used for the first walkthrough; opened with fitz below and later
# passed to pdf_to_text().
pdf_path = "./pdfs/outerbounds-brief.pdf"

In [10]:
# Open the file for inspection and fail fast if it is not reported as a PDF.
doc = fitz.open(pdf_path)
assert doc.is_pdf

In [11]:
# Summarise the document: page count first, then the metadata dict.
# The "Metadata: " label is printed without a newline so pprint's output
# starts on the same line.
page_total = doc.page_count
print(f"Number of pages: {page_total}")
print("Metadata: ", end="")
pprint(doc.metadata)

Number of pages: 18
Metadata: {'author': '',
 'creationDate': "D:20240130113640-08'00'",
 'creator': 'Acrobat Pro 23.8.20470',
 'encryption': None,
 'format': 'PDF 1.7',
 'keywords': '',
 'modDate': "D:20240130113744-08'00'",
 'producer': 'Acrobat Pro 23.8.20470',
 'subject': '',
 'title': '',
 'trapped': ''}


In [12]:
# Pretty-print the document's table of contents as returned by get_toc().
pprint(doc.get_toc())

[[1, 'Cover', 1],
 [1, '1', 2],
 [1, '2', 3],
 [1, '3', 4],
 [1, '4', 5],
 [1, '5', 6],
 [1, '6', 7],
 [1, '7', 8],
 [1, '8', 9],
 [1, '9', 10],
 [1, '10', 11],
 [1, '11', 12],
 [1, '12', 13],
 [1, '13', 14],
 [1, '14', 15],
 [1, '15', 16],
 [1, '16', 17],
 [1, 'Back', 18]]


## Convert to text

In [13]:
def preprocess(text):
    """Flatten `text` onto one line.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space. Newlines are whitespace, so one regex pass handles both the
    line breaks and the repeated spaces. Leading/trailing whitespace is
    reduced to a single space, not stripped.
    """
    return re.sub(r"\s+", " ", text)


def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the plain text of a page range from the document at `path`.

    Args:
        path: Location of the document, passed straight to `fitz.open`.
        start_page: 1-based number of the first page to extract.
        end_page: 1-based number of the last page to extract (inclusive).
            `None` means "through the last page of the document".

    Returns:
        A list with one dict per extracted page, in page order:
        `{"content": <whitespace-normalised text>, "page": <1-based page number>}`.
    """
    doc = fitz.open(path)
    try:
        if end_page is None:
            end_page = doc.page_count
        # Pages are 0-indexed inside fitz but reported 1-based to callers.
        return [
            {
                "content": preprocess(doc.load_page(i).get_text("text")),
                "page": i + 1,
            }
            for i in range(start_page - 1, end_page)
        ]
    finally:
        # Release the file handle even when loading/extracting a page raises.
        doc.close()


def text_to_chunks(texts, word_length=150, start_page=1):
    """Split page texts into chunks of at most `word_length` space-separated tokens.

    A page's trailing partial chunk is not emitted on its own: it is prepended
    to the next page's tokens, so chunks can span page boundaries. Only the
    final page may produce a chunk shorter than `word_length`. A chunk is
    labelled with the page on which it was completed.

    Args:
        texts: Sequence of `{"content": str, "page": int}` records (the shape
            produced by `pdf_to_text`). The input is not modified.
        word_length: Maximum number of tokens per chunk; must be >= 1.
        start_page: Kept for backward compatibility and no longer used. The
            "[Page no. N]" label is now taken from each record's own "page"
            value, so it always agrees with the page returned alongside the
            chunk. (Previously it was `index + start_page`, which was only
            correct when the first record happened to be page `start_page`.)

    Returns:
        List of `(chunk, page)` tuples where `chunk` is
        `'[Page no. <page>] "<text>"'`.

    Raises:
        ValueError: If `word_length` is smaller than 1.
    """
    if word_length < 1:
        raise ValueError("word_length must be a positive integer")

    # Private working copy: carried-over tokens are spliced into this list,
    # never into the caller's `texts`.
    pages = [(t["content"].split(" "), t["page"]) for t in texts]
    last_idx = len(pages) - 1
    chunks = []

    for idx, (words, page) in enumerate(pages):
        for i in range(0, len(words), word_length):
            chunk_words = words[i : i + word_length]
            if len(chunk_words) < word_length and idx != last_idx:
                # Short tail of a non-final page: push it onto the front of
                # the next page so it is emitted as part of a full chunk.
                next_words, next_page = pages[idx + 1]
                pages[idx + 1] = (chunk_words + next_words, next_page)
                continue
            body = " ".join(chunk_words).strip()
            chunks.append((f'[Page no. {page}] "{body}"', page))

    return chunks

In [27]:
# Descriptor of the text-embedding model used for semantic search.
# Only "model_name" is read in this notebook (by SemanticSearchModel.__init__);
# the remaining keys are descriptive metadata.
TEXT_EMBEDDING_MODEL_INFO = {
    "model_name": "all-MiniLM-L6-v2",
    "model_framework": "sentence-transformers",
    "pretrained_model_provider": "Hugging Face",
    "use_case": "text-semantic-search",
}


class SemanticSearchModel:
    """
    Manager for a semantic search model.

    Texts are embedded with the sentence-transformers model named in
    TEXT_EMBEDDING_MODEL_INFO and indexed with scikit-learn's NearestNeighbors.

    args:
        None

    methods:
        fit(data: List[str], batch_size: int, n_neighbors: int) -> None:
            Embeds `data` and fits the nearest-neighbour index on it.
        __call__(text: str, return_data: bool) -> list | np.ndarray:
            Returns the nearest neighbours of `text` among the fitted data.
        _get_text_embedding(texts: List[str], batch_size: int) -> np.ndarray:
            Returns the embeddings of the texts.
    """

    def __init__(self):
        self.embedding_model = SentenceTransformer(
            TEXT_EMBEDDING_MODEL_INFO["model_name"]
        )
        # Populated by fit(); declared here so the attributes always exist.
        self.data = None
        self.embeddings = None
        self.nn = None
        self.fitted = False

    def _get_text_embedding(self, texts, batch_size=1000):
        """
        Gather a stack of embedded texts, packed batch_size at a time.

        Returns one 2-D array with a row per input text (batches are stacked
        vertically in input order).
        """
        embeddings = []
        n_texts = len(texts)
        for batch_start_idx in range(0, n_texts, batch_size):
            text_batch = texts[batch_start_idx : (batch_start_idx + batch_size)]
            embedding_batch = self.embedding_model.encode(text_batch)
            embeddings.append(embedding_batch)
        print("[DEBUG] Embedding batches:", len(embeddings))
        embeddings = np.vstack(embeddings)
        print("[DEBUG] Embedding reshaped:", embeddings.shape)
        return embeddings

    def fit(self, data, batch_size=1000, n_neighbors=6):
        """
        Fits the model with the data when a new PDF is uploaded.

        args:
            data: sequence of texts to index.
            batch_size: number of texts embedded per encode() call.
            n_neighbors: neighbours returned per query; capped at len(data).

        raises:
            ValueError: if `data` is empty.
        """
        if len(data) == 0:
            # Fail with a clear message instead of an opaque np.vstack error.
            raise ValueError("Cannot fit SemanticSearchModel on empty data.")

        embeddings = self._get_text_embedding(data, batch_size=batch_size)
        n_neighbors = min(n_neighbors, len(embeddings))
        print(
            "[DEBUG] Fitting Nearest Neighbors model with %s neighbors." % n_neighbors
        )
        nn = NearestNeighbors(n_neighbors=n_neighbors)
        nn.fit(embeddings)
        print("[DEBUG] Fit complete.")

        # Commit all state together, only after everything succeeded, so a
        # failed refit never leaves `data` out of sync with the old index.
        self.data = data
        self.embeddings = embeddings
        self.nn = nn
        self.fitted = True

    def __call__(self, text, return_data=True):
        """
        Inference time method.
        Return the nearest neighbors of a new text.

        args:
            text: query string.
            return_data: when True return the neighbouring texts themselves,
                otherwise return their indices into the fitted data.

        raises:
            RuntimeError: if called before fit().
        """
        if not self.fitted:
            raise RuntimeError(
                "SemanticSearchModel must be fit() before it can be queried."
            )
        print("[DEBUG] Getting nearest neighbors of text:", text)
        embedding = self.embedding_model.encode([text])
        print("[DEBUG] Embedding:", embedding.shape)
        # kneighbors returns one row of indices per query; there is one query.
        neighbors = self.nn.kneighbors(embedding, return_distance=False)[0]
        if return_data:
            return [self.data[text_neighbs] for text_neighbs in neighbors]
        else:
            return neighbors

In [28]:
# Extract every page (default page range) as {"content", "page"} records.
text_ls = pdf_to_text(pdf_path)

In [29]:
# Peek at the first five page records.
text_ls[:5]

[{'content': 'ML/January 2024 A developer-friendly platform for ML+AI systems ',
  'page': 1},
 {'content': 'Background 1 Outerbounds was spun off from Netflix in 2021. At Netflix, Outerbounds’ founders led ML and AI infrastructure, encoding the best practices of rapid ML/ AI development into an open-source library Metaflow, with a particular focus on human-centric, productivity- boosting developer experience. In addition to powering most ML/AI projects at Netflix today, Metaflow has become an industry-standard tool for production ML/AI systems, adopted by hundreds of leading companies. It powers a wide range of use cases from financial fraud detection and biotech to autonomous drones and custom large language models. Outerbounds builds on the foundation laid by Metaflow by offering it as a part of a fully managed, secure, cost- effective ML and AI platform. ',
  'page': 2},
 {'content': 'Scenario Continuously updating ML with structured data 2 Let’s take a look at a typical business-o

In [30]:
# Split the page records into (chunk, page) tuples using the default word_length of 150.
chunks = text_to_chunks(text_ls)

In [31]:
# `Markdown` is not available by default in a fresh kernel, so import it
# explicitly (importing `display` alongside keeps the cell self-contained).
from IPython.display import Markdown, display

# Render the text of the first chunk (element 0 of the (chunk, page) tuple).
display(Markdown(chunks[0][0]))

[Page no. 3] "ML/January 2024 A developer-friendly platform for ML+AI systems  Background 1 Outerbounds was spun off from Netflix in 2021. At Netflix, Outerbounds’ founders led ML and AI infrastructure, encoding the best practices of rapid ML/ AI development into an open-source library Metaflow, with a particular focus on human-centric, productivity- boosting developer experience. In addition to powering most ML/AI projects at Netflix today, Metaflow has become an industry-standard tool for production ML/AI systems, adopted by hundreds of leading companies. It powers a wide range of use cases from financial fraud detection and biotech to autonomous drones and custom large language models. Outerbounds builds on the foundation laid by Metaflow by offering it as a part of a fully managed, secure, cost- effective ML and AI platform.  Scenario Continuously updating ML with structured data 2 Let’s take a look at a typical business-oriented ML system. The system ingests data from a"

In [32]:
# Instantiating loads the sentence-transformers model named in TEXT_EMBEDDING_MODEL_INFO.
recommender = SemanticSearchModel()

In [33]:
# Index only the chunk strings (element 0 of each (chunk, page) tuple).
recommender.fit([c[0] for c in chunks])

[DEBUG] Embedding batches: 1
[DEBUG] Embedding reshaped: (18, 384)
[DEBUG] Fitting Nearest Neighbors model with 6 neighbors.
[DEBUG] Fit complete.


In [34]:
# Query used for retrieval and interpolated into the prompt built below.
question = "What does Outerbounds do?"

In [35]:
# Retrieve the chunk texts nearest to the question (return_data defaults to True).
topn_chunks = recommender(question)

[DEBUG] Getting nearest neighbors of text: What does Outerbounds do?
[DEBUG] Embedding: (1, 384)


In [36]:
# Inspect the retrieved chunks.
topn_chunks

['[Page no. 4] "Based on our experience from working with hundreds of companies, real-world ML and AI systems end up including a these four foundational layers of infrastructure - sometimes organically, sometimes by design: There are many valid technical solutions to each of these layers. While not all approaches are equal, ultimately human factors - the ease of experimentation, development, and operations - tend to dominate the effectiveness of the overall solution. Outerbounds provides a full stack of ML/AI infrastructure, addressing the above layers holistically - take a look how. Accessing data efficiently and securely. Data Leveraging compute resources to process data, train models, and run inference. Compute Orchestrating the system, keeping it running in a highly- available manner. Orchestration Observing and keeping track of code, data, and models across experiments and production. Tracking and Versioning Enabling developers to experiment rapidly, develop effectively, ship to p

In [37]:
# Inspect a single (chunk, page) tuple by index.
chunks[13]

('[Page no. 15] "How does the managed Outerbounds platform differ from open-source Metaflow? Outerbounds Developer-friendly API Same open-source Metaflow Yes Yes Yes Yes Basic version in OSS Basic version in OSS Basic version in OSS Basic version in OSS Included Included Included Included Included Included Included Included Included Included Included Included w/ additional features Managed and optimized Managed and optimized Same open-source Metaflow Same open-source Metaflow Same open-source Metaflow Same open-source Metaflow No lock-in, build apps with open-source APIs Version and track everything Simple access to scalable compute Deploy to production with a single click Deploys securely in your cloud account Unlimited compute at no extra cost Secure data integrations Scalable compute backend Highly-available production orchestration Durable metadata Cloud workstations Comprehensive UI Multi-cloud compute Platform- and task-level performance metrics Cost tracking and optimization Aut

In [38]:
# First section: every retrieved chunk, each followed by a blank line.
search_results_section = "search results:\n\n" + "".join(
    result + "\n\n" for result in topn_chunks
)

# The question is echoed twice: once in the format example and once in the
# partially-filled JSON the model is asked to complete.
query_line = f'  "query": "{question}",\n'

# Instruction wording adapted from pdfGPT:
# https://github.com/bhaskatripathi/pdfGPT/blob/main/api.py#L137C5-L146C6
instruction_parts = [
    "Instructions: Only reply to the query based on the search results given. ",
    "Cite each reference using [ Page Number ] notation ",
    "(every result has this number at the beginning). ",
    "Weave responses into a coherent and succinct paragraph. ",
    "Citation should be done in the same words that it refers to in Markdown. ",
    "Only include information found in the results and ",
    "Only answer what is asked. The answer should be short and concise. ",
    "Return a JSON object with the following format: \n\n",
    "{\n",
    query_line,
    '  "answer": "Answer here."\n',
    "}\n\n",
    "Answer step-by-step. Include the page number in the most relevant citations. ",
    "\n\n{\n",
    query_line,
    '  "answer":',
    "\n",
]

prompt = search_results_section + "".join(instruction_parts)

In [39]:
# Show the full prompt that will be sent as the user message.
print(prompt)

search results:

[Page no. 4] "Based on our experience from working with hundreds of companies, real-world ML and AI systems end up including a these four foundational layers of infrastructure - sometimes organically, sometimes by design: There are many valid technical solutions to each of these layers. While not all approaches are equal, ultimately human factors - the ease of experimentation, development, and operations - tend to dominate the effectiveness of the overall solution. Outerbounds provides a full stack of ML/AI infrastructure, addressing the above layers holistically - take a look how. Accessing data efficiently and securely. Data Leveraging compute resources to process data, train models, and run inference. Compute Orchestrating the system, keeping it running in a highly- available manner. Orchestration Observing and keeping track of code, data, and models across experiments and production. Tracking and Versioning Enabling developers to experiment rapidly, develop effecti

In [40]:
# System message: fixes the persona and restricts the model to the retrieved material.
system_instructions = (
    "You are an elite professor specializing in machine learning. "
    "Discuss topics related to the search results, and no others."
)

# Two-turn conversation: system instructions, then the assembled prompt as the user turn.
message_history = [
    {"role": "system", "content": system_instructions},
    {"role": "user", "content": prompt},
]

In [41]:
# The API key is read from the environment rather than hard-coded.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

# Request a JSON-object response so the reply can be parsed with json.loads.
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=message_history,
    response_format={"type": "json_object"},
)

In [42]:
# NOTE(review): `json` is already imported in the dependencies cell; this
# re-import is redundant but harmless.
import json

# Parse the first choice's message content (requested as a JSON object) into a dict.
json.loads(completion.choices[0].message.content)

{'query': 'What does Outerbounds do?',
 'answer': 'Outerbounds provides a comprehensive ML/AI infrastructure that includes data access, compute resources, orchestration, and tracking/versioning. It offers a developer-friendly platform to enhance experimentation, development, and production workflows with a single API and coherent UI. Originally spun off from Netflix in 2021, it builds on the open-source Metaflow library to offer a fully managed, secure, and cost-effective platform for a wide range of use cases, including financial fraud detection, biotech, autonomous drones, and large language models [ Page no. 3 ][ Page no. 4 ][ Page no. 9 ][ Page no. 15 ].'}

## Fetch remote data

In [61]:
import subprocess


def download(url: str, path: str) -> None:
    """Download `url` to the local file `path` using the `wget` command.

    Args:
        url: Remote resource to fetch.
        path: Destination file; missing parent directories are created.

    Returns:
        None. The file is written to `path` as a side effect; open it
        separately (e.g. with `fitz.open`).

    Raises:
        FileNotFoundError: If the `wget` executable is not available.
        subprocess.CalledProcessError: If `wget` exits with a non-zero status.
    """
    # `wget -O` does not create missing directories, so make sure the target exists.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    subprocess.run(
        # A browser-like user agent is sent with the request.
        ["wget", "--user-agent", "Mozilla", url, "-O", path],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        # wget's output is discarded above, so rely on the exit status to
        # surface failures instead of continuing with a missing/partial file.
        check=True,
    )

In [71]:
# Fetch the Llama 2 paper and point `pdf_path` at the downloaded copy.
pdf_path = "pdfs/llama2.pdf"
# Use the `download` helper defined above with the destination path just set;
# the previous call referenced a function and a variable that are not defined
# anywhere in this notebook.
download("https://arxiv.org/pdf/2307.09288.pdf", pdf_path)
pdf = fitz.open(pdf_path)

In [83]:
# NOTE(review): `prompt` still holds the prompt built for the previous question;
# the prompt for the new PDF is only assembled in the next cell.
print(prompt)

search results:

[Page no. 4] "Based on our experience from working with hundreds of companies, real-world ML and AI systems end up including a these four foundational layers of infrastructure - sometimes organically, sometimes by design: There are many valid technical solutions to each of these layers. While not all approaches are equal, ultimately human factors - the ease of experimentation, development, and operations - tend to dominate the effectiveness of the overall solution. Outerbounds provides a full stack of ML/AI infrastructure, addressing the above layers holistically - take a look how. Accessing data efficiently and securely. Data Leveraging compute resources to process data, train models, and run inference. Compute Orchestrating the system, keeping it running in a highly- available manner. Orchestration Observing and keeping track of code, data, and models across experiments and production. Tracking and Versioning Enabling developers to experiment rapidly, develop effecti

In [84]:
# Re-run the whole pipeline on the newly downloaded PDF.
# NOTE: this rebinds text_ls / chunks / question / topn_chunks / prompt and
# refits `recommender` in place, replacing the state built for the first PDF.
text_ls = pdf_to_text(pdf_path)
chunks = text_to_chunks(text_ls)
recommender.fit([c[0] for c in chunks])

question = "What were major advances in Llama 2?"
topn_chunks = recommender(question)

# Search results first, one retrieved chunk per paragraph.
prompt = ""
prompt += "search results:\n\n"
for c in topn_chunks:
    prompt += c + "\n\n"

# Same instructions as the first prompt in this notebook. The JSON format
# example is restored here: without it the sentence "Return a JSON object with
# the following format:" was followed by no format at all.
prompt += (
    "Instructions: Only reply to the query based on the search results given. "
    "Cite each reference using [ Page Number ] notation "
    "(every result has this number at the beginning). "
    "Weave responses into a coherent and succinct paragraph. "
    "Citation should be done in the same words that it refers to in Markdown. "
    "Only include information found in the results and "
    "Only answer what is asked. The answer should be short and concise. "
    "Return a JSON object with the following format: \n\n"
    "{\n"
    f'  "query": "{question}",\n'
    '  "answer": "Answer here."\n'
    "}\n\n"
    "Answer step-by-step. Include the page number in the most relevant citations. "
    "\n\n{\n"
    f'  "query": "{question}",\n'
    '  "answer":'
    "\n"
)

message_history = [
    {
        "role": "system",
        "content": "You are an elite professor specializing in machine learning. "
        + "Discuss topics related to the search results, and no others.",
    },
    {
        "role": "user",
        "content": prompt,
    },
]

# The API key is read from the environment; the reply is requested as a JSON object.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
completion = client.chat.completions.create(
    model="gpt-4o", messages=message_history, response_format={"type": "json_object"}
)

[DEBUG] Embedding batches: 1
[DEBUG] Embedding reshaped: (283, 384)
[DEBUG] Fitting Nearest Neighbors model with 6 neighbors.
[DEBUG] Fit complete.
[DEBUG] Getting nearest neighbors of text: What were major advances in Llama 2?
[DEBUG] Embedding: (1, 384)


In [85]:
print(completion.choices[0].message.content)

{
  "query": "What were major advances in Llama 2?",
  "answer": "Major advances in Llama 2 include the introduction of models with up to 70B parameters, with pretraining methodologies leveraging publicly available online sources, and fine-tuning through supervised methods followed by Reinforcement Learning with Human Feedback (RLHF) methodologies [Page no. 5]. Llama 2-Chat variants, optimized for dialogue use cases, were also released with 7B, 13B, and 70B parameters [Page no. 4]. Evaluation shows Llama 2 70B achieving results close to GPT-3.5 on MMLU and GSM8K benchmarks, and performing on par or better than PaLM (540B) on almost all benchmarks [Page no. 8]. Safety evaluations indicate improvements, with lower toxicity percentages observed in ToxiGen benchmarks and better performance in TruthfulQA benchmarks compared to its predecessors [Page no. 23]. However, the release of the 34B variant was delayed due to insufficient time for comprehensive red-teaming [Page no. 5]."
}
