In [2]:
import requests
from bs4 import BeautifulSoup




In [3]:
# Number of characters sliced off the start / end of the extracted prose before
# it is returned. NOTE(review): presumably tuned by hand to drop the docs site's
# leading/trailing boilerplate -- re-check these whenever the site layout changes.
header_length = 1050
footer_length = 250


def extract_pytorch_docs(url, timeout=10):
    """Fetch a PyTorch documentation page and pull out its prose and code samples.

    Args:
        url: Address of the documentation page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled connection.

    Returns:
        A ``(page_text, code_text)`` tuple of strings.

        * ``page_text`` is the text of every ``h1``/``h2``/``h3``/``p``/``li``
          element, each followed by a single space, with the first
          ``header_length`` and last ``footer_length`` characters removed.
          If the page is shorter than the two trims combined this is ``""``.
        * ``code_text`` is the content of every ``<pre>`` element, split on the
          ``>>>`` prompt marker into one statement per line, with consecutive
          ``<pre>`` blocks separated by a newline.

        Both strings are empty when the request fails or returns a non-200
        status; the failure is reported with ``print`` rather than raised.
    """
    page_text = ""
    code_text = ""
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")

            # Remove non-content elements so they do not leak into the text.
            for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
                tag.decompose()

            # NOTE(review): a <p> nested inside an <li> matches twice here
            # (once via the <li>, once on its own), so its text can be
            # duplicated -- confirm whether that matters downstream.
            text_parts = []
            for node in soup.find_all(["h1", "h2", "h3", "p", "li"]):
                text = node.get_text(separator=" ", strip=True)
                if text:
                    text_parts.append(text)
            # Every fragment keeps its trailing space so the header/footer
            # character offsets line up with the trim constants above.
            page_text = "".join(part + " " for part in text_parts)

            code_blocks = []
            for pre in soup.find_all("pre"):
                raw_code = pre.get_text(separator=" ", strip=True)
                # Doctest-style samples are flattened onto one line by
                # get_text; splitting on the prompt restores one statement per line.
                statements = [
                    piece.strip() for piece in raw_code.split(">>>") if piece.strip()
                ]
                if statements:
                    code_blocks.append("\n".join(statements))
            # Join with a newline so the last line of one sample is not fused
            # onto the first line of the next.
            code_text = "\n".join(code_blocks)
        else:
            print(f"Failed to fetch the page. Status code: {response.status_code}")
    except Exception as e:
        # Deliberately best-effort: report the problem and fall through to
        # return whatever (possibly empty) text was collected.
        print(f"An error occurred: {e}")
    return page_text[header_length:(len(page_text)-footer_length)], code_text

In [4]:
# Try the scraper on the tensor_view documentation page.
url = "https://docs.pytorch.org/docs/stable/tensor_view.html"
page_text, code_text = extract_pytorch_docs(url)

# Show the prose, a blank separator line, then the extracted code samples.
print(page_text, "", code_text, sep="\n")

 Last Updated On: Feb 26, 2025 PyTorch allows a tensor to be a View of an existing tensor. View tensor shares the same underlying data
with its base tensor. Supporting View avoids explicit data copy, thus allows us to do fast and memory efficient
reshaping, slicing and element-wise operations. For example, to get a view of an existing tensor t , you can call t.view(...) . Since views share underlying data with its base tensor, if you edit the data
in the view, it will be reflected in the base tensor as well. Typically a PyTorch op returns a new tensor as output, e.g. add() .
But in case of view ops, outputs are views of input tensors to avoid unnecessary data copy.
No data movement occurs when creating a view, view tensor just changes the way
it interprets the same data. Taking a view of contiguous tensor could potentially produce a non-contiguous tensor.
Users should pay additional attention as contiguity might have implicit performance impact. transpose() is a common example. For ref

In [5]:
# Scrape the futures documentation page. page_text / code_text are rebound
# here, so these are the values the later upsert cell sends to the index.
url = "https://docs.pytorch.org/docs/stable/futures.html"
page_text, code_text = extract_pytorch_docs(url)

# Prose first, then an empty line, then the code samples.
for section in (page_text, "", code_text):
    print(section)

| Last Updated On: Jun 12, 2025 This package provides a Future type that encapsulates
an asynchronous execution and a set of utility functions to simplify operations
on Future objects. Currently, the Future type is primarily used by the Distributed RPC Framework . Wrapper around a torch._C.Future which encapsulates an asynchronous
execution of a callable, e.g. rpc_async() . It
when the Future is completed.  Multiple callbacks can be added to
the same Future , but the order in which they will be executed cannot
be guaranteed. The callback must take one argument, which is the
reference to this Future . The callback function can use the value() method to get the value. Note that if this Future is
already completed, the given callback will be run inline. We recommend that you use the then() method as it provides a way
to synchronize after your callback has completed. add_done_callback can be cheaper if your callback does not return anything. But both then() and add_done_callback use the sa

In [8]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os

# Load environment variables (expected to include PINECONE_API_KEY) via dotenv.
load_dotenv()

API_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=API_key)

index_name = "teacher"

# Only create the index when it does not exist yet, so re-running this cell
# is harmless.
index_exists = pc.has_index(index_name)
if not index_exists:
    # Records are embedded from their "chunk_text" field.
    embed_config = {
        "model": "llama-text-embed-v2",
        "field_map": {"text": "chunk_text"},
    }
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed=embed_config,
    )

In [9]:
# Use an explicitly configured host when HOST_NAME is set; otherwise look the
# host up from the index itself so a missing/stale environment variable does
# not pass None (or the wrong host) to pc.Index.
HOST_NAME = os.getenv("HOST_NAME") or pc.describe_index(index_name).host
index = pc.Index(host=HOST_NAME)

# Upsert the most recently scraped page into the "__default__" namespace.
# page_text / code_text hold whatever the last extract_pytorch_docs call
# returned (the futures page when the notebook is run top to bottom).
# "chunk_text" is the field the index was configured to embed.
index.upsert_records(
    "__default__",
    [
        {
            "_id": "futures_text",
            "chunk_text": page_text,
        },
        {
            "_id": "futures_code",
            "chunk_text": code_text,
        },
    ],
)

  from .autonotebook import tqdm as notebook_tqdm


# Next Steps.

- Insert an entire PyTorch documentation page into the db
- Look into better ways of scraping all of the PyTorch documentation
- Insert the entire PyTorch documentation into the db (lol)