In [None]:
import gc
import os
from uuid import uuid4

import bs4
import chromadb
import langchain
import langchain_community
import langchain_huggingface
import langchain_text_splitters
import torch
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_community.document_loaders import WebBaseLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:

# Chat model used for generation, built through LangChain's `init_chat_model`
# factory with the Hugging Face provider.
# NOTE(review): `load_dotenv` is imported but never called in the visible
# cells -- if a Hugging Face token is expected to come from a .env file, it
# has to be loaded before this cell runs. Confirm.
model = init_chat_model(
    "microsoft/Phi-3-mini-4k-instruct",
    model_provider="huggingface",
    temperature=0.7,  # sampling temperature passed through to the model
    max_tokens=1024,  # cap on generated tokens per response
)

In [None]:
# Sentence-transformers embedding model wrapped for LangChain.
# NOTE(review): nothing in the visible cells uses `embeddings`; the upsert
# below sends raw text only, so the vectors presumably come from the Chroma
# collection's own embedding function rather than this model -- confirm that
# is intended.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [None]:
# HTTP client for a Chroma server at localhost:8000.
# Assumes the server is already running and reachable -- nothing in this
# notebook starts it.
client = chromadb.HttpClient(host="localhost", port=8000)

In [None]:
# Data Collection
# Fetch the blog post and keep only its title, headers, and body. The
# SoupStrainer is handed to BeautifulSoup as `parse_only`, so only elements
# carrying one of these CSS classes are parsed into the document.
# `bs4` must be imported in the imports cell for this cell to run.
bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()

# One URL was requested, so exactly one Document is expected back.
assert len(docs) == 1, f"expected exactly 1 document, got {len(docs)}"
# If the page markup changes and none of the strainer classes match, the
# loader still succeeds but yields empty text; catch that here instead of
# failing later when an empty batch is sent to the vector store.
assert docs[0].page_content.strip(), "loaded document is empty; check the SoupStrainer classes"
print(f"Total characters: {len(docs[0].page_content)}")

In [None]:
# Preview the first 500 characters of the loaded post as a sanity check.
print(docs[0].page_content[:500])

In [None]:
# Splitting Documents for Vectorstore
# Break the single loaded post into overlapping chunks. `start_index` is
# recorded on every chunk because it is used further down, together with the
# `source` metadata, to build the chunk's id in the vector store.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # maximum chunk length, in characters
    chunk_overlap=200,  # characters shared between consecutive chunks
    add_start_index=True,  # store each chunk's offset in the original text as metadata["start_index"]
)
all_splits = text_splitter.split_documents(docs)

print(f"Split blog post into {len(all_splits)} sub-documents.")


In [None]:
# Inspect the text of the first chunk.
all_splits[0].page_content

In [None]:
# Inspect the first chunk's `start_index` metadata (present because the
# splitter was created with add_start_index=True).
all_splits[0].metadata['start_index']

In [None]:
# Inspect the first chunk's `source` metadata (presumably the URL the loader
# fetched -- confirm against the output).
all_splits[0].metadata['source']

In [None]:
# rag_ingest.py
def chunk_id(file_path: str, chunk_index: int) -> str:
    """Build the identifier string for one chunk of a document.

    Args:
        file_path: Identifier of the document the chunk came from.
        chunk_index: Number distinguishing this chunk within that document.

    Returns:
        The two values joined as ``<file_path>::chunk::<chunk_index>``.
    """
    template = "{}::chunk::{}"
    return template.format(file_path, chunk_index)

In [None]:
# Open the target collection, creating it on first use. `get_collection`
# raises when the collection does not exist yet, and no cell in this notebook
# creates it, so a run against a fresh Chroma server would stop here.
# An already existing collection is returned unchanged.
example_collection = client.get_or_create_collection(name="example_collection")

In [None]:
# Creating Document ids
# One id per chunk, derived from the chunk's `source` and `start_index`
# metadata. Note that the character offset (`start_index`) is what gets
# passed as `chunk_index`, not the chunk's ordinal position.
ids = [
    chunk_id(split.metadata["source"], split.metadata["start_index"])
    for split in all_splits
]

In [None]:
# Pull the raw text out of every chunk; the upsert below is given these plain
# strings rather than Document objects.
page_content = [split.page_content for split in all_splits]

In [None]:
# Storing Documents into Vectorstore
# Write the chunks keyed by `ids`. Only ids and raw text are sent; no
# embeddings or metadata are passed.
# NOTE(review): chunk metadata (source, start_index) is not stored -- pass
# `metadatas=` here if retrieval needs source attribution.
example_collection.upsert(ids=ids, documents=page_content)

# Collection.upsert() returns None, so binding its result left `document_ids`
# as None. Bind it to a copy of the ids that were written instead.
document_ids = list(ids)



In [None]:
# Read the freshly written records back as a sanity check.
# The result is bound to its own name: rebinding `ids` to the get() result
# made this cell (and the upsert cell) fail on re-run, because the next call
# would receive the result dict instead of the list of id strings.
stored_chunks = example_collection.get(ids=ids)
print(f"Retrieved {len(stored_chunks['ids'])} of {len(ids)} chunks from the collection.")