In [1]:
# Initialize and check health
from client import R2RBackend

# Create the backend client. The same `client` object is reused by the
# ingestion cells further down (ingest_files / ingest_chunks).
client = R2RBackend()
# Call the client's health() method and print whatever it returns, as a quick
# check before any ingestion is attempted.
response = client.health()
print(f'Health check response: {response}')

Health check response: ok


In [None]:
# File ingestion
from pathlib import Path
from utility.file_utils import iterate_over_files

# Relative path: resolved against the notebook's current working directory.
folder_path = Path('data')
# Materialise everything iterate_over_files yields into a list before handing it over.
# NOTE(review): presumably this walks the folder recursively — confirm in utility.file_utils.
files = list(iterate_over_files(folder_path))
client.ingest_files(files) # Returns nothing; check the logs to see which files were not ingested.

In [None]:
# Chunks ingestion
# Web pages to scrape, split into chunks and ingest. The ingestion loop later
# matches each URL against the `source` entry of the chunk metadata, so these
# strings need to equal what ends up recorded there.
urls = [
    "https://docs.streamlit.io/deploy/tutorials/docker#create-a-dockerfile", 
    "https://medium.com/@ypredofficial/rag-based-conversational-chatbot-using-streamlit-364c4c02c2f1", 
    "https://blog.gopenai.com/building-a-multi-pdf-rag-chatbot-langchain-streamlit-with-code-d21d0a1cf9e5",
    "https://medium.com/@pritubera/building-a-streamlit-chatbot-for-technical-questions-with-llamaindex-and-openai-26808c841dab",
    "https://r2r-docs.sciphi.ai/cookbooks/ingestion",
    "https://docs.streamlit.io/get-started/fundamentals/advanced-concepts",
    "https://de.wikipedia.org/wiki/Deutschland"
]

from utility.scraper import Scraper
from utility.splitter import Splitter

# Project helpers, both constructed with their default settings.
scraper = Scraper()
splitter = Splitter()

# Fetch the pages listed in `urls`.
# NOTE(review): presumably returns one document per reachable URL — confirm in utility.scraper.
documents = scraper.fetch_documents(urls)
# Split the fetched documents into chunks. Downstream code treats a None
# result as "no chunks", and reads `page_content` and `metadata['source']`
# from every chunk.
split_documents = splitter.split_documents(documents)

# Ingest the scraped chunks one source URL at a time, so that every batch is
# sent together with the metadata of the page it came from.
if not split_documents:
    # Nothing to ingest (None or empty result). Deliberately no exit() here:
    # inside a notebook exit() asks the kernel to shut down, which would throw
    # away `client` and every other variable instead of merely ending the cell.
    print("No chunks found for any URL!")
else:
    for url in urls:
        # Collect the chunks whose recorded source is this URL.
        chunks = [split_doc for split_doc in split_documents if split_doc.metadata['source'] == url]
        if not chunks:
            print(f"No chunks found for [{url}]!")
            continue

        # All chunks of one URL are sent with the first chunk's metadata.
        metadata = chunks[0].metadata
        chunks_text = [{"text": chunk.page_content} for chunk in chunks]
        try:
            resp = client.ingest_chunks(chunks_text, metadata)
        except Exception as e:
            # Best effort: report the failure instead of hiding it, then carry
            # on with the remaining URLs.
            print(f"Failed to ingest chunks for [{url}]: {e}")
        else:
            print(resp)