## Install modules in src folder

In [None]:
# !pip install -e ../

## Ingest datasets to vector database

In [None]:
from ingest import Ingest

In [None]:
# Shared Ingest instance; the cells below call its chunking and FAISS-ingestion methods.
ingest = Ingest()

### Chunking Deadpool data
- Here we have two datasets: the subtitles for the movie Deadpool & Wolverine as a TXT file, and the script for the movie Deadpool as a PDF. We will parse both files and chunk them into text segments of at most `max_tokens` tokens.

In [None]:
# Upper bound on chunk size, passed as max_tokens to the chunking calls.
max_tokens = 256

# Raw Deadpool inputs: the 2016 movie script (PDF) and the
# Deadpool & Wolverine subtitles (TXT).
pdf_path = "../data/raw/deadpool/deadpool-2016.pdf"
txt_path = "../data/raw/deadpool/deadpool_wolverine_2024_subs.txt"

In [None]:
# Chunk the Deadpool script (PDF) and the Deadpool & Wolverine subtitles (TXT),
# passing the same max_tokens limit to both calls.
deadpool_chunks = ingest.pdf_to_chunks(pdf_path, max_tokens=max_tokens)
deadpool_wolverine_chunks = ingest.txt_to_chunks(txt_path, max_tokens=max_tokens)

In [None]:
# Chunk example: print the chunk at index 3 of the subtitle chunks as a quick sanity check.
print(deadpool_wolverine_chunks[3])

## Add chunks to index
- At this stage we would normally ingest the chunks into a vector database that can store vectors, text, and metadata together. For simplicity, we use a FAISS index here, plus a metadata file containing the text.
- The function below creates a FAISS index file and a `.npy` metadata file in the specified folder.
- In production, you can use Milvus, Chroma, Postgres, or any other supported database to store this information. See the LlamaIndex reference: https://docs.llamaindex.ai/en/stable/module_guides/storing/vector_stores/#vector-store-options--feature-support

In [None]:
# Add the Deadpool script chunks to the "deadpool" index stored under ../indexes.
# NOTE(review): with mode="append", re-running this cell presumably adds the
# same chunks a second time — confirm against the Ingest implementation.
ingest.ingest_chunks_to_faiss_with_metadata(
    chunks=deadpool_chunks,
    index_name="deadpool",
    folder="../indexes",
    mode="append",
)

In [None]:
# Add the Deadpool & Wolverine subtitle chunks to the same "deadpool" index
# (same index_name and folder as the script chunks).
ingest.ingest_chunks_to_faiss_with_metadata(
    chunks=deadpool_wolverine_chunks,
    index_name="deadpool",
    folder="../indexes",
    mode="append",
)

## Test querying the index:
- Now we will use the retriever function `query_index` to find similar documents in the collection. The query is embedded with the same embedding model that was used for the chunks. You can switch to another model in the `.env` file; this example used `text-embedding-3-large`.

In [None]:
from rag import RAG

In [None]:
# RAG instance; its query_index method is used below to query the index.
rag = RAG()

In [None]:
# Question to look up in the "deadpool" index.
query_text = "What does Deadpool think about superheroes?"

# Request top_k=3 results from the index files stored under ../indexes.
results = rag.query_index(
    query_text,
    index_folder="../indexes",
    index_name="deadpool",
    top_k=3,
)

# Each result is read through its 'index', 'distance' and 'content' keys.
for result in results:
    print(f"Match Index: {result['index']}")
    print(f"Distance: {result['distance']}")
    print(f"Content: {result['content']}")

### Chunking Shakespeare data

In [None]:
# Upper bound on chunk size, passed as max_tokens to html_to_chunks.
max_tokens = 256

# HTML sources for the four Shakespeare plays.
hamlet = "../data/raw/shakespeare/hamlet.html"
julius_caesar = "../data/raw/shakespeare/julius_caesar.html"
othello = "../data/raw/shakespeare/othello.html"
romeo_juliet = "../data/raw/shakespeare/romeo_juliet.html"

In [None]:
# Chunk each play's HTML file with the same max_tokens limit. The generator is
# consumed in order, so the calls run in the order the paths are listed.
hamlet_chunks, julius_caesar_chunks, othello_chunks, romeo_juliet_chunks = (
    ingest.html_to_chunks(play_path, max_tokens=max_tokens)
    for play_path in (hamlet, julius_caesar, othello, romeo_juliet)
)

In [None]:
# Print the chunk at index 1 of the Othello chunks to inspect the chunker output.
print(othello_chunks[1])

## Add chunks to index

In [None]:
# Append every play's chunks to the single "shakespeare" index under ../indexes,
# one play at a time in the listed order.
for play_chunks in (
    hamlet_chunks,
    julius_caesar_chunks,
    othello_chunks,
    romeo_juliet_chunks,
):
    ingest.ingest_chunks_to_faiss_with_metadata(
        chunks=play_chunks,
        index_name="shakespeare",
        folder="../indexes",
        mode="append",
    )
