# Retrieval Augmented Generation Playground
> This notebook is a playground for vectorizing documents and using a Retrieval Augmented Generation (RAG) architecture to improve the responses and capabilities of an AI large language model (LLM) for a specific purpose. The goal is to let you run the cells to load and vectorize your own documents, and then test how settings such as chunking and retrieval parameters can improve the LLM's responses for your particular context and subject matter.

## Load your model and set its generation parameters

In [1]:
from amas_manager_tools import *
from datasets import load_dataset
import pandas as pd
from langchain.schema import Document
import joblib
from gradio_assistant_apps import *

# Load an LLM and set its generation parameters.

# Active model. Exactly one `model_path` assignment is live so it is always
# obvious which model gets loaded.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# local copy of the model.
model_path = "/home/gerald/shared_space/models/MODELS/meta_llama_Llama_3p2_1B_Instruct/"

# Alternative models -- uncomment ONE line to override the local copy above:
# model_path = "meta-llama/Llama-3.2-1B-Instruct"
# model_path = "meta-llama/Llama-3.2-3B-Instruct"
# model_path = "NousResearch/Hermes-2-Pro-Llama-3-8B"
# model_path = "nvidia/Nemotron-Mini-4B-Instruct"
# model_path = "nvidia/Mistral-NeMo-Minitron-8B-Instruct"
# model_path = "mistralai/Mistral-7B-Instruct-v0.3"

# Location of the Hugging Face token file. It is not passed to the assistant
# below (creds_json=None); presumably it is only needed with hf_login=True.
creds_json = "../../data/credentials/HF_Tokens.json"

amas_assistant = AMAS_RAG_Assistant(
    model_path=model_path,
    # To log in to the Hugging Face hub instead, swap the next line for:
    #   hf_login=True, creds_json=creds_json,
    hf_login=False, creds_json=None,
    max_tokens=4000, max_new_tokens=4000,
    verbose=True,
)

2025-06-23 16:51:04.168109: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


		--->Try count: 0
GRADIO_CACHE for gr.Image is: /home/gerald/gradio_cache
GRADIO_CACHE for gr.Image is: /home/gerald/gradio_cache
thing: 0
gpu int: 0
allocated_mem: 512
reserved_mem: 2097152
total_mem: 47697362944
free_mem: 47695265280
thing: 1
gpu int: 1
allocated_mem: 0
reserved_mem: 0
total_mem: 47697362944
free_mem: 47697362944




			Assigning model to GPU 1 with 44.42 GB free memory.




Attempting to load remote version of '/home/gerald/shared_space/models/MODELS/meta_llama_Llama_3p2_1B_Instruct/'


Device set to use cpu


Pipeline Callable Parameters: odict_keys(['text_inputs', 'kwargs'])
Pipeline Model Configuration: LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.0",
  "use_cache": true,
  "vocab_

## Load a document, vectorize and store it for later retrieval

In [2]:
# Use the assistant's knexus manager to turn the PDFs into documents, a vector
# store and an embeddings tool (the three values returned by the call below).

# PDFs to ingest. Note this is a list even though the name is singular.
pdf_file = [
    # "../../data/knowledge_docs/FIST_3-16_06-2020_Maintenace of Power circuit breakers BOR.pdf",
    "../../data/knowledge_docs/CUI_SPEC.pdf"
]

# Active embedding model. Exactly one assignment is live so it is always
# obvious which embedding model is used (later cells reuse this name).
# Local model root on this machine: /home/gerald/shared_space/models/MODELS/
embedding_model_name = "/home/gerald/shared_space/models/MODELS/EMBEDING_MODELS/thenlper_gte_base/"

# Alternative embedding models -- uncomment ONE line to override the choice above:
# embedding_model_name = "/home/gerald/shared_space/models/MODELS/EMBEDING_MODELS/thenlper_gte_large/"
# embedding_model_name = "../models/MODELS/EMBEDING_MODELS/sentence_transformers_multi_qa_mpnet_base_dot_v1/" # decent
# embedding_model_name = "../models/MODELS/EMBEDING_MODELS/sentence_transformers_all_MiniLM_L6_v2/" # decent

# NOTE(review): chunk_size=50000 is very large next to chunk_overlap=200. The
# unit (characters vs. tokens) is not visible from this notebook -- confirm it,
# and try smaller chunks when experimenting with retrieval quality.
documents, vector_store, embeddings_tool = amas_assistant.knexus_mngr.process_pdfs_to_vector_store_and_embeddings(
    pdf_paths=pdf_file,
    embedding_model_name=embedding_model_name,
    chunk_size=50000, chunk_overlap=200
)

loading embedding model: /home/gerald/shared_space/models/MODELS/EMBEDING_MODELS/thenlper_gte_base/
✅ Using previously saved wrapped model at: /home/gerald/shared_space/models/MODELS/EMBEDING_MODELS/thenlper_gte_base/


## Start a Gradio UI & test how the RAG architecture improves responses

## Add your assistant bot to the Gradio UI tools and create the app

In [None]:
# Wrap the assistant in the RAG workshop UI, reusing the embedding model that
# was selected when the documents were vectorized.
gradio_assistant_app = RAGWorkshopUI(
    assistant_bot=amas_assistant,
    embedding_model_name=embedding_model_name,
)

# Build the app's tabs; this must run before the app is launched in the next cell.
gradio_assistant_app.create_app_tabs()

## Start your app and start testing...

In [None]:
# Launch the workshop UI on port 7869.
# NOTE(review): if `share` is forwarded to Gradio's own launch(), share=True
# publishes a temporary public URL -- confirm that is intended before running
# this against sensitive documents.
gradio_assistant_app.launch_app(
    share=True,
    debug=False,
    server_port=7869,
    server_name=None,
    system_directive=None,
    save_context=False,
    use_system_role=False,
)