In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader 
from pathlib import PosixPath
from typing import Union
from dotenv import load_dotenv
from pathlib import Path
import os
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.llms.anthropic import Anthropic
from dataclasses import dataclass
import logging
import sys
# from rich import print

### Extensions for the future:
1. Use Ollama to serve models instead of making API calls.
    -> The docs also mention Hugging Face.
2. Compare semantic matching (using the LLM itself) with keyword matching.

In [None]:
## To monitor the API calls and responses within llama-index, uncomment this section.
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [None]:
# Load environment variables from two dotenv files. Both paths are relative, so
# they are resolved against the notebook's current working directory.
# NOTE(review): presumably these files provide the API keys and ROOT_DIR that
# later cells rely on -- confirm.
load_dotenv("../../project_secrets.env")
load_dotenv("../../../ai_sdlc_secrets.env")

In [None]:
# Directory whose documents get indexed, taken from the ROOT_DIR environment variable.
_root_dir_env = os.getenv("ROOT_DIR")
if _root_dir_env is None:
    # os.getenv returns None for an unset variable and Path(None) would fail with an
    # opaque TypeError, so stop here with a message that names the missing setting.
    raise RuntimeError(
        "ROOT_DIR environment variable is not set; define it in the environment "
        "or in one of the .env files loaded by this notebook."
    )
root_dir = Path(_root_dir_env)

In [None]:
class SimplestRAG:
    """
    Smallest possible retrieval setup: load the documents found in a directory,
    build a vector store index over them, and expose a query engine on that index.
    """

    def __init__(self, data_dir: Union[str, Path]):
        """
        Args:
            data_dir: Directory containing the documents to index.
        """
        self.data_dir = data_dir
        # Read the directory's files into document objects -- similar to pd.read_csv().
        self.documents = SimpleDirectoryReader(input_dir=self.data_dir).load_data()
        # Uses open-ai-embeddings so fails without the API key.
        self.index = VectorStoreIndex.from_documents(self.documents)
        # The three most similar chunks are retrieved per query.
        # NOTE(review): llm=None looks intended to skip answer synthesis, but
        # llama-index may fall back to its globally configured LLM -- confirm.
        self.query_engine = self.index.as_query_engine(llm=None, similarity_top_k=3)

    def query(self, query: str):
        """
        Run ``query`` through the query engine.

        Returns:
            Whatever the query engine returns, unchanged. This is a response
            object rather than a plain string.
        """
        return self.query_engine.query(query)
    

In [None]:
@dataclass
class GeminiConfig:
    """Settings that RAG_Pipeline forwards to the GoogleGenAI LLM constructor."""

    # Passed as `model=`; note the field is called `model_name` here, unlike ClaudeConfig.model.
    model_name: str = "gemini-2.0-flash"
    # Passed as `temperature=`.
    temperature: float = 0.7
    # Passed as `max_tokens=`.
    max_tokens: int = 512

@dataclass
class ClaudeConfig:
    """Settings that RAG_Pipeline forwards to the Anthropic LLM constructor."""

    # Passed as `model=`; note the field is called `model` here, unlike GeminiConfig.model_name.
    model: str = "claude-3-7-sonnet-latest"
    # Passed as `temperature=`.
    temperature: float = 0.1
    # Passed as `max_tokens=`.
    max_tokens: int = 512

class RAG_Pipeline:
    """
    A simple RAG pipeline that uses the LlamaIndex library to create a vector store
    index from the documents in a directory and answers queries against that index
    using the configured LLM.

    Supported ``llm_provider`` values are ``"GoogleGenAI"`` (configured with a
    ``GeminiConfig``) and ``"Claude"`` (configured with a ``ClaudeConfig``).
    """

    # Accepted provider names. Plain strings, so the provider can be validated
    # before any config object is created or any document is read.
    _SUPPORTED_PROVIDERS = ("GoogleGenAI", "Claude")

    def __init__(
        self,
        data_dir: Union[str, Path],
        llm_provider: str = "GoogleGenAI",
        llm_config: "Union[GeminiConfig, ClaudeConfig, None]" = None,
    ):
        """
        Args:
            data_dir: Directory containing the documents to index.
            llm_provider: ``"GoogleGenAI"`` or ``"Claude"``.
            llm_config: Settings for the chosen provider. When omitted, a fresh
                ``GeminiConfig()`` or ``ClaudeConfig()`` matching ``llm_provider``
                is created for this instance.

        Raises:
            ValueError: ``llm_provider`` is not a supported provider name.
            AttributeError: ``llm_config`` lacks an attribute the chosen provider
                reads (e.g. a ``GeminiConfig`` passed together with ``"Claude"``).
        """
        # Validate up front: loading and embedding the documents is the expensive
        # step and must not run for a request that can never succeed.
        if llm_provider not in self._SUPPORTED_PROVIDERS:
            supported = ", ".join(f"`{name}`" for name in self._SUPPORTED_PROVIDERS)
            raise ValueError(f"Invalid LLM provided: {llm_provider}. Supported LLMs are: {supported}")

        if llm_config is None:
            # Build the default per instance (never as a shared default argument)
            # and pick the config type that matches the provider.
            llm_config = GeminiConfig() if llm_provider == "GoogleGenAI" else ClaudeConfig()
        self.llm_cfg = llm_config

        # Construct the LLM before indexing so a mismatched config fails fast.
        if llm_provider == "GoogleGenAI":
            self.llm = GoogleGenAI(
                model=self.llm_cfg.model_name,
                temperature=self.llm_cfg.temperature,
                max_tokens=self.llm_cfg.max_tokens,
            )
        else:  # "Claude" -- the only other supported provider
            self.llm = Anthropic(
                model=self.llm_cfg.model,
                temperature=self.llm_cfg.temperature,
                max_tokens=self.llm_cfg.max_tokens,
            )

        self.data_dir = data_dir
        # Read the directory's files into document objects -- similar to pd.read_csv().
        self.documents = SimpleDirectoryReader(input_dir=self.data_dir).load_data()
        # Uses open-ai-embeddings so fails without the API key.
        self.index = VectorStoreIndex.from_documents(self.documents)
        self.query_engine = self.index.as_query_engine(llm=self.llm)

    def query(self, query: str):
        """
        Run ``query`` through the query engine.

        Returns:
            Whatever the query engine returns, unchanged. This is a response
            object rather than a plain string; the notebook reads its
            ``response`` attribute to print the answer.
        """
        return self.query_engine.query(query)
    

In [None]:
# Build the pipeline over the documents in root_dir, answering with Gemini and
# the default GeminiConfig settings.
rag_pipeline = RAG_Pipeline(
    data_dir=root_dir,
    llm_provider="GoogleGenAI",
    llm_config=GeminiConfig(),
)

# rag_pipeline = RAG_Pipeline(
#     data_dir=root_dir, 
#     llm_provider="Claude", 
#     llm_config=ClaudeConfig())

In [None]:
# Ask the pipeline for a JSON summary of the indexed profile; the bare `response`
# on the last line makes the notebook display the returned object.
response = rag_pipeline.query(
    "This is a LinkedIn profile. Give me the name, position, job history, and location of the individual as json"
)
response

In [None]:
# Print the `response` attribute of the query result (presumably the answer text -- confirm).
print(response.response)