In [1]:
import os
from dotenv import load_dotenv

def load_env_vars():
    load_dotenv()
    env_vars = {
        "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        "HUGGINGFACE_API_KEY": os.getenv("HUGGINGFACE_API_KEY"),
        "NOMIC_EMBEDDINGS_API_KEY": os.getenv("NOMIC_EMBEDDINGS_API_KEY"),
        "TAVILY_API_KEY": os.getenv("TAVILY_API_KEY"),
        "SQLITE_DB_PATH_1": os.getenv("SQLITE_DB_PATH_1"),
        "SQLITE_DB_PATH_2": os.getenv("SQLITE_DB_PATH_2")
    }
    # Check for missing keys
    missing_keys = [key for key, value in env_vars.items() if not value]
    if missing_keys:
        raise EnvironmentError(f"Missing API keys: {missing_keys}")
    return env_vars

if __name__ == "__main__":
    load_env_vars()


In [2]:
import torch

def check_gpu():
    if torch.cuda.is_available():
        return {
            "num_gpus": torch.cuda.device_count(),
            "current_gpu": torch.cuda.get_device_name(torch.cuda.current_device()),
            "gpus": [
                {"name": torch.cuda.get_device_name(i),
                 "memory_gb": torch.cuda.get_device_properties(i).total_memory / 1e9,
                 "compute_capability": torch.cuda.get_device_capability(i)}
                for i in range(torch.cuda.device_count())
            ]
        }
    else:
        return None

if __name__ == "__main__":
    gpu_info = check_gpu()
    if gpu_info:
        print(f"Using GPU: {gpu_info['current_gpu']}")
    else:
        print("No GPU available. Using CPU.")


Using GPU: NVIDIA GeForce RTX 4090


In [None]:
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings

def setup_llm_and_embeddings(env_vars):
    # Initialize the Ollama local model
    llm = ChatOllama(model="llama3.2", temperature=0)

    # Initialize Nomic embeddings
    embeddings_model = OllamaEmbeddings(model="llama3.2")
    
    return llm, embeddings_model

if __name__ == "__main__":
    # from environment_setup import load_env_vars
    env_vars = load_env_vars()
    print("env_vars =", env_vars)
    setup_llm_and_embeddings(env_vars)


In [4]:
import sqlite3

class SQLiteMetadataRetriever:
    def __init__(self, db_path):
        self.connection = sqlite3.connect(db_path)
        self.cursor = self.connection.cursor()

    def get_tables(self):
        query = "SELECT name FROM sqlite_master WHERE type='table';"
        self.cursor.execute(query)
        return [row[0] for row in self.cursor.fetchall()]

    def get_table_metadata(self, table_name):
        query = f"PRAGMA table_info('{table_name}');"
        self.cursor.execute(query)
        columns = [{'column_name': row[1], 'data_type': row[2], 'not_null': row[3], 'default_value': row[4], 'primary_key': row[5]} for row in self.cursor.fetchall()]

        query = f"PRAGMA foreign_key_list('{table_name}');"
        self.cursor.execute(query)
        foreign_keys = [{'id': row[0], 'referenced_table': row[2], 'from_column': row[3], 'to_column': row[4]} for row in self.cursor.fetchall()]

        return {'table_name': table_name, 'columns': columns, 'foreign_keys': foreign_keys}

    def close(self):
        self.cursor.close()
        self.connection.close()

if __name__ == "__main__":
    # from environment_setup import load_env_vars
    env_vars = load_env_vars()
    retriever = SQLiteMetadataRetriever(env_vars["SQLITE_DB_PATH_1"])
    print(retriever.get_tables())
    retriever.close()


['Users', 'Orders']


In [5]:
import nltk
nltk.download('punkt')
from langchain.schema import Document

def proposition_chunking(documents):
    chunked_docs = []
    for doc in documents:
        sentences = nltk.sent_tokenize(doc.page_content)
        for sentence in sentences:
            chunked_doc = Document(page_content=sentence, metadata=doc.metadata)
            chunked_docs.append(chunked_doc)
    return chunked_docs

if __name__ == "__main__":
    # Example with a single document
    document = Document(page_content="This is a test document. It has two sentences.", metadata={})
    chunked_docs = proposition_chunking([document])
    for doc in chunked_docs:
        print(doc.page_content)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/opt/conda/envs/data_science_ollama/nltk_data'
    - '/opt/conda/envs/data_science_ollama/share/nltk_data'
    - '/opt/conda/envs/data_science_ollama/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [12]:
from langchain.vectorstores import FAISS

def create_vector_store(documents, embeddings_model):
    vector_store = FAISS.from_documents(documents, embeddings_model)
    return vector_store

if __name__ == "__main__":
    # from environment_setup import load_env_vars
    # from llm_setup import setup_llm_and_embeddings
    # from chunking import proposition_chunking
    env_vars = load_env_vars()
    llm, embeddings_model = setup_llm_and_embeddings(env_vars)
    
    document = Document(page_content="This is a test document.", metadata={})
    chunked_docs = proposition_chunking([document])
    vector_store = create_vector_store(chunked_docs, embeddings_model)


In [13]:
def build_retriever(vector_store):
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
    return retriever

if __name__ == "__main__":
    # from vector_store import create_vector_store
    # from llm_setup import setup_llm_and_embeddings
    # from chunking import proposition_chunking
    env_vars = load_env_vars()
    llm, embeddings_model = setup_llm_and_embeddings(env_vars)
    
    document = Document(page_content="This is a test document.", metadata={})
    chunked_docs = proposition_chunking([document])
    vector_store = create_vector_store(chunked_docs, embeddings_model)
    retriever = build_retriever(vector_store)
    print(retriever.get_relevant_documents("test"))


  print(retriever.get_relevant_documents("test"))


[Document(metadata={}, page_content='This is a test document.')]


In [14]:
from langchain.chains import RetrievalQA

def build_rag_pipeline(retriever, llm):
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True  # This returns both the result and the source documents
    )
    return qa_chain

if __name__ == "__main__":
    env_vars = load_env_vars()
    llm, embeddings_model = setup_llm_and_embeddings(env_vars)

    # Create vector store
    document = Document(page_content="This is a test document.", metadata={})
    chunked_docs = proposition_chunking([document])
    vector_store = create_vector_store(chunked_docs, embeddings_model)

    # Build retriever and RAG pipeline
    retriever = build_retriever(vector_store)
    qa_chain = build_rag_pipeline(retriever, llm)
    
    # Use invoke instead of run to handle multiple outputs
    response = qa_chain.invoke({"query": "What is the test document about?"})
    
    # Access the result and source_documents
    result = response["result"]
    source_documents = response["source_documents"]

    print("Result:", result)
    print("Source Documents:", source_documents)


Result: I don't know what this test document is specifically about. It appears to be a placeholder or a test document used for evaluation purposes, but I don't have any additional information about its content or purpose.
Source Documents: [Document(metadata={}, page_content='This is a test document.')]


In [15]:
from langchain_community.retrievers import TavilySearchAPIRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Initialize the retriever with k=3 to retrieve 3 search results
retriever = TavilySearchAPIRetriever(k=3)

# Modify the prompt to restrict the LLM from making up information
prompt = ChatPromptTemplate.from_template(
    """Answer the question based only on the context provided. Do not make up additional information.

    Context: {context}

    Question: {question}"""
)

# Function to format retrieved documents with source URLs
from langchain_community.retrievers import TavilySearchAPIRetriever
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

def setup_tavily_search_retriever(k=3):
    # Initialize the Tavily retriever with specified search depth
    retriever = TavilySearchAPIRetriever(k=k)

    # Define the prompt template to avoid hallucination
    prompt = ChatPromptTemplate.from_template(
        """Answer the question based only on the context provided. Do not make up additional information.
        
        Context: {context}
        
        Question: {question}"""
    )
    return retriever, prompt


def format_docs(docs):
    for i, doc in enumerate(docs):
        print(f"DEBUG: Retrieved Document {i+1}:")
        print(f"DEBUG: {doc.page_content}\n")
        print(f"DEBUG: Source URL: {doc.metadata.get('source')}\n")  # Show source URL if available
    return "\n\n".join(f"Source {i+1} (URL: {doc.metadata.get('source')}):\n{doc.page_content}" for i, doc in enumerate(docs))

if __name__ == "__main__":
    retriever, prompt = setup_tavily_search_retriever(k=3)

In [16]:
# from tavily_search import setup_tavily_search_retriever, format_docs
from langchain_core.output_parsers import StrOutputParser
from langchain.chains import RetrievalQA
from langchain_core.output_parsers import StrOutputParser

def build_rag_pipeline(retriever, llm, tavily_retriever, tavily_prompt):
    # SQLite retrieval-based QA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )

    # Tavily Search chain
    tavily_chain = (
        {"context": tavily_retriever | format_docs, "question": RunnablePassthrough()}
        | tavily_prompt
        | llm
        | StrOutputParser()
    )

    return qa_chain, tavily_chain


if __name__ == "__main__":
    # from environment_setup import load_env_vars
    # from llm_setup import setup_llm_and_embeddings
    # from vector_store import create_vector_store
    # from chunking import proposition_chunking
    # from retriever import build_retriever
    from langchain.schema import Document

    # Load environment and set up LLM/embeddings
    env_vars = load_env_vars()
    llm, embeddings_model = setup_llm_and_embeddings(env_vars)

    # Create vector store from SQLite metadata
    document = Document(page_content="This is a test document.", metadata={})
    chunked_docs = proposition_chunking([document])
    vector_store = create_vector_store(chunked_docs, embeddings_model)

    # Set up SQLite retriever
    retriever = build_retriever(vector_store)

    # Set up Tavily search retriever and prompt
    tavily_retriever, tavily_prompt = setup_tavily_search_retriever()

    # Build the RAG pipeline with both retrievals
    qa_chain, tavily_chain = build_rag_pipeline(retriever, llm, tavily_retriever, tavily_prompt)

    # Run sample queries
    sqlite_response = qa_chain.invoke({"query": "What is the test document about?"})
    tavily_response = tavily_chain.invoke({"question": "What is the price of AirPods Max on Amazon?"})

    # Output results
    print("SQLite Response:", sqlite_response["result"])
    print("Tavily Response:", tavily_response)


HTTPError: 422 Client Error: Unprocessable Entity for url: https://api.tavily.com/search

In [65]:
def sample_query_1(qa_chain):
    question = "Which tables in the Oracle and SQLite databases have similar structures?"
    answer = qa_chain.run(question)
    print("Answer:")
    print(answer)


In [66]:
def sample_query_2(qa_chain):
    question = "Suggest which tables from both databases can be removed due to redundancy."
    answer = qa_chain.run(question)
    print("Answer:")
    print(answer)


In [67]:
if __name__ == "__main__":
    # Load environment variables and initialize components
    env_vars = load_env_vars()
    llm, embeddings_model = setup_llm_and_embeddings(env_vars)

    # Create vector store from SQLite metadata
    document = Document(page_content="This is a test document.", metadata={})
    chunked_docs = proposition_chunking([document])
    vector_store = create_vector_store(chunked_docs, embeddings_model)

    # Set up SQLite retriever
    retriever = build_retriever(vector_store)

    # Set up Tavily search retriever and prompt
    tavily_retriever, tavily_prompt = setup_tavily_search_retriever()

    # Build combined RAG pipeline with SQLite and Tavily retrievals
    qa_chain, tavily_chain = build_rag_pipeline(retriever, llm, tavily_retriever, tavily_prompt)

    # Run sample queries
    sqlite_response = qa_chain.invoke({"query": "What is the test document about?"})
    tavily_response = tavily_chain.invoke({"question": "What is the price of AirPods Max on Amazon?"})

    # Output results
    print("SQLite Response:", sqlite_response["result"])
    print("Tavily Response:", tavily_response)


HTTPError: 422 Client Error: Unprocessable Entity for url: https://api.tavily.com/search