![image.png](image.png)

![Llama RAG implementation.png](./Llama%20RAG%20implementation.png "Llama RAG implementation.png")

In [0]:
from langchain_core.messages import HumanMessage, SystemMessage
from databricks_langchain import ChatDatabricks


In [0]:
from dotenv import load_dotenv
# Read a local .env file into the process environment; the return value is deliberately discarded.
# Presumably this supplies the credentials the Databricks and search clients below rely on — confirm.
_ = load_dotenv()

In [0]:
# Two clients for the same Databricks serving endpoint, both at temperature 0.

# Free-text client used for answer generation; replies are capped at 256 tokens.
chat_model = ChatDatabricks(
    endpoint="otc-lama-poc",
    temperature=0,
    max_tokens=256,
)

# JSON client used by the router and the graders, whose replies are parsed with json.loads.
# `format="json"` is a ChatOllama option; ChatDatabricks does not declare a `format` field, so
# that keyword never asked the endpoint for JSON. Endpoint-specific request options go through
# `extra_params`, and JSON mode is requested with an OpenAI-style `response_format`.
# NOTE(review): confirm that the "otc-lama-poc" endpoint accepts `response_format`.
chat_mode_json = ChatDatabricks(
    endpoint="otc-lama-poc",
    temperature=0,
    extra_params={"response_format": {"type": "json_object"}},
)

In [0]:
# Minimal two-turn conversation used to smoke-test the chat endpoint.
greeting_message = SystemMessage(content="Hello, how can I help you?")
sample_question_message = HumanMessage(content="What is MOE?")
messages = [greeting_message, sample_question_message]

In [0]:
#Sample test
# Smoke test: send the sample conversation to the endpoint; the reply is shown as the cell output.
chat_model.invoke(messages)

In [0]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings

In [0]:
# Blog posts that are loaded, chunked and indexed into the vector store below.
AGENT_POST_URL = "https://lilianweng.github.io/posts/2023-06-23-agent/"
PROMPT_ENGINEERING_POST_URL = "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/"
ADVERSARIAL_ATTACK_POST_URL = "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"

urls = [AGENT_POST_URL, PROMPT_ENGINEERING_POST_URL, ADVERSARIAL_ATTACK_POST_URL]

In [0]:
#load documents
# `docs` holds one list of loaded documents per URL; `doc_list` is those lists flattened in URL order.
docs = [WebBaseLoader(page_url).load() for page_url in urls]
doc_list = []
for documents_for_url in docs:
    doc_list.extend(documents_for_url)

In [0]:
#split documents
# Recursive character splitter built through its tiktoken-encoder constructor: chunk size 1000
# with an overlap of 200 between neighbouring chunks (presumably counted in tokens — confirm).
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000,
    chunk_overlap=200
)

In [0]:
# Break every loaded document into overlapping chunks; these chunks are what gets embedded and indexed.
doc_splits = text_splitter.split_documents(doc_list)

In [0]:
# Embed the chunks with the Nomic model in local inference mode, then index them in a
# scikit-learn-backed vector store.
nomic_embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local")
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=nomic_embeddings,
)

In [0]:
#Create retriever
# The number of results has to be passed inside `search_kwargs`; a bare `k=3` keyword is not a
# retriever setting and does not reach the similarity search, so the store's default count
# would be used instead of 3.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [0]:
# Smoke test: query the retriever with a sample phrase; the returned chunks are the cell output.
retriever.invoke("agent memory")

In [0]:
#ROUTER
import json
from langchain_core.messages import HumanMessage, SystemMessage

#Prompt
# System prompt for the router: the model must answer with a JSON object whose single key,
# `datasource`, is either 'websearch' or 'vectorstore'.
router_instructions = '''You are an expert at routing a user question to a vectorstore or web search.
The vectorstore contains documents related to agents, prompt engineering, and adversarial attacks.
Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.
Return JSON format with single key, datasource, that is 'websearch' or 'vectorstore' depending on the question. 
No formatting or comments required. Pure json format.'''

#Router test
# Two questions expected to route to web search, followed by one expected to hit the vector store.
router_system_message = SystemMessage(content=router_instructions)
router_test_questions = (
    "Who is favored to win the NFC Championship game in the 2024 season?",
    "What are the models released today for llama3.2?",
    "What are the types of agent memory?",
)
test_websearch, test_websearch_2, test_vector_store = [
    chat_mode_json.invoke([router_system_message, HumanMessage(content=router_question)])
    for router_question in router_test_questions
]

# Parse all three replies first, then print them on a single line.
router_decisions = [
    json.loads(router_reply.content)
    for router_reply in (test_websearch, test_websearch_2, test_vector_store)
]
print(*router_decisions)



In [0]:
#Retrieval Grader

#Doc grader instructions
# System prompt: grade a retrieved document as relevant when it shares keywords or semantic
# meaning with the user question.
doc_grader_instructions ="""You are a grader assessing relevance of a retrieved document to a user question.
If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant."""

#Grade Prompt
# Human-message template with {document} and {question} placeholders, filled with str.format.
# It asks for a JSON object with the single key `binary_score` set to 'yes' or 'no'.
# NOTE(review): "This carefully and objectively assess" reads like a typo for "Think carefully and
# objectively assess"; the text is left as-is because editing it changes what the model receives.
doc_grader_prompt = """Here is the retrieved document: \n\n {document} \n\n Here is the user question: \n\n {question}. 
This carefully and objectively assess whether the document contains at least some information that is relevant to the question.
Return JSON with single key, binary_score, that is 'yes' or 'no' score to indicate whether the document contains at least some information that is relevant to the question.
No formatting or comments required. Pure json format."""

#test retrieval grader
question = "What is chain of thought prompting?"
docs = retriever.invoke(question)
# Only the second retrieved chunk is graded in this check.
doc_txt = docs[1].page_content
doc_grader_prompt_formated = doc_grader_prompt.format(question=question, document=doc_txt)
doc_grader_messages = [
    SystemMessage(content=doc_grader_instructions),
    HumanMessage(content=doc_grader_prompt_formated),
]
result = chat_mode_json.invoke(doc_grader_messages)
# Last expression: the parsed grade is displayed as the cell output.
json.loads(result.content)

In [0]:
#Generate
# Answer-generation template with {context} and {question} placeholders, filled with str.format.
# It restricts the model to the supplied context and to at most three sentences.
# NOTE(review): "this questions" is a typo in the prompt text; left as-is because editing it
# changes what the model receives.
rag_prompt = """You are an assistant for question-answering tasks. 

Here is the context to use to answer the question:

{context} 

Think carefully about the above context. 

Now, review the user question:

{question}

Provide an answer to this questions using only the above context. 

Use three sentences maximum and keep the answer concise.

Answer:"""

#Post processing
def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by one blank line.

    Args:
        docs: Iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string; empty when `docs` yields nothing.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

#Test
# Retrieve context for the current `question`, fill the RAG template and generate an answer.
docs = retriever.invoke(question)
docs_text = format_docs(docs)
rag_prompt_formated = rag_prompt.format(question=question, context=docs_text)
rag_messages = [HumanMessage(content=rag_prompt_formated)]
generation = chat_model.invoke(rag_messages)
print(generation.content)

In [0]:
### Hallucination Grader
#Hallucination grader instructions
# System prompt: a yes/no rubric for whether a STUDENT ANSWER is grounded in the given FACTS
# and free of information outside them.
hallucination_grader_instructions = """You are a teacher grading a quiz. 

You will be given FACTS and a STUDENT ANSWER. 

Here is the grade criteria to follow:

(1) Ensure the STUDENT ANSWER is grounded in the FACTS. 

(2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.

Score:

A score of yes means that the student's answer meets all of the criteria. This is the highest (best) score. 

A score of no means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""
# Human-message template with {documents} and {generation} placeholders, filled with str.format.
# It asks for a JSON object with two keys: `binary_score` ('yes' or 'no') and `explanation`.
hallucination_grader_prompt = '''FACTS: \n\n{documents} \n\n STUDENT ANSWER:{generation}
Return JSON with two keys. binary_score is 'yes' or 'no' score to indicate whether the STUDENT ANSWER is grounded in the FACTS. And a key, explanation, that contains an explanation of the score.
No formatting or comments required. Pure json format.
'''

#test using documents generated from above
# Grade the generated answer against the retrieved context it was produced from.
hallucination_grader_prompt_formatted = hallucination_grader_prompt.format(
    generation=generation.content,
    documents=docs_text,
)
hallucination_grader_messages = [
    SystemMessage(content=hallucination_grader_instructions),
    HumanMessage(content=hallucination_grader_prompt_formatted),
]
result = chat_mode_json.invoke(hallucination_grader_messages)
# Last expression: the parsed grade is displayed as the cell output.
json.loads(result.content)

In [0]:
##Answer Grader

#Answer Grader instructions
# System prompt: a yes/no rubric for whether a STUDENT ANSWER helps to answer the QUESTION;
# extra information beyond what was asked is explicitly allowed.
answer_grader_instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION and a STUDENT ANSWER. 

Here is the grade criteria to follow:

(1) The STUDENT ANSWER helps to answer the QUESTION

Score:

A score of yes means that the student's answer meets all of the criteria. This is the highest (best) score. 

The student can receive a score of yes if the answer contains extra information that is not explicitly asked for in the question.

A score of no means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

#Grader prompt
# Human-message template with {question} and {generation} placeholders, filled with str.format.
# It asks for a JSON object with two keys: `binary_score` ('yes' or 'no') and `explanation`.
# NOTE(review): "two two keys" is a typo in the prompt text; left as-is because editing it
# changes what the model receives.
answer_grader_prompt = ''' QUESTION: \n\n {question} \n\n STUDENT ANSWER: {generation}
Return JSON with two two keys, binary_score is 'yes' or 'no' score to indicate whether the STUDENT ANSWER meets the criteria. And a key, explanation, that contains an explanation of the score.
No formatting or comments required. Pure json format.
'''

# Test
# A fixed question/answer pair, so this check does not depend on the retriever or generator.
question = "What are the vision models released today as part of Llama 3.2?"
answer = "The Llama 3.2 models released today include two vision models: Llama 3.2 11B Vision Instruct and Llama 3.2 90B Vision Instruct, which are available on Azure AI Model Catalog via managed compute. These models are part of Meta's first foray into multimodal AI and rival closed models like Anthropic's Claude 3 Haiku and OpenAI's GPT-4o mini in visual reasoning. They replace the older text-only Llama 3.1 models."

# Fill the grader template with the pair above and ask the JSON client for a grade.
answer_grader_prompt_formatted = answer_grader_prompt.format(generation=answer, question=question)
answer_grader_messages = [
    SystemMessage(content=answer_grader_instructions),
    HumanMessage(content=answer_grader_prompt_formatted),
]
result = chat_mode_json.invoke(answer_grader_messages)
# Last expression: the parsed grade is displayed as the cell output.
json.loads(result.content)


In [0]:
from langchain_community.tools.tavily_search import TavilySearchResults
# Tavily web-search tool limited to three results per query. The cap is set with `max_results`;
# `k` is not an option of this tool, so `k=3` left the tool's default result count in place.
# NOTE(review): the variable name is misspelled ("webserch"); it is kept unchanged here because
# other cells may already refer to it.
webserch_tool = TavilySearchResults(max_results=3)