# PRE STEPS 

## Download model / Requirements

In [None]:
# Download model or OpenAI API, install dependencies
!pip install -r requirements.txt

In [None]:
# Download Llama 2 from the Hugging Face Hub and keep a local copy of it
import os

from transformers import AutoTokenizer, AutoModelForCausalLM

# Local directory in which the model and tokenizer are stored
save_path = "Models/Llama-2-7b-hf"

# Hub repository to download
hf_model = 'meta-llama/Llama-2-7b-hf'

# Take the access token from the HF_TOKEN environment variable so that it is
# not committed with the notebook; the placeholder is only a fallback.
access_token = os.environ.get("HF_TOKEN", 'your_token')

# Instantiate the model and tokenizer (this downloads weights/architecture/parameters).
# Both downloads go to the same repository, so both receive the token.
# trust_remote_code is not set: Llama is loaded through the Auto classes of
# transformers itself and needs no code from the repository.
model = AutoModelForCausalLM.from_pretrained(hf_model, return_dict=True, token=access_token)
tokenizer = AutoTokenizer.from_pretrained(hf_model, token=access_token)

# Write both to the local directory chosen above
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

## Create vectors 

In [None]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
#%% Step 1: Load PDF
# Load the paper, collect the text of every loaded document, then merge the
# texts into a single space-separated string.
pdf_path = "/home/ec2-user/mnt/Rag_demo/RAG/Data/Dynamic_Resource_Scheduler_for_Distributed_Deep_Learning_Training_in_Kubernetes.pdf"

loader = PyPDFLoader(pdf_path)
pages = loader.load()
all_page_text = [page.page_content for page in pages]
joined_page_text = " ".join(all_page_text)

In [None]:
# Cut the joined text into chunks of up to 1500 characters with a 150-character overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
)
splits = text_splitter.split_text(joined_page_text)

In [None]:
# Preview only the first few chunks; echoing the whole list floods the cell output
print(f"{len(splits)} chunks")
splits[:3]

In [None]:
# Embed the chunks and store them in a persistent Chroma collection
import hashlib

persist_directory = 'basic_langchain/chroma_storage'
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Deterministic id per chunk: position + content hash (the position keeps ids
# unique even when two chunks have identical text). Without explicit ids each
# run of this cell stores the chunks under new ids, so re-running the cell
# appends another copy of every chunk to the persisted collection.
# NOTE(review): relies on the LangChain Chroma wrapper overwriting entries whose
# id already exists — confirm for the installed version.
chunk_ids = [
    f"{index}-{hashlib.sha256(text.encode('utf-8')).hexdigest()}"
    for index, text in enumerate(splits)
]

vectordb = Chroma.from_texts(
    texts=splits,
    embedding=embedding,
    ids=chunk_ids,
    persist_directory=persist_directory
)

vectordb.persist()

# Re-open the collection from the persist directory with the same embedding function
vectordb_loaded = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding)

# Usage: Run the Chain

In [None]:
# Import modules
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from transformers import pipeline,LlamaForCausalLM,LlamaTokenizer
from langchain.llms import HuggingFacePipeline


In [None]:
# Prompt for the RAG chain: instructions, the retrieved context, then the question.
# {context} and {question} are the two placeholders filled in at run time.
custom_prompt_template = """Use the following pieces of information to answer the user's question. Explaining the answer
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else. Give an answer in 1000 characteres at maximum please
Helpful answer:
"""

prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=custom_prompt_template,
)


In [None]:
# Load the chat model and its tokenizer into memory from the local directory
model_dir = "/home/ec2-user/mnt/Models/llama-2-7b-chat-hf"  # downloaded from Hugging Face
# NOTE(review): ignore_mismatched_sizes=True suppresses the error raised when
# checkpoint weights do not match the model's shapes — confirm it is really needed.
model = LlamaForCausalLM.from_pretrained(model_dir, ignore_mismatched_sizes=True)
# ignore_mismatched_sizes is a model-weight loading option; it does not apply to a tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_dir)

In [None]:
# Text-generation pipeline around the model and tokenizer that are already loaded.
# Generation settings are passed as direct keyword arguments so that they reach
# generation: `model_kwargs` is only used when pipeline() loads the model itself,
# so settings placed there have no effect on an already-instantiated model.
# - do_sample=True is required for `temperature` to have any effect.
# - max_length is dropped: max_new_tokens already bounds the generated answer.
# - trust_remote_code is dropped: no model is loaded from a repository here.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    repetition_penalty=1.1,
    do_sample=True,
    temperature=0.01,
)

# LangChain wrapper around the transformers pipeline
llm_pipeline = HuggingFacePipeline(pipeline=pipe)

# RAG pipeline (LLM + retrieval): the 3 best-matching chunks are inserted
# into the prompt ('stuff' chain type) together with the question.
rag_retrieval = RetrievalQA.from_chain_type(
    llm=llm_pipeline,
    chain_type='stuff',
    retriever=vectordb.as_retriever(search_kwargs={'k': 3}),
    chain_type_kwargs={'prompt': prompt},
)


Let's now ask a model about "DRAGON". The paper proposes a new method called "DRAGON". This is exactly where RAG helps: let's first see what happens when we ask a foundation model about it.

In [None]:
# Prompt for the model on its own: same instructions, but no retrieved context
fm_template = """Use the following pieces of information to answer the user's question. Explaining the answer
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question: {question}

Only return the helpful answer below and nothing else. Give an answer in 1000 characteres at maximum please
Helpful answer:
"""

user_question = 'Tell me about DRAGON'
fm_prompt = PromptTemplate.from_template(fm_template)

# Pipe the prompt into the LLM and run the chain on the question
chain_fm = fm_prompt | llm_pipeline
chain_fm.invoke({"question": user_question})

In [None]:
# Ask the same question through the RAG chain
rag_inputs = {"query": user_question}
response = rag_retrieval.invoke(rag_inputs)
response

# Using optimized models: llama.cpp

In [None]:
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp

# Callback manager holding a single stdout streaming handler
stdout_handler = StreamingStdOutCallbackHandler()
callback_manager = CallbackManager([stdout_handler])

In [None]:
# Settings for the GGUF Llama 2 chat model run through llama.cpp
llama_cpp_settings = {
    "model_path": "/home/ec2-user/mnt/Models/llama_cpp/llama-2-7b-chat.Q5_K_M.gguf",  # CHANGE IT TO YOUR FOLDER
    "temperature": 0.75,
    "max_tokens": 100,
    "top_p": 1,
    "n_ctx": 2048,
}

# callback_manager is not passed to the model here
llm = LlamaCpp(**llama_cpp_settings)

In [None]:
# RAG pipeline (llama.cpp LLM + retrieval): fetch the 3 best-matching chunks
# and insert them into the prompt ('stuff' chain type).
retriever = vectordb.as_retriever(search_kwargs={'k': 3})
rag_retrieval = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    chain_type_kwargs={'prompt': prompt},
)

In [None]:
# Ask the question through the llama.cpp-backed RAG chain
rag_inputs = {"query": user_question}
response = rag_retrieval.invoke(rag_inputs)
response