# PRE STEPS 

## Download model / Requirements

In [None]:
# Download model or OpenAI API, install dependencies
!pip install -r requirements

In [1]:
# Download Llama 2 from the Hugging Face Hub and keep a local copy
from transformers import AutoTokenizer, AutoModelForCausalLM

# Local directory in which the model and tokenizer files are stored
save_path = "Models/Llama-2-7b-hf"

# Hub identifier of the model to download
hf_model = 'meta-llama/Llama-2-7b-hf'

# Instantiating the model and tokenizer downloads the weights and configuration.
# `trust_remote_code` is deliberately not enabled: the Llama architecture is
# implemented inside transformers itself, so there is no reason to allow code
# from the model repository to run on this machine.
model = AutoModelForCausalLM.from_pretrained(hf_model, return_dict=True)
tokenizer = AutoTokenizer.from_pretrained(hf_model)

# Write both artefacts to the local directory chosen above
model.save_pretrained(save_path)
# Last expression of the cell: its return value is shown as the cell output
tokenizer.save_pretrained(save_path)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.16it/s]


('Models/Llama-2-7b-hf/tokenizer_config.json',
 'Models/Llama-2-7b-hf/special_tokens_map.json',
 'Models/Llama-2-7b-hf/tokenizer.model',
 'Models/Llama-2-7b-hf/added_tokens.json',
 'Models/Llama-2-7b-hf/tokenizer.json')

## Create vectors 

In [2]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [3]:
#%% Step 1: Load PDF

# Read the paper page by page, then merge all page texts into one string
pdf_path = "/home/ec2-user/mnt/Rag_demo/RAG/Data/Dynamic_Resource_Scheduler_for_Distributed_Deep_Learning_Training_in_Kubernetes.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load()

all_page_text = []
for page in pages:
    all_page_text.append(page.page_content)

# Pages are separated by a single space in the merged text
joined_page_text = " ".join(all_page_text)

In [4]:
# Cut the merged document text into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=1500,
)
splits = text_splitter.split_text(joined_page_text)

In [5]:
# Preview only the first chunk; displaying the whole list floods the notebook output
splits[:1]

['978-1-7281- 8038- 0/20/$31.00 ©2020 IEEE  \n Dynamic Resource Scheduler for Distributed Deep \nLearning Training in Kubernetes  \nMuhammad Fadhriga Bestari  \nSchool of Electrical Engineering and Informatics , ITB, \nIndonesia  \nfadhriga.bestari@gmail.com  \n Achmad Imam Kistijantoro1,2 \n1School of Electrical Engineering and Informatics , ITB, \nIndonesia  \n2University Center of Excellence on Artificial Intelligence \nfor Vision, Natural  Language Processing & Big Data \nAnalytics  (U-CoE AI-VLB), Indonesia  \nimam@ stei.itb.ac.id\n \nAnggrahita Bayu Sasmita  \nSchool of Electrical Engineering and Informatics , ITB, Indonesia  \nangga@stei.itb.ac.id  \n \n \nAbstract —Distributed deep learning is a method of machine \nlearning that is used today due to its many advantages. One of the \nmany tools used to train distributed deep learning model is Kubeflow, which runs on top of Kubernetes. Kubernetes is a \ncontainerized application orchestrator that ease the deploy ment \nprocess of

In [6]:
# Embed the chunks, index them in Chroma and persist the index to disk

persist_directory = 'basic_langchain/chroma_storage'
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Position-based ids keep this cell idempotent: the store is persisted on disk, so
# without explicit ids every re-run would add a second copy of each chunk under a
# fresh random id and the retriever would start returning duplicates. With fixed
# ids a re-run overwrites the existing entries instead.
# NOTE: entries written by earlier runs that did not pass ids are not removed;
# delete `persist_directory` once to start from a clean store.
chunk_ids = [f"chunk-{index}" for index in range(len(splits))]

vectordb = Chroma.from_texts(
    texts=splits,
    embedding=embedding,
    ids=chunk_ids,
    persist_directory=persist_directory
)

# Flush the collection to `persist_directory`
vectordb.persist()

# Re-open the persisted store from disk with the same embedding function
vectordb_loaded = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding)

  from .autonotebook import tqdm as notebook_tqdm


# USAGE : Run the Chain

In [7]:
# Import modules
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from transformers import pipeline,LlamaForCausalLM,LlamaTokenizer
from langchain.llms import HuggingFacePipeline


In [8]:
# Prompt for the RAG chain: it exposes a `context` slot for the retrieved chunks
# and a `question` slot for the user's query
custom_prompt_template = """Use the following pieces of information to answer the user's question. Explaining the answer
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else. Give an answer in 1000 characteres at maximum please
Helpful answer:
"""

prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=custom_prompt_template,
)


In [9]:
# Load the locally stored Llama 2 chat checkpoint and its tokenizer into memory
model_dir = "/home/ec2-user/mnt/Models/llama-2-7b-chat-hf"

# `ignore_mismatched_sizes` is intentionally not passed: on the model it would turn a
# checkpoint/architecture shape mismatch into freshly initialised weights instead of
# an error, hiding a broken or wrong checkpoint; on the tokenizer it has no meaning.
model = LlamaForCausalLM.from_pretrained(model_dir)
tokenizer = LlamaTokenizer.from_pretrained(model_dir)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.45it/s]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
# Text-generation pipeline around the model and tokenizer that are already in memory.
# Generation settings are passed as direct keyword arguments: `model_kwargs` is only
# forwarded to `from_pretrained` when the pipeline has to load the model itself, so a
# temperature placed there is ignored for an already-instantiated model.
# - `temperature` only applies when sampling, hence `do_sample=True`; at 0.01 the
#   output is close to greedy decoding.
# - no `max_length`: `max_new_tokens` already bounds the answer, and a total-length
#   cap of 1200 tokens would presumably be smaller than a prompt holding three
#   1500-character chunks.
# - no `trust_remote_code`: nothing is loaded from a repository here.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    repetition_penalty=1.1,
    do_sample=True,
    temperature=0.01,
)

# Wrap the transformers pipeline so LangChain can use it as an LLM
llm_pipeline = HuggingFacePipeline(pipeline=pipe)

# RAG pipeline (LLM + retrieval): the 3 best-matching chunks are "stuffed" into the prompt
rag_retrieval = RetrievalQA.from_chain_type(llm=llm_pipeline,
                                       chain_type='stuff',
                                       retriever=vectordb.as_retriever(search_kwargs={'k': 3}),
                                       chain_type_kwargs={'prompt':prompt}
                                       )


Let's now ask a model about "DRAGON". The paper proposes a new method called "DRAGON", which is exactly the kind of question where RAG helps. First, let's see what happens when we ask the foundation model about it without any retrieved context.

In [11]:
# Baseline prompt for the plain foundation model: it only has a `question` slot,
# so no retrieved context is supplied
fm_template = """Use the following pieces of information to answer the user's question. Explaining the answer
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question: {question}

Only return the helpful answer below and nothing else. Give an answer in 1000 characteres at maximum please
Helpful answer:
"""

user_question ='Tell me about DRAGON'
fm_prompt = PromptTemplate.from_template(fm_template)

# Feed the formatted prompt straight into the LLM, without a retrieval step
chain_fm = fm_prompt | llm_pipeline
fm_inputs = {"question": user_question}
chain_fm.invoke(fm_inputs)

'Dragon is a popular fantasy creature that has been depicted in various forms of media throughout history. In modern times, dragons are often portrayed as powerful, fire-breathing reptilian creatures with wings, scales, and claws. They are sometimes depicted as having magical powers or abilities, such as the ability to fly, breathe fire, or cast spells. Dragons are also often associated with hoarding treasure and are frequently featured in'

In [12]:
# Ask the same question through the RAG chain
rag_inputs = {"query": user_question}
response = rag_retrieval.invoke(rag_inputs)
response

{'query': 'Tell me about DRAGON',
 'result': "DRAGON is a scheduler designed by the authors of this paper to reduce resource utilization when system loading is high. Unlike Kubernetes' default scheduler, which uses a First In First Out (FIFO) algorithm, DRAGON uses an Adapted First Come First Served (AFCFS) algorithm. This means that DRAGON prioritizes jobs that come first, but doesn't guarantee that jobs will be scheduled in order. Additionally, D"}

# Using optimized models, e.g. llama.cpp

In [17]:
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp

# Callback manager holding a single stdout-streaming handler
stdout_handler = StreamingStdOutCallbackHandler()
callback_manager = CallbackManager([stdout_handler])

In [18]:
# Load the quantized GGUF checkpoint through llama.cpp.
# The callback manager is not attached here (it was commented out).
# NOTE(review): the original note says verbose mode is required when a callback
# manager is passed — confirm before enabling it.
llm = LlamaCpp(
    model_path="/home/ec2-user/mnt/Models/llama_cpp/llama-2-7b-chat.Q5_K_M.gguf",
    n_ctx=2048,
    max_tokens=100,
    temperature=0.75,
    top_p=1,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /home/ec2-user/mnt/Models/llama_cpp/llama-2-7b-chat.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.atte

llama_model_loader: - kv  12:                      tokenizer.ggml.tokens arr[str,32000]   = ["<unk>", "<s>", "</s>", "<0x00>", "<...
llama_model_loader: - kv  13:                      tokenizer.ggml.scores arr[f32,32000]   = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv  14:                  tokenizer.ggml.token_type arr[i32,32000]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv  15:                tokenizer.ggml.bos_token_id u32              = 1
llama_model_loader: - kv  16:                tokenizer.ggml.eos_token_id u32              = 2
llama_model_loader: - kv  17:            tokenizer.ggml.unknown_token_id u32              = 0
llama_model_loader: - kv  18:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:   65 tensors
llama_model_loader: - type q5_K:  193 tensors
llama_model_loader: - type q6_K:   33 tensors
llm_load_vocab: special tokens definition check successful ( 259/32000 ).
llm_load_prin

In [19]:
# RAG pipeline (llama.cpp LLM + retrieval): same prompt, 3 retrieved chunks per query
retriever = vectordb.as_retriever(search_kwargs={'k': 3})
rag_retrieval = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    chain_type_kwargs={'prompt': prompt},
)

In [20]:
# Ask the RAG chain backed by llama.cpp
rag_inputs = {"query": user_question}
response = rag_retrieval.invoke(rag_inputs)
response


llama_print_timings:        load time =     120.26 ms
llama_print_timings:      sample time =      19.70 ms /   100 runs   (    0.20 ms per token,  5076.92 tokens per second)
llama_print_timings: prompt eval time =   20354.25 ms /  1276 tokens (   15.95 ms per token,    62.69 tokens per second)
llama_print_timings:        eval time =    4354.45 ms /    99 runs   (   43.98 ms per token,    22.74 tokens per second)
llama_print_timings:       total time =   25078.74 ms /  1375 tokens


{'query': 'Tell me about DRAGON',
 'result': 'DRAGON prioritizes jobs that come first, but doesn’t guarantee that jobs will be scheduled in order. This is because DRAGON uses a Adapted First Come First Served (AFCFS) scheduling algorithm instead of Kubernetes’ default scheduler, which uses a regular First In First Out (FIFO) algorithm. AFCFS still queues jobs according to their arrival time, but prioritizes the first feasible job in the queue. This'}