# Initialize/Download model

In [10]:
# Download model or OpenAI API, install dependencies
!pip install -r requirements

Collecting peft
  Using cached peft-0.8.2-py3-none-any.whl.metadata (25 kB)
Using cached peft-0.8.2-py3-none-any.whl (183 kB)
Installing collected packages: peft
Successfully installed peft-0.8.2


In [None]:
# Download Llama from the Hugging Face Hub and keep a local copy of it
from transformers import AutoTokenizer, AutoModelForCausalLM

# Local directory the model and tokenizer are saved into
save_path = "Models/Llama-2-7b-hf"

# Hub repository id of the model to download
hf_model = 'meta-llama/Llama-2-7b-hf'

# Instantiate the model and tokenizer (this downloads weights/architecture/parameters).
# trust_remote_code is deliberately not enabled: Llama is implemented inside
# transformers itself, so no code from the model repository has to be executed.
model = AutoModelForCausalLM.from_pretrained(hf_model, return_dict=True)
tokenizer = AutoTokenizer.from_pretrained(hf_model)

# Save the model and the tokenizer in the local directory specified earlier
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# Create vectors 

In [78]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [79]:
#%% Step 1: Load PDF
# Read the paper page by page, then merge all page texts into a single string.
loader = PyPDFLoader(
    "/home/ec2-user/mnt/Rag_demo/RAG/Data/Dynamic_Resource_Scheduler_for_Distributed_Deep_Learning_Training_in_Kubernetes.pdf"
)
pages = loader.load()

# One text entry per page, joined with a single space between pages
all_page_text = [page.page_content for page in pages]
joined_page_text = " ".join(all_page_text)

In [81]:
# Cut the merged text into chunks of up to 1500 characters,
# with 150 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
)
splits = text_splitter.split_text(joined_page_text)

In [82]:
# Preview only the first few chunks; displaying the whole list bloats the saved notebook.
splits[:3]

['978-1-7281- 8038- 0/20/$31.00 ©2020 IEEE  \n Dynamic Resource Scheduler for Distributed Deep \nLearning Training in Kubernetes  \nMuhammad Fadhriga Bestari  \nSchool of Electrical Engineering and Informatics , ITB, \nIndonesia  \nfadhriga.bestari@gmail.com  \n Achmad Imam Kistijantoro1,2 \n1School of Electrical Engineering and Informatics , ITB, \nIndonesia  \n2University Center of Excellence on Artificial Intelligence \nfor Vision, Natural  Language Processing & Big Data \nAnalytics  (U-CoE AI-VLB), Indonesia  \nimam@ stei.itb.ac.id\n \nAnggrahita Bayu Sasmita  \nSchool of Electrical Engineering and Informatics , ITB, Indonesia  \nangga@stei.itb.ac.id  \n \n \nAbstract —Distributed deep learning is a method of machine \nlearning that is used today due to its many advantages. One of the \nmany tools used to train distributed deep learning model is Kubeflow, which runs on top of Kubernetes. Kubernetes is a \ncontainerized application orchestrator that ease the deploy ment \nprocess of

In [83]:
# Embed the chunks, index them in Chroma and persist the index
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
persist_directory = 'basic_langchain/chroma_storage'

# Build the vector store from the text chunks, backed by persist_directory
vectordb = Chroma.from_texts(texts=splits, embedding=embedding, persist_directory=persist_directory)
vectordb.persist()

# Open a second handle on the same persisted directory with the same embedding function
vectordb_loaded = Chroma(embedding_function=embedding, persist_directory=persist_directory)

In [74]:
# RAG pipeline (LLM + retrieval algorithm): the 3 closest chunks from the vector
# store are stuffed into the prompt and the source documents are returned too.
# NOTE(review): RetrievalQA, llm_pipeline and prompt are only defined in cells
# further down this notebook, so this cell fails on a fresh top-to-bottom run.
rag_retrieval = RetrievalQA.from_chain_type(
    llm=llm_pipeline,
    chain_type='stuff',
    retriever=vectordb.as_retriever(search_kwargs={'k': 3}),
    return_source_documents=True,
    chain_type_kwargs={'prompt': prompt},
)

In [75]:
# Send one question through the RAG chain and display only the generated answer
question = "What is dragon?"
result = rag_retrieval.invoke(dict(query=question))
result["result"]

'Dragon is a resource scheduler that schedules distributed jobs using gang scheduling and autoscaling, improving upon the scheduling capabilities of Kubernetes.'

# Run the Chain

In [84]:
# Import modules
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from transformers import pipeline,LlamaForCausalLM,LlamaTokenizer

In [85]:
# Prompt template for the QA chain; it is filled with the retrieved
# context ({context}) and the user's question ({question}).
custom_prompt_template = """Use the following pieces of information to answer the user's question. Explaining the answer
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else. Give an answer in 1000 characteres at maximum please
Helpful answer:
"""

prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=custom_prompt_template,
)


In [86]:
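# Load model (disabled): the Intel Extension for Transformers loader below is kept commented out;
# its recorded output reports a failed low-bit model load (undefined symbol in libqbits.so).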

#from intel_extension_for_transformers.transformers import AutoModelForCausalLM
##from transformers import AutoTokenizer
#model_dir = "/home/ec2-user/mnt/Models/Llama-7b-hf-OPTIM"        
#model = AutoModelForCausalLM.from_pretrained(model_dir,use_neural_speed=False)
#tokenizer = AutoTokenizer.from_pretrained(model_dir)

2024-02-27 22:29:12 [INFO] Find quantization_config.json, trying to load quantized low bit model...
2024-02-27 22:29:12 [INFO] quantization_config: {
  "compute_dtype": "fp32",
  "device": "cpu",
  "low_bit_model": true,
  "scale_dtype": "fp32",
  "weight_dtype": "int8"
}

2024-02-27 22:29:12 [INFO] loading weights file /home/ec2-user/mnt/Models/Llama-7b-hf-OPTIM/model.safetensors.index.json
2024-02-27 22:29:12 [ERROR] /home/ec2-user/miniconda3/lib/python3.11/site-packages/intel_extension_for_transformers/libqbits.so: undefined symbol: _ZNR5torch7Library4_defEON3c106eitherINS1_12OperatorNameENS1_14FunctionSchemaEEEONS_11CppFunctionE
2024-02-27 22:29:12 [ERROR] Saved low bit model loading failed, please check your model.


NameError: name 'exit' is not defined

In [87]:
# Load the model and its tokenizer into memory from the local model directory
model_dir = "/home/ec2-user/mnt/Models/llama-2-7b-chat-hf"

# ignore_mismatched_sizes lets loading continue when checkpoint tensor shapes
# differ from the model definition.
# NOTE(review): confirm this flag is really needed for this checkpoint.
model = LlamaForCausalLM.from_pretrained(model_dir, ignore_mismatched_sizes=True)

# ignore_mismatched_sizes is a model-loading option with no meaning for a
# tokenizer, so it is not passed here.
tokenizer = LlamaTokenizer.from_pretrained(model_dir)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.32it/s]


In [88]:
# Pipeline for LLM
# HuggingFacePipeline is not among the names brought in by the import cell,
# so it is imported here to keep this cell runnable on a fresh kernel.
from langchain.llms import HuggingFacePipeline

# Wrap the already-loaded model and tokenizer in a text-generation pipeline
# that produces at most 100 new tokens per call.
# NOTE(review): "temperature" is passed through model_kwargs and no do_sample
# flag is set - confirm these settings actually reach generation.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer,
                trust_remote_code=True, max_new_tokens=100,
                repetition_penalty=1.1, model_kwargs={"max_length": 1200, "temperature": 0.01})

# Expose the transformers pipeline as a LangChain LLM
llm_pipeline = HuggingFacePipeline(pipeline=pipe)

# RAG pipeline (LLM + retrieval algorithm): the 3 closest chunks from the vector
# store are stuffed into the prompt and the source documents are returned too.
rag_retrieval = RetrievalQA.from_chain_type(llm=llm_pipeline,
                                       chain_type='stuff',
                                       retriever=vectordb.as_retriever(search_kwargs={'k': 3}),
                                       return_source_documents=True,
                                       chain_type_kwargs={'prompt':prompt}
                                       )


In [89]:

# Run the RAG chain on a user question and display the full response
# (query, generated result and source documents).
user_question = 'Tell me about DRAGON'
response = rag_retrieval.invoke(dict(query=user_question))
response

{'query': 'Tell me about DRAGON',
 'result': "DRAGON is a scheduler designed by the authors of this paper to reduce resource utilization when system loading is high. Unlike Kubernetes' default scheduler, which uses a First In First Out (FIFO) algorithm, DRAGON uses an Adapted First Come First Served (AFCFS) algorithm. This means that DRAGON prioritizes jobs that come first, but doesn't guarantee that jobs will be scheduled in order. Additionally, D",
 'source_documents': [Document(page_content='function to reduce resource utilization when system loading is \nhigh. Although scale down function reduce resource utilization, \nit allows other jo bs to be scheduled, possibly increasing the \noverall resource utilization.  \nGang scheduling is a scheduling method that DRAGON use \nto reduce communication overhead in the system. Distributed \ndeep learning divide training process into two entities, i.e. \nparam eter server and workers. During the training process, \nparameter server and worke