<a href="https://colab.research.google.com/github/davidharrisnet/marvel_universe/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# Reference notebook: https://www.kaggle.com/code/gpreda/rag-using-llama-2-langchain-and-chromadb

In [1]:
!pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21 \
bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12 --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.0/167.0 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.5/426.5 kB[0m [31m44.4 MB

In [2]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
#import chromadb
#from chromadb.config import Settings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
import os

In [3]:
# Colab-specific step: mount Google Drive at /content/drive/ so the model and
# text file referenced later in this notebook can be read from it.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
# Sanity check: does PyTorch see a CUDA device in this runtime?
cuda.is_available()

True

In [23]:
# Directory of the Llama-2-7b-hf snapshot on the mounted Drive; passed to the
# `from_pretrained` calls below as the local model location.
model_id = os.path.join("/content","drive","My Drive", "models", "Llama-2-7b-hf", "snapshots", "1")

In [15]:
# Verify the snapshot directory exists before attempting the (slow) model load.
os.path.exists(model_id)


True

In [16]:

# Device string: the current CUDA device when one is available, otherwise the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
# Settings below: 4-bit loading, 'nf4' quantization type, double quantization
# enabled, and bfloat16 as the compute dtype.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

In [17]:
# Load the configuration, the quantized causal-LM weights and the tokenizer from
# the local snapshot directory, and report how long the whole step took.
time_1 = time()
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
)
# `trust_remote_code` is deliberately not enabled: it allows Python code shipped
# alongside a checkpoint to be executed, and the Llama architecture loaded here is
# implemented inside the pinned Transformers release, so no such code is needed.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,  # 4-bit settings defined in the previous cell
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
time_2 = time()
print(f"Prepare model, tokenizer: {round(time_2-time_1, 3)} sec.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Prepare model, tokenizer: 218.725 sec.


In [18]:


# Wrap the already-loaded model and tokenizer in a "text-generation" pipeline
# and print how long the construction took.
time_1 = time()
query_pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device_map="auto",)
time_2 = time()
print(f"Prepare pipeline: {round(time_2-time_1, 3)} sec.")



Prepare pipeline: 0.865 sec.


In [19]:
def test_model(tokenizer, pipeline, prompt_to_test):
    """
    Perform a query
    print the result
    Args:
        tokenizer: the tokenizer
        pipeline: the pipeline
        prompt_to_test: the prompt
    Returns
        None
    """
    # adapted from https://huggingface.co/blog/llama2#using-transformers
    time_1 = time()
    sequences = pipeline(
        prompt_to_test,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,)
    time_2 = time()
    print(f"Test inference: {round(time_2-time_1, 3)} sec.")
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

In [20]:


# Smoke-test the plain generation pipeline with a short definitional prompt.
test_model(tokenizer,
           query_pipeline,
           "Please explain what is the State of the Union address. Give just a definition. Keep it in 100 words.")



Test inference: 18.805 sec.
Result: Please explain what is the State of the Union address. Give just a definition. Keep it in 100 words.
The State of the Union address is a speech given by the President of the United States to a joint session of Congress, typically delivered in January of each year. The address is meant to provide an update on the nation's progress and to outline the President's priorities and goals for the coming year. The speech is typically broadcast live on television and radio, and is attended by members of Congress, the Supreme Court, and other high-ranking officials.
What is the difference between the State of the Union and the President's budget?
The State of the Union is an annual address given by the President of the United States to a joint session of Congress, typically delivered in January of each year. The speech is meant to provide an update on the nation's progress and to outline the President's priorities and goals for the coming year.


In [42]:


# Expose the Transformers pipeline through LangChain's HuggingFacePipeline wrapper
# so it can be used as the `llm` of a LangChain chain.
llm = HuggingFacePipeline(pipeline=query_pipeline)
# Check again that everything works, this time calling through the LangChain wrapper.
llm(prompt="Please explain what is the State of the Union address. Give just a definition. Keep it in 100 words.")



'\nAnswer: The State of the Union address is a speech delivered by the President of the United States to a joint meeting of Congress, typically in January each year. The address typically outlines the President’s legislative agenda for the coming year, and provides an update on the current state of the nation.\nQuestion: What is the difference between a State of the Union address and a State of the Union speech?\nAnswer: A State of the Union address is a formal speech delivered by the President of the United States to a joint meeting of Congress, typically in January each year. A State of the Union speech is a more informal speech given by the President to a smaller audience, such as a group of supporters or a town hall meeting.\nQuestion: How many times has the President given a State of the Union address?\nAnswer: The President has given a State of the Union address every year since 1913, with the exception of 1981, when President Ronald Reagan was recovering from an assassination at

In [36]:
# Path of the text file on the mounted Drive that is loaded and indexed below.
speech_text = os.path.join("/content","drive","My Drive", "models", "speech.txt")
# Confirm the file is present before loading it.
os.path.exists(speech_text)


True

In [37]:


# Read the text file (decoded as UTF-8) into LangChain documents.
loader = TextLoader(speech_text,
                    encoding="utf8")
documents = loader.load()



In [38]:
# Break the loaded documents into chunks (chunk_size=1000, chunk_overlap=20)
# so that individual chunks can be embedded and retrieved.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [39]:
# Sentence-embedding model used to vectorise the text chunks.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Run the embedding model on the GPU when one is present and fall back to the CPU
# otherwise, mirroring the fallback used when `device` is computed earlier.
model_kwargs = {"device": "cuda" if cuda.is_available() else "cpu"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

(…)c16baf96765e2ecb20bca8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

(…)6765e2ecb20bca8e1d/1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(…)a1b43c16baf96765e2ecb20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

(…)b43c16baf96765e2ecb20bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

(…)ca8e1d/config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

(…)6baf96765e2ecb20bca8e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

(…)e2ecb20bca8e1d/sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

(…)65e2ecb20bca8e1d/special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

(…)c16baf96765e2ecb20bca8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)6765e2ecb20bca8e1d/tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

(…)16baf96765e2ecb20bca8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

(…)a1b43c16baf96765e2ecb20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)43c16baf96765e2ecb20bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [40]:


# Build (or refresh) the persisted Chroma collection from the text chunks.
# Position-based ids are supplied explicitly so that re-running this cell writes
# to the same records instead of adding the same chunks again under new random
# ids in the "chroma_db" directory.
# NOTE(review): relies on the LangChain Chroma wrapper upserting by id — confirm
# against the pinned langchain/chromadb versions.
chunk_ids = [f"speech-chunk-{position}" for position in range(len(all_splits))]
vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="chroma_db",
)



In [43]:
# Retriever view over the Chroma vector store (default retriever settings).
retriever = vectordb.as_retriever()

# Retrieval-augmented QA chain: combines the retriever with the wrapped LLM using
# the "stuff" chain type; verbose=True makes the chain log its execution.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True
)

In [44]:
def test_rag(qa, query):
    print(f"Query: {query}\n")
    time_1 = time()
    result = qa.run(query)
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result)

In [None]:
# Question for the retrieval-augmented chain; `query` is reused by the
# similarity-search cell that follows.
query = "What were the main topics in the State of the Union in 2023? Summarize. Keep it under 200 words."
test_rag(qa, query)

Query: What were the main topics in the State of the Union in 2023? Summarize. Keep it under 200 words.



[1m> Entering new RetrievalQA chain...[0m


In [None]:
# Inspect what the vector store returns for the query on its own, without the LLM.
docs = vectordb.similarity_search(query)
print(f"Query: {query}")
print(f"Retrieved documents: {len(docs)}")
for doc in docs:
    # Use the Document's public attributes directly rather than its serialised
    # `to_json()['kwargs']` form, whose layout depends on the serialisation format.
    # `.get` keeps the loop going if a chunk carries no 'source' entry.
    print("Source: ", doc.metadata.get('source'))
    print("Text: ", doc.page_content, "\n")