<a href="https://colab.research.google.com/github/fullstackdata/public2024/blob/main/llamaindex_quantized.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate bitsandbytes sentence-transformers llama-index llama-index-embeddings-huggingface datasets llama-index-llms-huggingface

In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex, Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.huggingface import HuggingFaceLLM
import torch

#### Load data

In [None]:
from datasets import load_dataset
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Open the WikiQA training split in streaming mode and get an iterator over it.
wkds = load_dataset("microsoft/wiki_qa", split="train", streaming=True)
wkitr = iter(wkds)

# Take the first ten records from the stream.
recs = [next(wkitr) for _ in range(10)]

# Keep only the answer text of each record.
processed_data_list = [record['answer'] for record in recs]

#### Chunk, embed, index and store

In [7]:


# Hugging Face model id used both for embedding the chunks and for counting
# tokens while chunking.
bge_name = "BAAI/bge-small-en-v1.5"

# NOTE(review): machine-specific snapshot path; nothing in this cell reads it.
bge_cache_path="/root/.cache/huggingface/hub/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a"
Settings.embed_model = HuggingFaceEmbedding(model_name=bge_name)


from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(bge_name)

# SentenceSplitter measures text as len(tokenizer(text)), so it needs a callable
# that maps a string to a list of tokens. Calling the Hugging Face tokenizer
# object directly returns a dict-like BatchEncoding whose len() is the number of
# fields (input_ids, attention_mask, ...), not the number of tokens, which would
# make chunk_size meaningless. Pass the bound `encode` method instead.
splitter = SentenceSplitter(
    chunk_size=256,
    chunk_overlap=5,
    tokenizer=tokenizer.encode
)

# Wrap each answer string in a Document, then split the documents into nodes.
lngdocs = [Document(text=wktxt) for wktxt in processed_data_list]
nodes = splitter.get_nodes_from_documents(lngdocs)

# Build the vector index over the nodes (embeddings come from Settings.embed_model).
index = VectorStoreIndex(nodes)

query_str = "give me an example of a circular motion?"

#### Load quantized LLM (Llama 3.1 8B)

In [5]:
import torch
# Candidate checkpoints; only the one assigned to `model_used` is loaded.
# NOTE(review): `llama_4_bit` is a machine-specific local path and `gguf` is
# likewise not referenced in this cell.
llama_4_bit = "/Users/chandana/.cache/lm-studio/models/mlx-community/Llama-3.2-3B-Instruct-4bit"
gguf = "NousResearch/Llama-3.2-1B"
llama318B = "NousResearch/Meta-Llama-3.1-8B"

from transformers import AutoTokenizer

from transformers import BitsAndBytesConfig

model_used=llama318B

# bitsandbytes settings: 4-bit NF4 weights, double quantization, float16 compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_used,
    trust_remote_code=True
)

# Generation stops on the end-of-sequence token or the "<|eot_id|>" marker.
# Ids the tokenizer cannot resolve (None) are dropped so that switching
# `model_used` to a checkpoint without "<|eot_id|>" cannot put None in the list.
stopping_ids = [
    token_id
    for token_id in (
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    )
    if token_id is not None
]

llm = HuggingFaceLLM(
    model_name=model_used,
    model_kwargs={
        # The 4-bit quantization_config below is always applied; remove that
        # entry (not this one) to load the model unquantized.
        # NOTE(review): torch_dtype is bfloat16 while bnb_4bit_compute_dtype is
        # float16 -- confirm this mix is intended.
        "torch_dtype": torch.bfloat16,
        "quantization_config": quantization_config
    },
    generate_kwargs={
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
        # Set explicitly so generate() does not emit the
        # "Setting `pad_token_id` to `eos_token_id`" warning on every query.
        "pad_token_id": tokenizer.eos_token_id,
    },
    tokenizer_name=model_used,
    stopping_ids=stopping_ids,
)

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

#### Query and generate response

In [8]:
# Question to answer from the indexed documents.
query_str = "give me an example of a circular motion?"

# Query engine over the index: fetch the 2 most similar nodes and answer with `llm`.
# To test retrieval on its own, use:
#   vector_retriever = index.as_retriever(similarity_top_k=2)
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=2,
)

response = query_engine.query(query_str)
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


1. an artificial satellite orbiting the Earth at constant height, 2. a stone which is tied to a rope and is being swung in circles, 3. a car turning through a curve in a race track, 4. an electron moving perpendicular to a uniform magnetic field, 5. a gear turning inside a mechanism.
---------------------

