Reference: LlamaIndex custom embeddings example — https://docs.llamaindex.ai/en/stable/examples/embeddings/custom_embeddings/

In [1]:
# from typing import Any, List
# from InstructorEmbedding import INSTRUCTOR

# from llama_index.core.bridge.pydantic import PrivateAttr
# from llama_index.core.embeddings import BaseEmbedding


# class InstructorEmbeddings(BaseEmbedding):
#     _model: INSTRUCTOR = PrivateAttr()
#     _instruction: str = PrivateAttr()

#     def __init__(
#         self,
#         instructor_model_name: str = "hkunlp/instructor-large",
#         instruction: str = "Represent a document for semantic search:",
#         **kwargs: Any,
#     ) -> None:
#         self._model = INSTRUCTOR(instructor_model_name)
#         self._instruction = instruction
#         super().__init__(**kwargs)

#     @classmethod
#     def class_name(cls) -> str:
#         return "instructor"

#     async def _aget_query_embedding(self, query: str) -> List[float]:
#         return self._get_query_embedding(query)

#     async def _aget_text_embedding(self, text: str) -> List[float]:
#         return self._get_text_embedding(text)

#     def _get_query_embedding(self, query: str) -> List[float]:
#         embeddings = self._model.encode([[self._instruction, query]])
#         return embeddings[0]

#     def _get_text_embedding(self, text: str) -> List[float]:
#         embeddings = self._model.encode([[self._instruction, text]])
#         return embeddings[0]

#     def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
#         embeddings = self._model.encode(
#             [[self._instruction, text] for text in texts]
#         )
#         return embeddings

In [2]:
import logging
import sys

# Send INFO-and-above log records to stdout so library progress messages show
# up in the notebook output.
# basicConfig installs exactly one stdout handler on the root logger; attaching
# a second StreamHandler on top of it would print every record twice.
# force=True replaces any handlers already present, so re-running this cell is
# idempotent instead of stacking handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Use a sentence-transformers model loaded through HuggingFaceEmbedding for all
# embeddings. The model id is pinned explicitly (it is the model this notebook
# was run with) so stored vectors stay comparable if the library default
# ever changes.
# Add cache_folder="./tmp/" to keep downloaded weights in a project-local folder.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
# Bare expression: show the configured embedding model as the cell output.
Settings.embed_model

  from .autonotebook import tqdm as notebook_tqdm


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-small-en
Load pretrained SentenceTransformer: BAAI/bge-small-en
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


HuggingFaceEmbedding(model_name='BAAI/bge-small-en', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000001A080132910>, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

In [4]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Read every supported file found under the local "data" directory and turn
# each one into llama-index Document objects.
reader = SimpleDirectoryReader('data')
documents = reader.load_data()


In [5]:
# Build the vector index over the loaded documents, embedding them with the
# model configured on Settings; progress bars are shown while it runs.
index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
)

Parsing nodes: 100%|██████████| 66/66 [00:00<00:00, 1759.15it/s]
Batches: 100%|██████████| 1/1 [00:08<00:00,  8.38s/it] ?it/s]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.78s/it]0:46,  1.19it/s]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.91s/it]0:28,  1.59it/s]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.06s/it]0:20,  1.77it/s]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.05s/it]0:14,  1.84it/s]
Batches: 100%|██████████| 1/1 [00:07<00:00,  7.05s/it]0:08,  1.89it/s]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.37s/it]0:03,  1.70it/s]
Generating embeddings: 100%|██████████| 66/66 [00:39<00:00,  1.66it/s]


In [6]:
# Set up the query wrapper prompt (instruction format from the Writer/camel-5b-hf model card)
from llama_index.core import PromptTemplate

# Instruction-style wrapper placed around llama-index's built-in prompts.
# Wording taken from https://huggingface.co/Writer/camel-5b-hf; the
# {query_str} slot receives the prompt being wrapped.
# NOTE(review): confirm this format suits whichever model is configured later.
wrapper_template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)
query_wrapper_prompt = PromptTemplate(wrapper_template)

In [17]:
from llama_index.llms.huggingface import HuggingFaceLLM,HuggingFaceInferenceAPI
import torch

# Hugging Face Hub model id to query through the hosted Inference API.
# Alternatives tried earlier:
# MODEL = "thenlper/gte-small"
# MODEL = "Writer/camel-5b-hf"
MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# HuggingFaceInferenceAPI sends requests to a remotely hosted model, so the
# options that only matter when loading weights locally with HuggingFaceLLM
# (device_map, torch dtype via model_kwargs, tokenizer_name/tokenizer_kwargs,
# generate_kwargs) do not apply here and are omitted. The cap on generated
# tokens is expressed through the `num_output` field of this class.
llm = HuggingFaceInferenceAPI(
    model_name=MODEL,
    context_window=2048,
    num_output=256,
    query_wrapper_prompt=query_wrapper_prompt,
)

# NOTE: chunk_size only influences documents parsed after this assignment.
# The index created in the earlier cell was already chunked with the previous
# setting; move this line above index creation (or rebuild the index) if
# 512-token chunks are intended.
Settings.chunk_size = 512
# Make this LLM the default for query engines created from now on.
Settings.llm = llm

In [20]:
# Build a query engine over the vector index with default settings.
# For streamed output use instead: index.as_query_engine(streaming=True)
query_engine = index.as_query_engine()

# Ask a question against the indexed documents and print the answer.
# Another query tried earlier: "What punishment for robbery ?"
question = "give some flower names ?"
response = query_engine.query(question)
print(response)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  8.42it/s]
