https://docs.llamaindex.ai/en/stable/examples/embeddings/custom_embeddings/ 

In [None]:
import logging
import sys

# Send INFO-and-above records from the root logger to stdout so library
# messages appear in the notebook output.
#
# A single basicConfig call is enough: it installs one stdout handler on the
# root logger. Adding a second StreamHandler for the same stream makes every
# record print twice. force=True drops handlers that are already attached to
# the root logger, so re-running this cell does not stack extra handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# This runs before huggingface_hub is imported so that anything defined in
# .env is already in place when that package is loaded.
load_dotenv()

from huggingface_hub import login

hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Report only whether the token is present. Printing the token itself would
# leave the secret in the saved notebook output.
print("HUGGINGFACEHUB_API_TOKEN set:", bool(hf_token))

# NOTE(review): recent huggingface_hub releases reportedly deprecate
# `write_permission` - confirm against the installed version.
login(
    token=hf_token,
    write_permission=True,
    add_to_git_credential=True,
)


In [None]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Register a HuggingFaceEmbedding built with its default arguments as the
# embedding model on the global Settings object.
# (A cache_folder="./tmp/" argument was tried here previously and is disabled.)
default_embedder = HuggingFaceEmbedding()
Settings.embed_model = default_embedder

# Bare expression: as the last statement of the cell, the notebook displays it.
Settings

In [None]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# Read the contents of the local "data" directory into `documents`.
data_reader = SimpleDirectoryReader("data")
documents = data_reader.load_data()


In [None]:
# Build a vector store index over the loaded documents, with progress
# reporting enabled while it is constructed.
index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
)

In [None]:
from llama_index.core import PromptTemplate

# Instruction-style template that wraps the default prompts internal to
# llama-index. `{query_str}` is the placeholder for the query text.
# Wording taken from https://huggingface.co/Writer/camel-5b-hf
_instruction_template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)
query_wrapper_prompt = PromptTemplate(_instruction_template)

In [None]:
from llama_index.llms.huggingface import HuggingFaceLLM, HuggingFaceInferenceAPI
import torch

# Model queried remotely through the Hugging Face Inference API.
# Other checkpoints tried here earlier:
#   "thenlper/gte-small", "HuggingFaceH4/zephyr-7b-alpha", "Writer/camel-5b-hf"
MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Only options that apply to the remote client are passed. Local-loading
# options (device_map, torch_dtype via model_kwargs, tokenizer_name /
# tokenizer_kwargs, generate_kwargs) belong to a locally loaded HuggingFaceLLM
# and have no meaning for a hosted endpoint, so they are not passed here.
# `num_output` is this class's setting for the number of generated tokens
# (it replaces the local-model `max_new_tokens` argument).
llm = HuggingFaceInferenceAPI(
    model_name=MODEL,
    context_window=2048,
    num_output=256,
    query_wrapper_prompt=query_wrapper_prompt,
)

# NOTE(review): in file order the index cell appears before this one, so this
# chunk size may not have been in effect when the index was built - confirm
# the intended cell execution order.
Settings.chunk_size = 512
Settings.llm = llm

In [None]:
# Obtain a query engine from the index, run one question through it and
# print the response.
# (A streaming variant, index.as_query_engine(streaming=True), was tried and
# is disabled.)
query_engine = index.as_query_engine()

# Earlier sample question, disabled: "What punishsment for robbery ?"
question = "give some flower names ?"
response = query_engine.query(question)
print(response)