# basic

In [1]:
!pip install faiss-cpu



In [2]:
import faiss
import numpy as np


In [3]:
# Sample list of image embeddings (replace this with your actual list)
list_emb_img = [np.random.rand(512) for _ in range(1000)]  # 512 is the dimensionality of the CLIP embeddings


In [4]:
# Convert the list of embeddings to a numpy array
embeddings = np.array(list_emb_img).astype('float32')

# Instantiate a FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])

# Add the embeddings to the index
index.add(embeddings)


In [5]:
# Query example
query_embedding = np.random.rand(512).astype('float32')  # Example query embedding

# Perform a k-nearest neighbor search
k = 5  # Number of nearest neighbors to retrieve
distances, indices = index.search(np.array([query_embedding]), k)

print("Indices of nearest neighbors:", indices)
print("Distances to nearest neighbors:", distances)


Indices of nearest neighbors: [[606 746 659 423 894]]
Distances to nearest neighbors: [[70.63731 74.36014 76.24449 76.56677 76.76857]]


# via clip

In [6]:
import faiss
import numpy as np

In [15]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image, ImageFile
import requests
import torch

# We use the original clip-ViT-B-32 for encoding images
img_model = SentenceTransformer('clip-ViT-B-32')

# Our text embedding model is aligned to the img_model and maps 50+
# languages to the same vector space
text_model = SentenceTransformer('sentence-transformers/clip-ViT-B-32-multilingual-v1')


# Now we load and encode the images
def load_image(url_or_path):
    if url_or_path.startswith("http://") or url_or_path.startswith("https://"):
        return Image.open(requests.get(url_or_path, stream=True).raw)
    else:
        return Image.open(url_or_path)

# We load 3 images. You can either pass URLs or
# a path on your disc
img_paths = [
    # Dog image
    "https://unsplash.com/photos/QtxgNsmJQSs/download?ixid=MnwxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNjM1ODQ0MjY3&w=640",

    # Cat image
    "https://unsplash.com/photos/9UUoGaaHtNE/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8Mnx8Y2F0fHwwfHx8fDE2MzU4NDI1ODQ&w=640",

    # Beach image
    "https://unsplash.com/photos/Siuwr3uCir0/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8NHx8YmVhY2h8fDB8fHx8MTYzNTg0MjYzMg&w=640"
]

images = [load_image(img) for img in img_paths]

# Map images to the vector space
img_embeddings = img_model.encode(images)

# Now we encode our text:
texts = [
    "A dog in the snow",
    "Eine Katze",  # German: A cat
    "Una playa con palmeras."  # Spanish: a beach with palm trees
]

text_embeddings = text_model.encode(texts)

# Compute cosine similarities:
cos_sim = util.cos_sim(text_embeddings, img_embeddings)

for text, scores in zip(texts, cos_sim):
    max_img_idx = torch.argmax(scores)
    print("Text:", text)
    print("Score:", scores[max_img_idx] )
    print("Path:", img_paths[max_img_idx], "\n")


Text: A dog in the snow
Score: tensor(0.3132)
Path: https://unsplash.com/photos/QtxgNsmJQSs/download?ixid=MnwxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNjM1ODQ0MjY3&w=640 

Text: Eine Katze
Score: tensor(0.2674)
Path: https://unsplash.com/photos/9UUoGaaHtNE/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8Mnx8Y2F0fHwwfHx8fDE2MzU4NDI1ODQ&w=640 

Text: Una playa con palmeras.
Score: tensor(0.3055)
Path: https://unsplash.com/photos/Siuwr3uCir0/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8NHx8YmVhY2h8fDB8fHx8MTYzNTg0MjYzMg&w=640 



In [8]:
# Convert the list of embeddings to a numpy array
embeddings = np.array(img_embeddings).astype('float32')

# Instantiate a FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])

# Add the embeddings to the index
index.add(embeddings)


In [9]:
text_embeddings.shape

(3, 512)

In [11]:
# Query example
query_embedding = text_embeddings[0]

# Perform a k-nearest neighbor search
k = 2  # Number of nearest neighbors to retrieve
distances, indices = index.search(np.array([query_embedding]), k)

print("Indices of nearest neighbors:", indices)
print("Distances to nearest neighbors:", distances)


Indices of nearest neighbors: [[0 2]]
Distances to nearest neighbors: [[124.39336 158.18024]]


In [12]:
from transformers import BlipForConditionalGeneration
captioner = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

In [18]:
type(img_embeddings)
img_embeddings.shape

(3, 512)

In [21]:
from PIL import Image
import requests
from transformers import AutoProcessor, BlipForConditionalGeneration

processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

KeyboardInterrupt: 

In [22]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "A picture containing following items"

inputs = processor(images=image, text=text, return_tensors="pt")

# outputs = model(**inputs)

In [23]:
op = model.generate(**inputs, max_new_tokens=100)

In [24]:
op

tensor([[30522,  1037,  3861,  4820,  2206,  5167,   102]])

In [25]:
print(processor.decode(op[0], skip_special_tokens=True))

a picture containing following items


In [27]:
from PIL import Image
import requests
from transformers import AutoProcessor, BlipForConditionalGeneration

processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = ""

inputs = processor(images=image, text=text, return_tensors="pt")

# Generate more detailed captions
outputs = model.generate(**inputs, max_length=100, num_beams=5, early_stopping=True)

print(processor.decode(outputs[0], skip_special_tokens=True))

two cats sleeping on a couch


In [19]:
# Generate a caption from the image embedding
caption = captioner.generate(torch.from_numpy(img_embeddings)[0], max_length=100, early_stopping=True)[0].text

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [512]

In [33]:
# pip install llama-index-multi-modal-llms-replicate
!pip install replicate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting replicate
  Downloading replicate-0.25.2-py3-none-any.whl.metadata (24 kB)
Downloading replicate-0.25.2-py3-none-any.whl (39 kB)
Installing collected packages: replicate
Successfully installed replicate-0.25.2


In [2]:
# import os

# REPLICATE_API_TOKEN = "r8_1XfNzsSFaRLgAGlXTMtgzUleN1G9VTH00YQ73"  # Your Relicate API token here
# os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN


In [5]:
import os

REPLICATE_API_TOKEN = "r8_1XfNzsSFaRLgAGlXTMtgzUleN1G9VTH00YQ73"  # Your Relicate API token here
os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN

from llama_index.multi_modal_llms.replicate import ReplicateMultiModal
from llama_index.core.schema import ImageDocument
from llama_index.multi_modal_llms.replicate.base import (
    REPLICATE_MULTI_MODAL_LLM_MODELS,
)

multi_modal_llm = ReplicateMultiModal(
    model=REPLICATE_MULTI_MODAL_LLM_MODELS["llava-13b"],
    max_new_tokens=200,
    temperature=0.1,
)
query = "Where is the clutch Lever of bike?"
prompt = f"answer this question in detail: Q:{query}"

llava_response = multi_modal_llm.complete(
    prompt=prompt,
    image_documents=[ImageDocument(image_path='/storage/ashutosh/hackathon/drvvv.png')],
)

In [6]:
llava_response.text

'A: The clutch lever of the bike is located on the left side of the handlebars.'

In [8]:
!pip install llama-index-llms-huggingface

Collecting llama-index-llms-huggingface
  Downloading llama_index_llms_huggingface-0.1.4-py3-none-any.whl.metadata (741 bytes)
Collecting huggingface-hub<0.21.0,>=0.20.3 (from llama-index-llms-huggingface)
  Downloading huggingface_hub-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting accelerate>=0.21.0 (from transformers[torch]<5.0.0,>=4.37.0->llama-index-llms-huggingface)
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Downloading llama_index_llms_huggingface-0.1.4-py3-none-any.whl (7.2 kB)
Downloading huggingface_hub-0.20.3-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.1/330.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: huggingface-hub, accelerate, llama-index-llms-

In [9]:
from transformers import LlamaForCausalLM, LlamaTokenizer
import torch
from llama_index.multi_modal_llms.huggingface import HuggingFaceMultiModalLLM
from llama_index.core.schema import ImageDocument

# Load the pre-trained LLaMA model and tokenizer
model = LlamaForCausalLM.from_pretrained("huggingface/llava-13b")
tokenizer = LlamaTokenizer.from_pretrained("huggingface/llava-13b")

# Create a HuggingFaceMultiModalLLM instance
multi_modal_llm = HuggingFaceMultiModalLLM(model, tokenizer, temperature=0.1, max_new_tokens=200)

query = "Where is the clutch Lever of bike?"
prompt = f"answer this question in detail: Q:{query}"

# Load the image
image = Image.open('/storage/ashutosh/hackathon/drvvv.png')

# Generate the response
llava_response = multi_modal_llm.complete(prompt=prompt, image_documents=[ImageDocument(image=image)])

ModuleNotFoundError: No module named 'llama_index.multi_modal_llms.huggingface'