In [1]:
from transformers import CLIPProcessor, CLIPModel

# load pre-trained model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
# load preprocessor for model input
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

In [2]:
import requests
from PIL import Image
from pathlib import Path
import torch

def download_and_open_image(url, save_path):
    sample_path = Path(save_path)
    sample_path.parent.mkdir(parents=True, exist_ok=True)
    r = requests.get(url)

    with sample_path.open("wb") as f:
        f.write(r.content)

    image = Image.open(sample_path)
    return image

In [3]:
image_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_tulips.jpg"
save_path = "data/coco_tulips.jpg"
image = download_and_open_image(image_url, save_path)

In [4]:
import torch

class ImageFeatureExtractor(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, pixel_values):
        return self.model.get_image_features(pixel_values)
    
class TextFeatureExtractor(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        return self.model.get_text_features(input_ids)


In [5]:
image_inputs = processor(
        images = image,
        return_tensors="pt"
    )

text_inputs = processor(
    text = "dog, black, in hollywood",
    return_tensors="pt"
)

import openvino as ov
from pathlib import Path
core = ov.Core()

image_fp16_model_path = Path("image_clip-vit-base-patch16.xml")
model.config.torchscript = True

if not image_fp16_model_path.exists():
    ov_model = ov.convert_model(ImageFeatureExtractor(model), example_input=dict(image_inputs))
    ov.save_model(ov_model, image_fp16_model_path)
compiled_image_model = core.compile_model(image_fp16_model_path, 'AUTO')

text_fp16_model_path = Path("text_clip-vit-base-patch16.xml")
model.config.torchscript = True

if not text_fp16_model_path.exists():
    ov_model = ov.convert_model(TextFeatureExtractor(model), example_input=dict(text_inputs))
    ov.save_model(ov_model, text_fp16_model_path)
compiled_text_model = core.compile_model(text_fp16_model_path, 'AUTO')



# Prepare the image input
image_input = processor(images=image, return_tensors="pt")["pixel_values"]

# Run inference
ov_output = compiled_image_model(image_input)
image_features = ov_output[compiled_image_model.output(0)]
print(image_features)


def get_single_image_embedding(image):
    # Get single image embeddings
    inputs = processor(
        images=image,
        return_tensors="pt"
    )
    image_input = inputs["pixel_values"]
    ov_output = compiled_image_model(image_input)
    image_features = ov_output[compiled_image_model.output(0)]
    return image_features

def get_single_text_embedding(text):
    inputs = processor(text=text, return_tensors="pt")
    text_input = inputs["input_ids"]
    ov_output = compiled_text_model(text_input)
    text_features = ov_output[compiled_text_model.output(0)]
    return text_features

[[-2.19914958e-01 -1.43779957e+00  1.68696702e-01  2.35982444e-02
  -2.01770574e-01  4.06838283e-02 -1.22514576e-01  7.25643277e-01
  -3.46505731e-01  2.71688737e-02 -4.83018875e-01 -2.75728166e-01
   3.31674218e-01 -1.37439787e-01 -2.81778395e-01  7.80049013e-03
  -1.68826565e-01  5.02239466e-01  2.29350448e-01 -2.24074557e-01
  -3.43964919e-02 -4.62107480e-01  3.59208375e-01  2.20400229e-01
  -3.97568852e-01 -2.95854509e-01 -6.45035267e-01  1.78328961e-01
   5.03996491e-01 -7.80545101e-02 -5.68075553e-02  2.93036699e-01
  -2.38245726e-01 -6.27012908e-01 -2.56510675e-01  2.13830665e-01
   1.87109753e-01  1.52856886e-01  3.46124470e-01 -8.24485123e-01
   6.21066839e-02 -4.10600126e-01 -2.53095329e-01 -1.94112286e-01
  -3.52725983e-01  2.43654668e-01  3.13615471e-01 -8.13287348e-02
   1.29491597e-01 -1.17533110e-01  2.05151394e-01  1.52452737e-01
   2.40767673e-01  2.24273264e-01  4.39472407e-01 -2.97078528e-02
   5.37916601e-01  5.04969656e-01 -1.48654059e-01  4.52014953e-01
   9.18075

In [8]:
from pathlib import Path
import lancedb
db = lancedb.connect("./.lancedb")
embedding = get_single_image_embedding(image)
text_embedding = get_single_text_embedding("japanese writing")
tbl = db.open_table("pt_table")

a= tbl.search(query=text_embedding.tolist()[0]).limit(4).to_list()


In [10]:
a

[{'vector': [0.075439453125,
   -0.34814453125,
   0.33447265625,
   0.111083984375,
   0.135986328125,
   -0.41162109375,
   0.105224609375,
   -0.2578125,
   -0.0098876953125,
   -0.487060546875,
   -0.08648681640625,
   -0.0848388671875,
   -0.1026611328125,
   -0.1944580078125,
   0.1551513671875,
   -0.06396484375,
   0.17822265625,
   -0.045257568359375,
   0.1973876953125,
   -0.06005859375,
   -0.294677734375,
   -0.28515625,
   0.0831298828125,
   0.11029052734375,
   0.1378173828125,
   0.06390380859375,
   -0.071044921875,
   0.12310791015625,
   -0.04010009765625,
   -0.28076171875,
   0.15234375,
   0.1248779296875,
   0.0335693359375,
   0.0355224609375,
   -0.1485595703125,
   0.0088043212890625,
   -0.12646484375,
   -0.05340576171875,
   -0.165283203125,
   0.09259033203125,
   -0.1339111328125,
   -0.160888671875,
   -0.28125,
   -0.046844482421875,
   0.357666015625,
   0.03326416015625,
   -0.03582763671875,
   -0.477783203125,
   0.1590576171875,
   0.230712890625,