<a href="https://colab.research.google.com/github/dmnk1308/DubAir/blob/main/faiss_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Install the image_embeddings package; %pip installs into the environment of the running kernel.
# TODO: pin an exact version (image_embeddings==<version>) so a fresh run is reproducible.
%pip install image_embeddings



In [8]:
import image_embeddings
import pandas as pd
from tqdm import tqdm

# Modified source code from `image_embeddings`

In [2]:
import pyarrow.parquet as pq

from dataclasses import dataclass
from IPython.display import Image, display
from ipywidgets import widgets, HBox, VBox
import faiss
import numpy as np
import random
import json
from pathlib import Path


def read_embeddings(path):
    """Load image names and embedding vectors from a parquet dataset.

    Parameters
    ----------
    path
        Location handed to ``pq.read_table``.

    Returns
    -------
    list
        ``[id_to_name, name_to_id, embeddings]``: ``id_to_name`` maps the row
        position to the UTF-8 decoded ``image_name``, ``name_to_id`` is the
        inverse mapping, and ``embeddings`` is the ``embedding`` column
        stacked into one NumPy array (one row per image).
    """
    table = pq.read_table(path).to_pandas()

    # "image_name" is decoded with .decode(), i.e. the column holds bytes.
    decoded_names = [raw_name.decode("utf-8") for raw_name in table["image_name"]]
    id_to_name = dict(enumerate(decoded_names))
    name_to_id = {name: row_id for row_id, name in id_to_name.items()}

    vectors = np.stack(table["embedding"].to_numpy())
    return [id_to_name, name_to_id, vectors]


def embeddings_to_numpy(input_path, output_path):
    """Export a parquet embedding dataset as ``id_name.json`` + ``embedding.npy``.

    Parameters
    ----------
    input_path
        Location handed to ``pq.read_table``.
    output_path : str or pathlib.Path
        Directory that receives the two files; it is created (including
        missing parents) when it does not exist yet.

    Side effects
    ------------
    Writes ``<output_path>/id_name.json`` (a list of ``{"id", "name"}``
    records, names UTF-8 decoded) and ``<output_path>/embedding.npy`` (the
    ``embedding`` column stacked into one array).
    """
    table = pq.read_table(input_path).to_pandas()

    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    id_name = [
        {"id": row_id, "name": raw_name.decode("utf-8")}
        for row_id, raw_name in enumerate(table["image_name"])
    ]
    # Context managers guarantee both files are flushed and closed, even if
    # serialisation fails half-way.
    with open(output_dir / "id_name.json", "w") as json_file:
        json.dump(id_name, json_file)

    vectors = np.stack(table["embedding"].to_numpy())
    with open(output_dir / "embedding.npy", "wb") as npy_file:
        np.save(npy_file, vectors)


def build_index(emb):
    """Build a flat inner-product FAISS index holding every row of ``emb``.

    Parameters
    ----------
    emb
        2-D array of embeddings; ``emb.shape[1]`` is used as the vector
        dimensionality of the index.

    Returns
    -------
    faiss.IndexFlatIP
        The index after all rows of ``emb`` were added to it.
    """
    dimension = emb.shape[1]
    flat_index = faiss.IndexFlatIP(dimension)
    flat_index.add(emb)
    return flat_index


def random_search(path):
    """Print the nearest neighbours of one randomly chosen image.

    Loads the embeddings stored at ``path``, indexes them, picks a random row,
    prints that row's image name and then one ``"<score> <name>"`` line per
    search result (score formatted with two decimals).
    """
    id_to_name, _name_to_id, embeddings = read_embeddings(path)
    index = build_index(embeddings)

    query_id = random.randint(0, len(id_to_name) - 1)
    print(id_to_name[query_id])

    for hit in search(index, id_to_name, embeddings[query_id]):
        score, name = hit[0], hit[1]
        print(f"{score:.2f} {name}")


def search(index, id_to_name, emb, k=5):
    """Return the ``k`` nearest neighbours of one embedding as (score, name) pairs.

    Parameters
    ----------
    index
        Object with a FAISS-style ``search(queries, k)`` method returning a
        ``(distances, labels)`` pair of 2-D arrays.
    id_to_name
        Mapping from a label returned by the index to the image name.
    emb
        Single query vector; a leading batch axis is added before searching.
    k : int, default 5
        Number of neighbours requested from the index.

    Returns
    -------
    list of tuple
        ``(distance, image_name)`` pairs in the order the index returned
        them. The list is shorter than ``k`` when the index holds fewer than
        ``k`` vectors.
    """
    query = np.expand_dims(emb, 0)  # the index expects a batch of queries
    distances, labels = index.search(query, k)

    # FAISS pads the label array with -1 when it cannot find k neighbours.
    # Looking that up would raise KeyError on a dict (or silently return the
    # last entry of a list), so padding slots are skipped.
    return [
        (distance, id_to_name[label])
        for distance, label in zip(distances[0], labels[0])
        if label >= 0
    ]


def display_picture(image_path, image_name):
    """Render ``<image_path>/<image_name>.png`` inline in the notebook."""
    png_file = f"{image_path}/{image_name}.png"
    picture = Image(filename=png_file)
    display(picture)


def display_results(image_path, results):
    """Show search results side by side, each as a caption above its image.

    Parameters
    ----------
    image_path
        Directory containing the ``<image_name>.png`` files.
    results
        Iterable of ``(distance, image_name)`` pairs, e.g. the output of
        ``search``.

    Each result becomes a ``VBox`` holding a ``"<distance> <image_name>"``
    label (distance with two decimals) and the PNG image; all boxes are laid
    out in a single ``HBox`` that is displayed.
    """
    columns = []
    for distance, image_name in results:
        # Read inside a context manager so the handle is closed right away
        # instead of leaking one open file per displayed result.
        with open(f"{image_path}/{image_name}.png", "rb") as image_file:
            image_bytes = image_file.read()
        columns.append(
            VBox(
                [
                    widgets.Label(f"{distance:.2f} {image_name}"),
                    widgets.Image(value=image_bytes),
                ]
            )
        )
    display(HBox(columns))

In [None]:
# Let's define some paths where to save images, tfrecords and embeddings
from pathlib import Path

home = str(Path.home())
dataset = "tf_flowers"

# Every artefact of one dataset lives under <home>/<dataset>/<kind>.
path_images = "/".join((home, dataset, "images"))
path_tfrecords = "/".join((home, dataset, "tfrecords"))
path_embeddings = "/".join((home, dataset, "embeddings"))

In [None]:
# Save example images of `dataset` into `path_images`.
# NOTE(review): presumably downloads up to `images_count` images; confirm
# against the image_embeddings documentation.
image_embeddings.downloader.save_examples_to_folder(
    output_folder=path_images,
    images_count=1000,
    dataset=dataset,
)

# Get index

In [3]:
# These assignments replace the tf_flowers `path_embeddings` / `path_images`
# values defined in the earlier paths cell.
# NOTE(review): both are absolute paths under /content/drive, presumably a
# Google Drive mount of one specific account in Colab; adjust before running elsewhere.
path_embeddings = "/content/drive/MyDrive/DubAir/img_embeddings/tf_airbnb/embeddings"
path_images = "/content/drive/Othercomputers/Mein MacBook Air/images_resized"

In [4]:
# Load the id/name lookups and the embedding matrix, then index the embeddings.
id_to_name, name_to_id, embeddings = read_embeddings(path_embeddings)
index = build_index(embeddings)

In [5]:
# Sanity check: print the name of image number p and rank *all* images
# against it (k equals the number of embeddings).
p = 2
print(id_to_name[p])
# display_picture(path_images, id_to_name[p])
results = search(index, id_to_name, embeddings[p], k=len(embeddings))

10128292_3


In [6]:
# Build the full similarity table `df`: one row per image name (index "id"),
# one column per query image p, each cell the score search() returned for
# that pair.
#
# The earlier version called pd.concat([df, df_tmp], axis=1) without keeping
# the result, so `df` never grew beyond column 0. The per-query frames are now
# collected in a list and joined once, which also avoids re-copying the
# growing table on every iteration.
#
# NOTE: this is n_images searches of n_images results each, so runtime and
# memory grow quadratically with the number of embeddings.
n_images = len(embeddings)
score_columns = []
for p in tqdm(range(n_images)):
    results = search(index, id_to_name, embeddings[p], k=n_images)
    score_columns.append(pd.DataFrame(results, columns=[p, "id"]).set_index("id"))

# axis=1 aligns every column on the image name, whatever order each search
# returned its neighbours in.
df = pd.concat(score_columns, axis=1)

  1%|          | 158/15525 [00:22<37:08,  6.90it/s]


KeyboardInterrupt: ignored