In [1]:
!pip install usearch

Collecting usearch
  Downloading usearch-2.15.3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (32 kB)
Collecting simsimd>=5.6.4 (from usearch)
  Downloading simsimd-5.6.4-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.2/51.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading usearch-2.15.3-cp310-cp310-manylinux_2_28_x86_64.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading simsimd-5.6.4-cp310-cp310-manylinux_2_28_x86_64.whl (498 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m498.9/498.9 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: simsimd, usearch
Successfully installed simsimd-5.6.4 usearch-2.15.3


In [2]:
!pip install -U sentence-transformers
!pip install -q pyvi

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0


In [3]:
# Copy image_metadata.csv
!cp /kaggle/input/vietocr-embedding2/image_metadata_text.csv /kaggle/working/

# Copy vector_database.usearch
!cp /kaggle/input/vietocr-embedding2/vector_database_text.usearch /kaggle/working/

In [5]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from usearch.index import Index
from tqdm import tqdm  # Import tqdm for progress bar
import os

# Artifacts of the previous run; both are extended by this cell and written
# back to the same paths at the end.
metadata_file = "/kaggle/working/image_metadata_text.csv"
index_file = "/kaggle/working/vector_database_text.usearch"

# Load the CSV with text data (new data to be processed)
csv_file = "/kaggle/input/vietnam-textdetect-ocr/ocr_results.csv"  # Update with your actual file path
df = pd.read_csv(csv_file)
print(f"Loaded dataframe with shape: {df.shape}")

# Assuming the CSV has columns: image_path, text_0, text_1, ..., text_19
image_paths = df['image_path'].values
text_columns = [f'text_{i}' for i in range(20)]  # List of text columns from text_0 to text_19

# Load the pre-trained SentenceTransformer model; the index dimensionality is
# taken from the model so the two can never disagree.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
dimension = model.get_sentence_embedding_dimension()

# Initialize the usearch index and, when a saved one exists, load it so new
# vectors are appended to the existing ones.
index = Index(ndim=dimension, dtype=np.float16)

if os.path.exists(index_file):
    index.load(index_file)
    print(f"Loaded existing usearch index from {index_file}.")
else:
    print("No existing index found, creating a new one.")

# Load or initialize metadata
if os.path.exists(metadata_file):
    metadata_df = pd.read_csv(metadata_file)
    metadata = metadata_df.to_dict('records')
    # max() of an empty column is NaN, which would poison every key generated
    # afterwards; an empty file is treated like a fresh start. int() keeps the
    # counter a plain Python integer.
    idx_counter = 0 if metadata_df.empty else int(metadata_df['index'].max()) + 1
    print(f"Loaded existing metadata with shape: {metadata_df.shape}. Continuing from index {idx_counter}.")
else:
    metadata = []
    idx_counter = 0  # Start from 0 if no metadata file exists
    print("No existing metadata found, starting fresh.")

# Index keys are resolved through the metadata table, so the two stores must
# describe the same set of vectors. Warn (rather than abort) when they do not.
if len(index) != len(metadata):
    print(
        f"Warning: index holds {len(index)} vectors but metadata has {len(metadata)} rows; "
        "search results may map to the wrong images."
    )

# Upper bound on texts handed to process_batch at once; process_batch also
# forwards it to model.encode as the encoding batch size.
batch_size = 6000  # Set batch size according to your memory constraints

def process_batch(texts, image_paths, idx_counter, metadata, index):
    """Embed ``texts``, add the vectors to ``index`` and record their image paths.

    The i-th text receives key ``idx_counter + i``; a ``{"index", "image_path"}``
    record with the same key is appended to ``metadata`` so keys returned by a
    search can be mapped back to images.

    Uses the module-level ``model`` (encoder) and ``batch_size`` (forwarded to
    ``model.encode``).

    Args:
        texts: Sequence of texts to embed.
        image_paths: Sequence of image paths, one per entry of ``texts``.
        idx_counter: First key to assign in this batch.
        metadata: List of metadata records; extended in place.
        index: Vector index exposing ``add(keys=..., vectors=...)``.

    Returns:
        The next free key, i.e. ``idx_counter`` advanced by the number of
        embeddings added (unchanged for an empty batch).

    Raises:
        ValueError: If ``texts`` and ``image_paths`` differ in length, which
            would desynchronise index keys and metadata rows.
    """
    if len(texts) != len(image_paths):
        raise ValueError(
            f"texts ({len(texts)}) and image_paths ({len(image_paths)}) must have the same length"
        )

    # Nothing to embed: skip the encoder and the index entirely
    if len(texts) == 0:
        return idx_counter

    # Generate embeddings for the batch
    embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=False)

    # Consecutive keys starting at the current counter, one per embedding
    keys = np.arange(idx_counter, idx_counter + len(embeddings))

    # Add embeddings to the index
    index.add(keys=keys, vectors=embeddings)

    # One metadata record per vector, carrying the same key as the index entry
    metadata.extend(
        {"index": idx_counter + offset, "image_path": image_path}
        for offset, image_path in enumerate(image_paths)
    )

    # Hand back the advanced counter for the next batch
    return idx_counter + len(embeddings)

# Walk the OCR table one image at a time (tqdm renders the progress bar)
for row_label, record in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    # Only the leading run of texts counts: everything from the first null
    # text column onwards is ignored.
    null_mask = record[text_columns].isnull().to_numpy()
    n_valid = int(null_mask.argmax()) if null_mask.any() else len(text_columns)

    # Rows without any leading text contribute nothing
    if n_valid == 0:
        continue

    valid_texts = [record[col] for col in text_columns[:n_valid]]
    source_image = image_paths[row_label]

    # Feed the texts to process_batch in slices of at most batch_size,
    # attaching the same image path to every text of this row
    for start in range(0, n_valid, batch_size):
        chunk = valid_texts[start:start + batch_size]
        idx_counter = process_batch(chunk, [source_image] * len(chunk), idx_counter, metadata, index)

# Persist the extended vector index
index.save(index_file)
print(f"Usearch index updated and saved to {index_file}.")

# Persist the extended key -> image path table next to it
metadata_df = pd.DataFrame(metadata)
metadata_df.to_csv(metadata_file, index=False)
print(f"Metadata updated and saved to {metadata_file}.")

print("Process completed.")

  from tqdm.autonotebook import tqdm, trange


Loaded dataframe with shape: (110733, 21)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded existing usearch index from /kaggle/working/vector_database_text.usearch.
Loaded existing metadata with shape: (2348394, 2). Continuing from index 2348394.


Processing rows: 100%|██████████| 110733/110733 [17:34<00:00, 105.02it/s]


Usearch index updated and saved to /kaggle/working/vector_database_text.usearch.
Metadata updated and saved to /kaggle/working/image_metadata_text.csv.
Process completed.


In [6]:
# Size of the metadata table saved above: previously stored records plus the ones appended in this run
metadata_df.shape

(2763316, 2)

In [7]:
# Sanity check: reload the index that was just saved and inspect its size.
# index_file is the same absolute path index.save() wrote to, so this does not
# depend on the notebook's current working directory.
loaded_index = Index(ndim=dimension, dtype=np.float16)
loaded_index.load(index_file)

# Every vector key is resolved through the metadata table, so both stores must
# contain the same number of entries.
assert len(loaded_index) == len(metadata_df), (
    f"index has {len(loaded_index)} vectors but metadata has {len(metadata_df)} rows"
)

# Print the shape of the loaded index
print(f"Shape of the vector database: {loaded_index.size} entries, {loaded_index.ndim} dimensions")

Shape of the vector database: 2763316 entries, 384 dimensions


In [8]:
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the metadata CSV (maps each index key to its image path).
# NOTE(review): relative path — presumably resolved against /kaggle/working; confirm the cwd.
metadata_df = pd.read_csv("image_metadata_text.csv")

# Load the pre-trained SentenceTransformer model first, so the index
# dimensionality can be read from it instead of being hard-coded.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
dimension = model.get_sentence_embedding_dimension()

# Load the usearch index from disk
index = Index(ndim=dimension, dtype=np.float16)
index.load("vector_database_text.usearch")

# Function to plot the retrieved images in a grid (4 images per row by default)
def plot_results(results, top_k=20, images_per_row=4):
    """Display the images referenced by ``results`` in a grid, titled by rank.

    Args:
        results: DataFrame with an ``image_path`` column; rows are plotted in
            the order given and titled "Rank 1", "Rank 2", ...
        top_k: Maximum number of images to show.
        images_per_row: Number of grid columns.

    Returns:
        None. Nothing is drawn when there are no rows to show (empty
        ``results`` or ``top_k <= 0``).
    """
    shown = results.iloc[:max(top_k, 0)]
    if shown.empty:
        return

    # Size the grid from the rows actually available, not from top_k, so a
    # short result list does not leave a mostly blank figure.
    num_rows = (len(shown) + images_per_row - 1) // images_per_row
    plt.figure(figsize=(15, num_rows * 5))

    for rank, image_path in enumerate(shown["image_path"], start=1):
        plt.subplot(num_rows, images_per_row, rank)
        # Close each file as soon as it has been handed to matplotlib, so
        # plotting many results does not accumulate open file handles.
        with Image.open(image_path) as image:
            plt.imshow(image)
        plt.axis("off")
        plt.title(f"Rank {rank}")

    plt.show()

# Embed a query string with the shared SentenceTransformer model
def extract_text_features(text):
    """Return what ``model.encode`` produces for a one-element batch holding ``text``.

    The text is wrapped in a list before encoding, so the result is the
    encoder's batch output (presumably one embedding row — confirm the shape
    against the sentence-transformers documentation).
    """
    single_item_batch = [text]
    return model.encode(single_item_batch)

# Function to query the usearch index using text features
def query_with_text(prompt, top_k=20):
    """Return the metadata rows of the index entries that best match ``prompt``.

    The prompt is embedded with ``extract_text_features``, cast to float16 (the
    dtype the index was created with) and searched in the module-level
    ``index``. The returned keys are then resolved against the ``index`` column
    of the module-level ``metadata_df``.

    Args:
        prompt: Query text.
        top_k: Maximum number of matches to request from the index.

    Returns:
        DataFrame with the matching ``metadata_df`` rows, in the order the
        index returned them (presumably best match first — confirm against the
        usearch documentation). Keys without a metadata row are dropped.
    """
    text_features = extract_text_features(prompt).astype(np.float16)
    matches = index.search(text_features, top_k)

    # Flatten in case keys are nested; cast to a signed integer type so the
    # keys can be compared with, and used to index into, the metadata column.
    keys = np.asarray(matches.keys).ravel().astype(np.int64)
    key_column = metadata_df["index"].to_numpy()

    # Keys are values of the "index" column, not row positions. They coincide
    # only while the table is stored in key order starting at 0, so verify
    # that before using the keys positionally.
    in_range = (keys >= 0) & (keys < len(key_column))
    if in_range.all() and np.array_equal(key_column[keys], keys):
        positions = keys
    else:
        # General case: look every key up by value (-1 marks a missing key)
        positions = pd.Index(key_column).get_indexer(keys)
        positions = positions[positions >= 0]

    return metadata_df.iloc[positions]




In [9]:
# Example usage: retrieve the metadata rows of the indexed texts closest to the prompt
prompt = "trường học"  # Vietnamese for "school"
top_results = query_with_text(prompt, top_k=20)
# plot_results(top_results, top_k=20)  # uncomment to display the matched images
top_results

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,index,image_path
2176695,2176695,/kaggle/input/aic-frames/output/L22_V006.33887...
2250808,2250808,/kaggle/input/aic-frames/output/L22_V027.18514...
2107735,2107735,/kaggle/input/aic-frames/output/L21_V016.3812.jpg
1622669,1622669,/kaggle/input/aic-frames/output/L16_V021.5074.jpg
1442215,1442215,/kaggle/input/aic-frames/output/L14_V026.19131...
1441493,1441493,/kaggle/input/aic-frames/output/L14_V026.12344...
1439348,1439348,/kaggle/input/aic-frames/output/L14_V025.22624...
1184964,1184964,/kaggle/input/aic-frames/output/L12_V013.21949...
1184861,1184861,/kaggle/input/aic-frames/output/L12_V013.21166...
217965,217965,/kaggle/input/aic-frames/output/L03_V009.9445.jpg
