# Creation of the Vector Database to store MtG Card Images

We will be using Pinecone to create a vector database that stores the MtG card image embeddings.

In [10]:
import os
import numpy as np
import pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import cv2
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.efficientnet import preprocess_input

# Load the EfficientNetB0 model pre-trained on ImageNet, excluding the top layers
base_model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg')

# Pinecone Initialization (new method)
api_key = os.getenv("PINECONE_API_KEY")  # Get your API key from environment variables
pinecone_client = pinecone.Pinecone(api_key=api_key)

# Check existing indexes
indexes = pinecone_client.list_indexes().names()  # Returns a list of existing index names
print(f"Existing indexes: {indexes}")

# Connect to a Pinecone index
index_name = 'mtg-cards-index-efficientnet'
if index_name not in indexes:
    print(f"Creating index: {index_name}")
    pinecone_client.create_index(
        name=index_name, 
        dimension=1280,  # Dimension matches the EfficientNetB0 output
        metric='cosine', 
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        )
    )
else:
    print(f"Index {index_name} already exists. Skipping creation.")
    
index = pinecone_client.Index(index_name)

def resize_with_padding(img, target_size):
    h, w, _ = img.shape
    scale = min(target_size / h, target_size / w)
    new_w = int(w * scale)
    new_h = int(h * scale)
    resized_img = cv2.resize(img, (new_w, new_h))
    padded_img = np.full((target_size, target_size, 3), 255, dtype=np.uint8)
    pad_w = (target_size - new_w) // 2
    pad_h = (target_size - new_h) // 2
    padded_img[pad_h:pad_h + new_h, pad_w:pad_w + new_w, :] = resized_img
    return padded_img

def preprocess_image_with_padding(img_path, target_size=(224, 224)):
    img = cv2.imread(img_path)
    padded_img = resize_with_padding(img, target_size[0])
    padded_img = image.img_to_array(padded_img)
    padded_img = np.expand_dims(padded_img, axis=0)
    padded_img = preprocess_input(padded_img)
    return padded_img

def embed_image(img_path):
    img_array = preprocess_image_with_padding(img_path)
    embedding = base_model.predict(img_array)
    return embedding.flatten()

def batch_upsert_embeddings(image_dir, batch_size=100, namespace="cards"):
    batch = []
    for img_name in os.listdir(image_dir):
        img_path = os.path.join(image_dir, img_name)
        try:
            # Generate the embedding
            embedding = embed_image(img_path)
            # Add the vector to the batch with the image name as the ID
            batch.append({"id": img_name, "values": embedding.tolist()})
            # Upsert the batch when it reaches the specified size
            if len(batch) >= batch_size:
                index.upsert(vectors=batch, namespace=namespace)
                print(f"Upserted batch of {batch_size} vectors.")
                batch.clear()  # Clear the batch
        except Exception as e:
            print(f"Error processing image {img_name}: {e}")
    
    # Upsert any remaining vectors in the last batch
    if batch:
        index.upsert(vectors=batch, namespace=namespace)
        print(f"Upserted final batch of {len(batch)} vectors.")



Existing indexes: ['mtg-cards-index-efficientnet']
Index mtg-cards-index-efficientnet already exists. Skipping creation.


In [13]:
# Directory where images are saved
image_dir = "Datasets/mtg_images"

# Run the batch upsert
batch_upsert_embeddings(image_dir, batch_size=100, namespace="mtg_cards")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41

KeyboardInterrupt: 

In [14]:
stats = index.describe_index_stats()
print(stats)

{'dimension': 1280,
 'index_fullness': 0.0,
 'namespaces': {'mtg_cards': {'vector_count': 95486}},
 'total_vector_count': 95486}


## Testing a picture from my phone and IT WORKS

The top 2 results are the card I uploaded

In [17]:
test = embed_image('Datasets/mtg_test_images/shuko.jpg')

# Convert the NumPy array to a list
test_list = test.tolist()

query_results = index.query(
    namespace="mtg_cards",
    vector=test_list,  # Pass the list instead of the NumPy array
    top_k=3
)

print(query_results)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
{'matches': [{'id': 'a47456b8-cef8-4085-90b1-92788e16fd27.jpg',
              'score': 0.930829465,
              'values': []},
             {'id': 'a88ed77e-5eae-4282-8f19-75dc8437f83d.jpg',
              'score': 0.919052958,
              'values': []},
             {'id': '011bc5b7-c4d5-4c4c-af0d-aa0853d63f3a.jpg',
              'score': 0.815545797,
              'values': []}],
 'namespace': 'mtg_cards',
 'usage': {'read_units': 6}}
