## Generate EffNetV2 Image embeddings for the train and test dataset

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import pandas as pd

In [4]:
import os
from PIL import Image, ImageFile
from collections import Counter
from tqdm import tqdm

ImageFile.LOAD_TRUNCATED_IMAGES = True  # skip errors for corrupt images

image_directory = "/kaggle/input/ml-challenge-test/train-images"

min_width = float('inf')
min_height = float('inf')
total_width = 0
total_height = 0
count = 0
resolution_counter = Counter()

In [None]:

for filename in tqdm(os.listdir(image_directory)):
    if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
        file_path = os.path.join(image_directory, filename)
        try:
            with Image.open(file_path) as img:
                width, height = img.size  # doesn’t load full data

                min_width = min(min_width, width)
                min_height = min(min_height, height)
                total_width += width
                total_height += height
                count += 1
                resolution_counter[(width, height)] += 1

        except Exception:
            pass  # ignore problematic files

if count > 0:
    avg_width = total_width / count
    avg_height = total_height / count
    most_common_res, freq = resolution_counter.most_common(1)[0]
    print(f"Processed {count} images.")
    print(f"Minimum: {min_width}x{min_height}")
    print(f"Average: {avg_width:.2f}x{avg_height:.2f}")
    print(f"Most common: {most_common_res[0]}x{most_common_res[1]} ({freq} times)")

In [5]:
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import matplotlib.pyplot as plt
import keras
from keras import layers
from keras.applications import EfficientNetV2S
import joblib
from pathlib import Path

2025-10-12 11:47:16.652261: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760269636.855910      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760269636.930117      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
# IMG_SIZE is determined by EfficientNet model choice
IMG_SIZE = 384
BATCH_SIZE = 64

In [7]:
df = pd.read_csv('/kaggle/input/my-dataset/train.csv')
df.head()

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [10]:
df["filename"] = df["image_link"].astype(str).str.split("/").str[-1]

In [11]:
df.head()

Unnamed: 0,sample_id,catalog_content,image_link,price,filename
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,51mo8htwTHL.jpg
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,71YtriIHAAL.jpg
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,51+PFEe-w-L.jpg
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,41mu0HAToDL.jpg
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,41sA037+QvL.jpg


In [None]:
df.to_csv("/kaggle/working/train2.csv")

In [12]:
df["filename"].nunique()

72288

In [13]:
len(os.listdir("/kaggle/input/ml-challenge-images-train/train-images"))


72287

In [None]:
dftest = pd.read_csv('/kaggle/input/my-dataset/test.csv')
dftest.head()

In [None]:
# Extract just the filename from the image_link column 
dftest["filename"] = dftest["image_link"].apply(lambda x: Path(x).name)

In [None]:
dftest.head()

In [None]:
dftest.to_csv("/kaggle/working/test2.csv")

In [None]:
dftest["filename"].nunique()

In [None]:
# Enable mixed precision (uses float16 where safe, float32 where needed)
# Can give 2-3x speedup on modern GPUs
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Enable memory growth (prevents TF from hogging all GPU memory)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Found {len(gpus)} GPU(s)")
    except RuntimeError as e:
        print(e)

In [None]:
filepath = "/kaggle/input/ml-challenge-images-train/train-images/81MPCbA0uSL.jpg"

In [None]:
try:
    # Read image file
    img = tf.io.read_file(filepath)
    
    # Decode JPEG/PNG - TensorFlow uses optimized libraries
    img = tf.io.decode_image(img, channels=3, expand_animations=False)
    
    # Ensure shape is set (needed for some images)
    img.set_shape([None, None, 3])
    
    # Resize - uses optimized C++ implementation
    img = tf.image.resize(img, [self.img_size, self.img_size], 
                         method='bilinear')
    
    # Preprocess for EfficientNetV2
    img = preprocess_input(img)
except Exception as e:
    print(e)

In [17]:
class ImageEmbeddingManager:
    """
    Manages image embeddings with efficient lookup and missing data handling.
    Think of this as a smart dictionary that remembers image features.
    """
    
    def __init__(self, image_folder, embedding_dim=1280, img_size=384):  # EfficientNetV2-S default
        self.image_folder = Path(image_folder)
        self.embedding_dim = embedding_dim
        self.img_size = img_size
        self.missing_files = []
        self.embeddings_cache = {}  # filename -> embedding mapping

    def _load_and_preprocess_image(self, filepath):
        """
        TensorFlow-native image loading - happens on GPU/optimized CPU threads
        
        This function runs in parallel across multiple CPU threads automatically!
        """
        # Read image file
        img = tf.io.read_file(filepath)
        
        # Decode JPEG/PNG - TensorFlow uses optimized libraries
        img = tf.image.decode_image(img, channels=3, expand_animations=False)
        
        # Ensure shape is set (needed for some images)
        img.set_shape([None, None, 3])
        
        # Resize - uses optimized C++ implementation
        img = tf.image.resize(img, [self.img_size, self.img_size], 
                             method='bilinear')
        
        # Preprocess for EfficientNetV2
        img = preprocess_input(img)
        
        return img

    def create_embeddings_lookup_optimized(self, model, filenames_list, batch_size=128):
        """
        GPU-optimized batch processing using tf.data pipeline.
        
        Key optimizations:
        1. Parallel image loading (multiple CPU threads)
        2. Prefetching (prepare next batch while GPU processes current)
        3. Larger batch sizes without memory issues
        4. All operations in TensorFlow graph (minimal Python overhead)
        
        Args:
            model: Your feature_extractor model
            filenames_list: List of all filenames from CSV
            batch_size: Try 64-128 for better GPU utilization
        """
        # Get unique filenames that exist
        unique_filenames = set(filenames_list)
        existing_files = []
        missing_files = []
        corrupted_files = []
        
        for fname in tqdm(unique_filenames):
            filepath = self.image_folder / fname
            if fname == "81MPCbA0uSL.jpg":
                corrupted_files.append(fname)
                continue
            if filepath.exists():
                existing_files.append((str(filepath), fname))
            else:
                self.missing_files.append(fname)
        
        print(f"Found {len(existing_files)} existing images")
        print(f"Missing {len(self.missing_files)} images")
        print(f"Corrupted: {len(corrupted_files)}")
        print(f"Processing with batch_size={batch_size}...")
        
        if not existing_files:
            print("No images to process!")
            return self.embeddings_cache

        filepaths, filenames = zip(*existing_files)
        
        # Create tf.data.Dataset - this is where the magic happens!
        dataset = tf.data.Dataset.from_tensor_slices(list(filepaths))
        
        # Apply transformations
        dataset = dataset.map(
            self._load_and_preprocess_image,
            num_parallel_calls=tf.data.AUTOTUNE  # Automatically uses multiple CPU cores
        )
        
        # Batch the data
        dataset = dataset.batch(batch_size)
        
        # CRITICAL: Prefetch allows CPU to prepare next batch while GPU processes current
        dataset = dataset.prefetch(tf.data.AUTOTUNE)
        
        # Process batches
        total_batches = (len(existing_files) + batch_size - 1) // batch_size  # Changed from existing_files
        filename_idx = 0
        
        for batch_images in tqdm(dataset, total=total_batches, desc="Extracting embeddings"):
            # This predict() call now gets data instantly from prefetched batches
            batch_embeddings = model.predict(batch_images, verbose=0)
            
            # Store embeddings
            batch_size_actual = len(batch_embeddings)
            for i in range(batch_size_actual):
                fname = filenames[filename_idx + i]  # Changed from filenames to processed_filenames
                self.embeddings_cache[fname] = batch_embeddings[i]
            
            filename_idx += batch_size_actual
        
        self.missing_embedding = np.zeros(self.embedding_dim)
        
        print(f"Created embedding cache with {len(self.embeddings_cache)} entries")
        return self.embeddings_cache
        
    
    def get_embeddings_for_dataframe(self, df, filename_column='filename'):
        """
        Maps embeddings to all rows in dataframe (75K rows).
        Handles duplicates and missing images automatically.
        
        Args:
            df: DataFrame with filename column
            filename_column: Name of column containing image filenames
            
        Returns:
            np.array: Shape (75000, embedding_dim)
        """
        embeddings_list = []
        missing_count = 0
        
        for fname in tqdm(df[filename_column]):
            if fname in self.embeddings_cache:
                embeddings_list.append(self.embeddings_cache[fname])
            else:
                self.embeddings_cache[fname] = np.zeros(self.embedding_dim)
                embeddings_list.append(self.embeddings_cache[fname])
                missing_count += 1
        
        print(f"Mapped embeddings for {len(df)} rows")
        print(f"Used fallback embedding for {missing_count} missing images")
        
        return np.array(embeddings_list)
    
    def save_cache(self, filepath):
        """Save the embedding cache to disk to avoid recomputation"""
        joblib.dump({
            'embeddings_cache': self.embeddings_cache,
            'missing_embedding': self.missing_embedding,
            'embedding_dim': self.embedding_dim
        }, filepath)
        print(f"Saved embedding cache to {filepath}")
    
    def load_cache(self, filepath):
        """Load pre-computed embeddings from disk"""
        data = joblib.load(filepath)
        self.embeddings_cache = data['embeddings_cache']
        self.missing_embedding = data['missing_embedding']
        self.embedding_dim = data['embedding_dim']
        print(f"Loaded {len(self.embeddings_cache)} embeddings from cache")

In [15]:
model = EfficientNetV2S(
    include_top=False,
    weights="imagenet",
    pooling='avg',  # Global average pooling - gives you (1280,) vector
    input_shape=(IMG_SIZE, IMG_SIZE, 3)
)

I0000 00:00:1760269708.907013      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/efficientnet_v2/efficientnetv2-s_notop.h5
[1m82420632/82420632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [18]:
from tensorflow.keras.applications.efficientnet_v2 import preprocess_input
# Feature extractor - use the proper preprocessing function
feature_extractor = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3)),
    model
])

# The preprocess_input function applies the exact normalization that EfficientNetV2 was trained with.

# Make it faster - model is only used for inference
feature_extractor.trainable = False

In [19]:
train_manager = ImageEmbeddingManager('/kaggle/input/ml-challenge-images-train/train-images', embedding_dim=1280)

train_manager.create_embeddings_lookup_optimized(feature_extractor, df['filename'].tolist(), batch_size=128)

100%|██████████| 72288/72288 [03:00<00:00, 401.20it/s]


Found 72286 existing images
Missing 1 images
Corrupted: 1
Processing with batch_size=128...


I0000 00:00:1760270090.167606     108 service.cc:148] XLA service 0x7f660c0026e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1760270090.168488     108 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1760270091.650992     108 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1760270098.623408     108 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
Extracting embeddings: 100%|██████████| 565/565 [13:21<00:00,  1.42s/it]

Created embedding cache with 72286 entries





{'31gj3PmRmoL.jpg': array([-0.16947614,  0.013225  , -0.04282083, ...,  0.01433972,
        -0.1941538 , -0.06099581], dtype=float32),
 '81SzyUAq1gL.jpg': array([-0.17484415,  0.11200497, -0.14287287, ..., -0.15824504,
        -0.0798252 , -0.13388541], dtype=float32),
 '71CMX0XHTYL.jpg': array([-0.1435968 ,  1.1856906 ,  0.129645  , ...,  0.02342304,
         0.02755574, -0.07851438], dtype=float32),
 '71JFQixFn3L.jpg': array([-0.18728991,  0.16267414, -0.07250943, ..., -0.00664058,
        -0.09839652,  0.8768952 ], dtype=float32),
 '81c1Ud8aS1L.jpg': array([-0.18368657,  0.44812754, -0.21599586, ...,  0.19565964,
        -0.18105614,  0.12817863], dtype=float32),
 '81kH8AbxfbL.jpg': array([-0.2027706 ,  0.03727663,  0.02640452, ..., -0.04369027,
        -0.02689905, -0.20531556], dtype=float32),
 '71maP5bDFdL.jpg': array([-0.19389437,  0.26972747, -0.06659035, ..., -0.21313554,
        -0.20656659, -0.20737174], dtype=float32),
 '61OE+-SFsVL.jpg': array([ 0.09849419,  0.30597234,  0

In [20]:
train_image_embeddings = train_manager.get_embeddings_for_dataframe(df)

100%|██████████| 75000/75000 [00:00<00:00, 1416184.44it/s]


Mapped embeddings for 75000 rows
Used fallback embedding for 2 missing images


In [21]:
train_image_embeddings.shape

(75000, 1280)

In [22]:
type(train_image_embeddings)

numpy.ndarray

In [24]:
# Save the array to a .npy file
np.save('/kaggle/working/train_embeddings_efficientnetv2s.npy', train_image_embeddings)

In [26]:
train_manager.save_cache('/kaggle/working/train_embeddings_dict_effnetv2.joblib')

Saved embedding cache to /kaggle/working/train_embeddings_dict_effnetv2.joblib
