In [1]:
from PIL import Image
from matplotlib import pyplot as plt
import numpy as np
from lang_sam import LangSAM
import torch
import os
import cv2
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the model and processor
embeddingModel = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch16")
embeddingProcessor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch16")

In [3]:
class ObjectDetector:
    def __init__(self):
        self.model = LangSAM(sam_type="sam2.1_hiera_tiny", gdino_type="tiny")
    
    def predict(self, image_pil, text_prompt):
        results = self.model.predict([image_pil], [text_prompt])
        return results
    
    def plot_results(self, image_pil, results, text_prompt):
        
        # Convert the mask to a numpy array
        mask = results[0]['masks'][0]

        # Plot the image
        plt.figure(figsize=(10, 10))
        plt.imshow(image_pil)

        # Plot the bounding box
        box = results[0]['boxes'][0]
        plt.gca().add_patch(plt.Rectangle((box[0], box[1]), box[2] - box[0], box[3] - box[1], edgecolor='red', facecolor='none', linewidth=2))

        # Plot the mask
        plt.imshow(np.ma.masked_where(mask == 0, mask), alpha=0.5, cmap='jet')

        plt.title(f"Prediction for: {text_prompt}")
        plt.axis('off')
        plt.show()

    # given an image and results, hide the mask on the image by making the values zero within the mask
    def hide_mask(self, image_pil, results):
        mask = results[0]['masks'][0]
        image_np = np.array(image_pil)
        image_np[mask == 1] = 0
        return Image.fromarray(image_np)

    def crop_image(self, image_pil, results):
        box = results[0]['boxes'][0]
        cropped_image = image_pil.crop((box[0], box[1], box[2], box[3]))
        return cropped_image
    
    def predict_and_crop_image(self, image_pil, text_prompt):
        results = self.predict(image_pil, text_prompt)
        cropped_image = self.crop_image(image_pil, results)
        return cropped_image

    def show_image(self, image_pil):
        plt.imshow(image_pil)
        plt.axis('off')
        plt.show()

    def resize(self, image_pil, long_side):
        width, height = image_pil.size
        if width > height:
            new_width = long_side
            new_height = int(long_side * height / width)
        else:
            new_height = long_side
            new_width = int(long_side * width / height)
        return image_pil.resize((new_width, new_height))
object_detector = ObjectDetector()

In [4]:
def extract_frames(video_path, frame_rate=2):
    video_capture = cv2.VideoCapture(video_path)
    total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = video_capture.get(cv2.CAP_PROP_FPS)
    interval = int(fps / frame_rate)
    frame_number = 0
    frames = []
    while True:
        success, frame = video_capture.read()
        if not success:
            break
        if frame_number % interval == 0:
            frames.append(frame)

        frame_number += 1
    video_capture.release()
    return frames

In [None]:
offsetImagePath = "./offset1.jpg"
offsetFeatures = []
inputs = processor(images=Image.open(offsetImagePath), return_tensors="pt", padding=True)
with torch.no_grad():
    offsetFeatures = model(**inputs)
# print(offsetFeatures)
offsetFeatures = offsetFeatures

In [5]:
def toFilter(path, maxNumber= 1999):
    try:
        number = path.split('_')[0]
        return int(number) <= maxNumber
    except:
        return False

In [None]:
embeddings = {}
def load_embeddings():
    path_to_videos = "./sourceDataVideos/"
    videos = os.listdir(path_to_videos)
    # videos = videos[:2]
    print(videos)
    count = 0
    for video in videos:
        if(not toFilter(video, 1050)):
            continue
        print("--------------------- Processing video: ", count, video)
        count += 1
        video_path = os.path.join(path_to_videos, video)
        frames = extract_frames(video_path, 1)
        print("Number of frames: ", len(frames))
        id = video.split("_")[0]
        frame_number = 0
        for frame in frames:
            print("Processing frame: ", frame_number)
            image = Image.fromarray(frame)
            inputs = embeddingProcessor(images=object_detector.predict_and_crop_image(image, "object, not hand, not background"), return_tensors="pt", padding=True)
            with torch.no_grad():
                features = embeddingModel(**inputs)
                features.image_embeds = features.image_embeds
            embeddings[id + "_" + str(frame_number)] = features
            frame_number += 1
load_embeddings()

In [17]:
video_frame_map = {}
def load_frame_map():
    path_to_videos = "./sourceDataVideos/"
    videos = os.listdir(path_to_videos)
    # videos = videos[:2]
    print(videos)
    count = 0
    for video in videos:
        if(not toFilter(video, 1050)):
            continue
        print("--------------------- Processing video: ", count, video)
        count += 1
        video_path = os.path.join(path_to_videos, video)
        frames = extract_frames(video_path, 1)
        print("Number of frames: ", len(frames))
        id = video.split("_")[0]
        frame_number = 0
        for frame in frames:
            print("Processing frame: ", frame_number)
            # image = Image.fromarray(frame)
            # inputs = embeddingProcessor(images=object_detector.predict_and_crop_image(image, "object, not hand, not background"), return_tensors="pt", padding=True)
            # with torch.no_grad():
            #     features = embeddingModel(**inputs)
            #     features.image_embeds = features.image_embeds
            # embeddings[id + "_" + str(frame_number)] = features
            video_frame_map[id + "_" + str(frame_number)] = Image.fromarray(frame)
            frame_number += 1
load_frame_map()

['1022_Lux-Soap_30.mp4', '1034_Tide-Surf_45.mp4', '1009_Wheel-Soap_10.mp4', '1046_Parachute-Oil_104.mp4', '1051_CorrianderPowder-Spices_5.mp4', '1027_Rin-Surf_10.mp4', '1054_Jeera-Spices_25.mp4', '1069_SmallRava1kg-Grains_40.mp4', '1082_GajrajRice500g-Grains_28.mp4', '1048_Maggi-Noodles_5.mp4', '1017_KrackJack-Biscuit_5.mp4', '1001_Dove-Soap_35.mp4', '1018_Glucose-Biscuit_10.mp4', '1016_Oreo-Biscuit_10.mp4', '1052_SambarPowder-Spices_10.mp4', '1005_Santoor-Soap_40.mp4', '1059_TurmericPowder-Spices_5.mp4', '.DS_Store', '1007_XXX-Soap_10.mp4', '1032_Ariel-Surf_10.mp4', '1066_GroundNut1kg-Grains_140.mp4', '1062_ChickenMasala-Spices_5.mp4', '1049_MustardSeeds-Spices_5.mp4', '1076_GroundNut500g-Grains_70.mp4', '1047_Yippee-Noodles_14.mp4', '1039_Wheel-Surf_38.mp4', '1015_AllRounder-Biscuit_5.mp4', '1058_Efkay-Tea_10.mp4', '1008_Diamond-Soap_10.mp4', '1055_GodavariGhee-Milk_10.mp4', '1050_Kumkum-Temple_5.mp4', '1079_SmallRava500g-Grains_20.mp4', '1040_XXX-Surf_38.mp4', '1014_ButterDelite-Bis

In [7]:
products = {}
def load_products():
    # Read the CSV file
    df = pd.read_csv('output.csv')

    # Create the dictionary
    for _, row in df.iterrows():
        product_id = str(row['id'])
        products[product_id] = {
            'id': product_id,
            'nickname': row['nickname'],
            'price': row['price']
        }
load_products()

In [None]:
images = {}
image_map = {}
image_cropped_map = {}
def load_images():
    path_to_images = "./sourceDataImages/"
    imageDirs = os.listdir(path_to_images)
    print(imageDirs)
    imageDirCount = 0
    for imageDir in imageDirs:
        print("Processing imageDir: ", imageDirCount)
        try:
            if(not toFilter(imageDir, 1050)):
                continue
            imagePaths = os.listdir(os.path.join(path_to_images, imageDir))
            count = 0
            print("Processing imageDir: ", imageDir)
            for imagePath in imagePaths:
                image = Image.open(os.path.join(path_to_images, imageDir, imagePath))
                cropped_image = object_detector.predict_and_crop_image(image, "object, not hand, not background")
                inputs = embeddingProcessor(images=cropped_image, return_tensors="pt", padding=True)
                with torch.no_grad():
                    features = embeddingModel(**inputs)
                    features.image_embeds = features.image_embeds
                images[imageDir + "_" + str(count) ] = features
                image_map[imageDir + "_" + str(count) ] = image
                image_cropped_map[imageDir + "_" + str(count) ] = cropped_image
                count += 1
        except:
            print("Error processing imageDir: ", imageDir)
        imageDirCount += 1
        
load_images()

['1069', '1056', '1051', '1058', '1067', '1060', '1034', '1033', '1005', '1002', '1061', '1059', '1066', '1050', '1068', '1057', '1003', '1032', '1035', '.DS_Store', '1028', '1017', '1010', '1019', '1026', '1021', '1081', '1075', '1072', '1044', '1043', '1020', '1018', '1027', '1011', '1029', '1016', '1042', '1045', '1073', '1074', '1080', '1030', '1037', '1008', '1001', '1006', '1039', '1052', '1055', '1063', '1064', '1007', '1038', '1036', '1009', '1031', '1065', '1062', '1054', '1053', '1071', '1082', '1076', '1049', '1040', '1047', '1078', '1013', '1014', '1022', '1025', '1046', '1079', '1041', '1077', '1083', '1048', '1070', '1024', '1023', '1015', '1012']
Processing imageDir:  1069
Processing imageDir:  1056
Processing imageDir:  1051
Processing imageDir:  1058
Processing imageDir:  1067
Processing imageDir:  1060
Processing imageDir:  1034
Processing imageDir:  1034
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1033
Processing imageDir:  1033
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1005
Processing imageDir:  1005
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1002
Processing imageDir:  1002
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1061
Processing imageDir:  1059
Processing imageDir:  1066
Processing imageDir:  1050
Processing imageDir:  1050
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1068
Processing imageDir:  1057
Processing imageDir:  1003
Processing imageDir:  1003
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1032
Processing imageDir:  1032
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1035
Processing imageDir:  1035
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  .DS_Store
Processing imageDir:  1028
Processing imageDir:  1028
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1017
Processing imageDir:  1017
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1010
Processing imageDir:  1010
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1019
Processing imageDir:  1019
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1026
Processing imageDir:  1026
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1021
Processing imageDir:  1021
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1081
Processing imageDir:  1075
Processing imageDir:  1072
Processing imageDir:  1044
Processing imageDir:  1044
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1043
Processing imageDir:  1043
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1020
Processing imageDir:  1020
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1018
Processing imageDir:  1018
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1027
Processing imageDir:  1027
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1011
Processing imageDir:  1011
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1029
Processing imageDir:  1029
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1016
Processing imageDir:  1016
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1042
Processing imageDir:  1042
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1045
Processing imageDir:  1045
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1073
Processing imageDir:  1074
Processing imageDir:  1080
Processing imageDir:  1030
Processing imageDir:  1030
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1037
Processing imageDir:  1037
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1008
Processing imageDir:  1008
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1001
Processing imageDir:  1001
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1006
Processing imageDir:  1006
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1039
Processing imageDir:  1039
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1052
Processing imageDir:  1055
Processing imageDir:  1063
Processing imageDir:  1064
Processing imageDir:  1007
Processing imageDir:  1007
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1038
Processing imageDir:  1038
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1036
Processing imageDir:  1036
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1009
Processing imageDir:  1009
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1031
Processing imageDir:  1031
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1065
Processing imageDir:  1062
Processing imageDir:  1054
Processing imageDir:  1053
Processing imageDir:  1071
Processing imageDir:  1082
Processing imageDir:  1076
Processing imageDir:  1049
Processing imageDir:  1049
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1040
Processing imageDir:  1040
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Predicting 1 masks


Unused or unrecognized kwargs: padding.


Predicted 1 masks
Processing imageDir:  1047
Processing imageDir:  1047


In [None]:
def searchProduct(imageFeatures):
    # Process the input image
    query_embedding = imageFeatures.image_embeds.detach().cpu().numpy()


    # Calculate cosine similarities
    similarities = []
    for id, embedding in embeddings.items():
        embedding = embedding.image_embeds.detach().cpu().numpy()
        similarity = cosine_similarity(query_embedding, embedding)
        similarities.append((id, similarity))

    top_five = sorted(similarities, key=lambda item: item[1], reverse=True)[:5]
    # filtered_results = [(id, sim) for id, sim in top_five if sim > 0.8]
    # Get the product details for the top five items
    filtered_products = []
    for closest_id, maxSimilarity in top_five:
        # closest_id = closest_id.split("_")[0]
        product_details = products[closest_id.split("_")[0]]
        filtered_products.append((product_details, closest_id, maxSimilarity))

    return filtered_products

In [24]:
match = 0
for id, feature in images.items():
    product_id = id.split("_")[0]
    query_product = products[product_id]
    results = searchProduct(feature)
    print("Query: ", query_product, results)
    
    if query_product['id'] == results[0][0]['id']:
    # or query_product['id'] == results[1][0]['id'] or query_product['id'] == results[2][0]['id'] or query_product['id'] == results[3][0]['id'] or query_product['id'] == results[4][0]['id']:
        match += 1
    
    print(query_product['nickname'], "-------", results[0][0]['nickname'])
print("Match: ", match, "Total: ", len(images))

Match:  0 Total:  0


In [23]:

match = 0
for id, feature in images.items():
    product_id = id.split("_")[0]
    query_product = products[product_id]
    results = searchProduct(feature)
    print("Query: ", query_product, results)
    
    if product_id == results[0][0]['id']:
        match += 1
        
        # Plot images in a 2x1 matrix
        fig, axes = plt.subplots(2, 1, figsize=(10, 10))
        
        # Plot the query image
        query_image = image_map[id]
        axes[0].imshow(query_image)
        axes[0].set_title(f"Query: {query_product['nickname']}")
        axes[0].axis('off')
        
        # Plot the matched video frame
        matched_id = results[0][1]
        matched_frame = video_frame_map[matched_id]
        axes[1].imshow(matched_frame)
        axes[1].set_title(f"Matched: {results[0][0]['nickname']}")
        axes[1].axis('off')
        
        plt.show()
    
    print(query_product['nickname'], "-------", results[0][0]['nickname'])
print("Match: ", match, "Total: ", len(images))

Match:  0 Total:  0
