In [1]:
from futils import *
llms=get_llms()

#llms[1].invoke("I need t-shirt for my son")

In [6]:
from classifier import Classifier

c=Classifier(llms[0])

q="some men socks"
r=c.classify(q)
r

ClassifierOutput(category=['Socks'], number_of_items=5)

In [None]:
# from PIL import Image
# from clip import generate_embeddings
# image = Image.open("input_image.jpg")

# embedding = generate_embeddings(text=q)

In [None]:
import glob
import os

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS # Import FAISS
from PIL import Image
# from blip2 import generate_embeddings
from clip import generate_embeddings
import pandas as pd
# Instantiate the embedding model
# This is needed by FAISS.from_embeddings
hf_embeddings = HuggingFaceEmbeddings(model_name="nomic-ai/modernbert-embed-base")

# Define the path where the FAISS index will be saved
FAISS_INDEX_PATH = "faiss_image_clip_index"



try:
    #Read dataset metadata file

    df=pd.read_csv("/media/asif-iqbal-khan/New Volume/Code/multimodal_search_agent/archive/fashion-dataset/styles.csv", on_bad_lines='skip')

    # Read FAISS index if already created
    faiss_db=None
    
    faiss_db = FAISS.load_local(f'{FAISS_INDEX_PATH}', hf_embeddings, allow_dangerous_deserialization=True)
    print(f'Loaded FAISS index with {faiss_db.index.ntotal} vectors.')
    
except Exception as e:
     print(f"Error loading Index: {e}")


if __name__ == "__main__":

    # generate image embeddings
    # save path to image in text (text=image path)
    # save metadata (category etc.)

    texts = [] # Will store image paths
    embeddings = [] # Will store embedding vectors
    metadatas = [] # Will store metadata dictionaries

    limit = 300 # Process up to 'limit + 1' images (0 to 1000)
    skip = 10 # Skip the first 'skip' images
    print("Starting image processing and embedding generation...")

    # Use os.walk or glob carefully. glob is fine here if structure is simple.
    # Use os.sep for cross-platform path compatibility.
    image_files = glob.glob(os.path.join("archive", "fashion-dataset", "images", "*"))
    print(f"Found {len(image_files)} image files.")


    for i, img_path in enumerate(image_files):
        if i<=skip:
            continue
        if i > skip+limit:
            print(f"Reached limit of {limit}, stopping.")
            break

        # Extract image ID from filename (e.g., '12345.jpg' -> '12345')
        # Use os.path.basename and os.path.splitext for robustness
        img_filename = os.path.basename(img_path)
        id = os.path.splitext(img_filename)[0]
        
        # Lookup metadata in the DataFrame
        try:
            # Filter df for the current image ID           
            
            f_df = df[df['id'] == int(id)]
            
            if f_df.empty:
                print(f"Warning: No data found in df for image id {id}. Skipping image {img_path}.")
                continue # Skip this image if no metadata is found

            # Extract metadata from the first matching row (assuming ID is unique)

            cat = f_df['articleType'].iloc[0]
            gender = f_df['gender'].iloc[0]
            s_cat = f_df['subCategory'].iloc[0]

            print(f"Processing {i}: {img_path} - {cat}, {s_cat}, {gender}") # Suppress frequent printing

        except KeyError as e:
             print(f"Error accessing DataFrame column for {img_path}: {e}. Skipping.")
             continue # Skip if metadata column is missing
        except Exception as e:
             print(f"An unexpected error occurred processing DataFrame for {img_path}: {e}. Skipping.")
             continue

        try:
            # Open the image and generate embedding
            image = Image.open(img_path).convert("RGB") # Ensure RGB format
            embedding = generate_embeddings(image=image)

            # Ensure the embedding is a list of floats
            if hasattr(embedding, 'tolist'): # Check if it's a numpy array or similar
                 embedding_list = embedding.tolist()
            else:
                 # Assume it's already a list or convertible directly
                 embedding_list = list(embedding)

            texts.append(img_path) # Store the image path as the 'text'
            embeddings.append(embedding_list)
            metadatas.append({"category": cat, "subCategory": s_cat, "gender": gender}) # Add all relevant metadata

        except FileNotFoundError:
             print(f"Error: Image file not found at {img_path}. Skipping.")
             continue
        except Exception as e:
             print(f"Error processing image or generating embedding for {img_path}: {e}. Skipping.")
             continue

    # Check if any data was successfully processed
    if not texts:
        print("No images processed or embeddings generated. Nothing to create FAISS index from.")
        exit()

    print(f"Finished processing {len(texts)} images.")
    print("Creating FAISS index from collected embeddings and metadata...")

    # FAISS.from_embeddings requires a list of tuples (text, embedding)
    text_embeddings = list(zip(texts, embeddings))

    # Create the FAISS index using the from_embeddings class method
    # This method handles the internal index, docstore, etc. creation correctly.
    if faiss_db is None:
        faiss_db = FAISS.from_embeddings(
            text_embeddings=text_embeddings, # List of (text, embedding) tuples
            embedding=hf_embeddings,        # The embedding function object
            metadatas=metadatas             # List of metadata dictionaries, aligned with text_embeddings
        )
    else:
        faiss_db.add_embeddings(
            text_embeddings=text_embeddings, # List of (text, embedding) tuples
            embedding=hf_embeddings,        # The embedding function object
            metadatas=metadatas             # List of metadata dictionaries, aligned with text_embeddings
        )
    print("FAISS index created in memory.")
    # --- End Modification ---


    # Save the FAISS index to disk for persistence
    # This creates two files: index.faiss and index.pkl (for docstore/metadata)
    print(f"Saving FAISS index to directory: {FAISS_INDEX_PATH}")
    faiss_db.save_local(FAISS_INDEX_PATH)
    print("FAISS index saved successfully.")

    # --- How to Load the Index Later ---
    print("\n---")
    print(f"To load this FAISS index later for searching, use:")
    print(f"from langchain_community.vectorstores import FAISS")
    print(f"from langchain_huggingface.embeddings import HuggingFaceEmbeddings")
    print(f"# IMPORTANT: Use the SAME embedding model as used for creation")
    print(f"loaded_embeddings = HuggingFaceEmbeddings(model_name='nomic-ai/modernbert-embed-base')")
    print(f"loaded_faiss_db = FAISS.load_local('{FAISS_INDEX_PATH}', loaded_embeddings)")
    print(f"print(f'Loaded FAISS index with {{loaded_faiss_db.index.ntotal}} vectors.')")
    print("---")

Loaded FAISS index with 10 vectors.
Starting image processing and embedding generation...
Found 44441 image files.
Processing 11: archive/fashion-dataset/images/9972.jpg - Tshirts, Topwear, Women
Processing 12: archive/fashion-dataset/images/9973.jpg - Sweatshirts, Topwear, Men
Processing 13: archive/fashion-dataset/images/9974.jpg - Tshirts, Topwear, Men
Processing 14: archive/fashion-dataset/images/9975.jpg - Tshirts, Topwear, Men
Processing 15: archive/fashion-dataset/images/9976.jpg - Sweatshirts, Topwear, Men
Processing 16: archive/fashion-dataset/images/9977.jpg - Caps, Headwear, Unisex
Processing 17: archive/fashion-dataset/images/9978.jpg - Caps, Headwear, Unisex
Processing 18: archive/fashion-dataset/images/9980.jpg - Jackets, Topwear, Men
Processing 19: archive/fashion-dataset/images/9981.jpg - Jackets, Topwear, Men
Processing 20: archive/fashion-dataset/images/9982.jpg - Tshirts, Topwear, Men
Processing 21: archive/fashion-dataset/images/9983.jpg - Tshirts, Topwear, Men
Proc

In [None]:
#Create Vector store from images only without any csv file metadata or filters


import glob
import os

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS # Import FAISS
from PIL import Image
from blip2 import generate_embeddings
import pandas as pd
# Instantiate the embedding model
# This is needed by FAISS.from_embeddings
hf_embeddings = HuggingFaceEmbeddings(model_name="nomic-ai/modernbert-embed-base")

# Define the path where the FAISS index will be saved
FAISS_INDEX_PATH = "faiss_screenshots_index"



try:
    #Read dataset metadata file

    #df=pd.read_csv("/media/asif-iqbal-khan/New Volume/Code/multimodal_search_agent/archive/fashion-dataset/styles.csv", on_bad_lines='skip')

    # Read FAISS index if already created
    faiss_db=None
    
    faiss_db = FAISS.load_local(f'{FAISS_INDEX_PATH}', hf_embeddings, allow_dangerous_deserialization=True)
    print(f'Loaded FAISS index with {faiss_db.index.ntotal} vectors.')
    
except Exception as e:
     print(f"Error loading Index: {e}")


if __name__ == "__main__":

    # generate image embeddings
    # save path to image in text (text=image path)
    # save metadata (category etc.)

    texts = [] # Will store image paths
    embeddings = [] # Will store embedding vectors
    metadatas = [] # Will store metadata dictionaries

    limit = 2030 # Process up to 'limit + 1' images (0 to 1000)
    skip = 0 # Skip the first 'skip' images
    print("Starting image processing and embedding generation...")

    # Use os.walk or glob carefully. glob is fine here if structure is simple.
    # Use os.sep for cross-platform path compatibility.
    #image_files = glob.glob(os.path.join("archive", "fashion-dataset", "images", "*"))
    image_files = glob.glob(os.path.join("Screenshots", "*"))
    print(f"Found {len(image_files)} image files.")


    for i, img_path in enumerate(image_files):
        if i<=skip:
            continue
        if i > skip+limit:
            print(f"Reached limit of {limit}, stopping.")
            break

        # Extract image ID from filename (e.g., '12345.jpg' -> '12345')
        # Use os.path.basename and os.path.splitext for robustness
        img_filename = os.path.basename(img_path)
        id = os.path.splitext(img_filename)[0]
        
        print("processing image", img_path)
        try:
            # Open the image and generate embedding
            image = Image.open(img_path).convert("RGB") # Ensure RGB format
            embedding = generate_embeddings(image=image)

            # Ensure the embedding is a list of floats
            if hasattr(embedding, 'tolist'): # Check if it's a numpy array or similar
                 embedding_list = embedding.tolist()
            else:
                 # Assume it's already a list or convertible directly
                 embedding_list = list(embedding)

            texts.append(img_path) # Store the image path as the 'text'
            embeddings.append(embedding_list)
            #metadatas.append({"category": cat, "subCategory": s_cat, "gender": gender}) # Add all relevant metadata

        except FileNotFoundError:
             print(f"Error: Image file not found at {img_path}. Skipping.")
             continue
        except Exception as e:
             print(f"Error processing image or generating embedding for {img_path}: {e}. Skipping.")
             continue

    # Check if any data was successfully processed
    if not texts:
        print("No images processed or embeddings generated. Nothing to create FAISS index from.")
        exit()

    print(f"Finished processing {len(texts)} images.")
    print("Creating FAISS index from collected embeddings and metadata...")

    # FAISS.from_embeddings requires a list of tuples (text, embedding)
    text_embeddings = list(zip(texts, embeddings))

    # Create the FAISS index using the from_embeddings class method
    # This method handles the internal index, docstore, etc. creation correctly.
    if faiss_db is None:
        faiss_db = FAISS.from_embeddings(
            text_embeddings=text_embeddings, # List of (text, embedding) tuples
            embedding=hf_embeddings,        # The embedding function object
            #metadatas=metadatas             # List of metadata dictionaries, aligned with text_embeddings
        )
    else:
        faiss_db.add_embeddings(
            text_embeddings=text_embeddings, # List of (text, embedding) tuples
            embedding=hf_embeddings,        # The embedding function object
            metadatas=metadatas             # List of metadata dictionaries, aligned with text_embeddings
        )
    print("FAISS index created in memory.")
    # --- End Modification ---


    # Save the FAISS index to disk for persistence
    # This creates two files: index.faiss and index.pkl (for docstore/metadata)
    print(f"Saving FAISS index to directory: {FAISS_INDEX_PATH}")
    faiss_db.save_local(FAISS_INDEX_PATH)
    print("FAISS index saved successfully.")

    # --- How to Load the Index Later ---
    print("\n---")
    print(f"To load this FAISS index later for searching, use:")
    print(f"from langchain_community.vectorstores import FAISS")
    print(f"from langchain_huggingface.embeddings import HuggingFaceEmbeddings")
    print(f"# IMPORTANT: Use the SAME embedding model as used for creation")
    print(f"loaded_embeddings = HuggingFaceEmbeddings(model_name='nomic-ai/modernbert-embed-base')")
    print(f"loaded_faiss_db = FAISS.load_local('{FAISS_INDEX_PATH}', loaded_embeddings)")
    print(f"print(f'Loaded FAISS index with {{loaded_faiss_db.index.ntotal}} vectors.')")
    print("---")

Error loading Index: Error in faiss::FileIOReader::FileIOReader(const char*) at /project/faiss/faiss/impl/io.cpp:67: Error: 'f' failed: could not open faiss_screenshots_index/index.faiss for reading: No such file or directory
Starting image processing and embedding generation...
Found 82 image files.
processing image Screenshots/Screenshot_20250501-080152.png
processing image Screenshots/Screenshot_20250502-061756.png
processing image Screenshots/Screenshot_20250502-235311.png
processing image Screenshots/Screenshot_20250503-112817.png
processing image Screenshots/Screenshot_20250503-112820.png
processing image Screenshots/Screenshot_20250504-014757.png
processing image Screenshots/Screenshot_20250505-154346.png
processing image Screenshots/Screenshot_20250506-075709.png
processing image Screenshots/Screenshot_20250506-144541.png
processing image Screenshots/Screenshot_20250506-172720.png
processing image Screenshots/Screenshot_20250508-105714.png
processing image Screenshots/Screensho

ValueError: texts and metadatas expected to be equal length but len(texts)=81 and len(metadatas)=0

In [None]:

#using Postrgres DB
import glob
import os

from dotenv import load_dotenv
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_postgres.vectorstores import PGVector
from PIL import Image

from blip2 import generate_embeddings

load_dotenv("env/connection.env")

CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver=os.getenv("DRIVER"),
    host=os.getenv("HOST"),
    port=os.getenv("PORT"),
    database=os.getenv("DATABASE"),
    user='admin',#os.getenv("USERNAME"),
    password=os.getenv("PASSWORD"),
)

vector_db = PGVector(
    embeddings=None,#HuggingFaceEmbeddings(model_name="nomic-ai/modernbert-embed-base"),  # does not matter for our use case
    collection_name="fashion",
    connection=CONNECTION_STRING,
    use_jsonb=True,
)

if __name__ == "__main__":

    # generate image embeddings
    # save path to image in text
    # save category in metadata
    texts = []
    embeddings = []
    metadatas = []


    limit=1000
    for i,img in enumerate(glob.glob("archive/fashion-dataset/images/*")):
        
        if i<=-1:
            continue
        id = img.split("/")[-1].split(".")[0]
        f_df = df.filter(pd.col("id") == int(id))
        cat=f_df['articleType'][0]
        gender=f_df['gender'][0]
        s_cat=f_df['subCategory'][0]
        
        print(i,":",img," ",cat,s_cat, gender)
        texts.append(img)
        embeddings.append(generate_embeddings(image=Image.open(img)).tolist())
        metadatas.append({"category": cat})

        if i > limit:
            break   
        
    vector_db.add_embeddings(texts, embeddings, metadatas)

0 : archive/fashion-dataset/images/9961.jpg   Tshirts Topwear Men
1 : archive/fashion-dataset/images/9962.jpg   Tshirts Topwear Men
2 : archive/fashion-dataset/images/9963.jpg   Tshirts Topwear Men
3 : archive/fashion-dataset/images/9964.jpg   Tshirts Topwear Women
4 : archive/fashion-dataset/images/9965.jpg   Wristbands Sports Accessories Unisex
5 : archive/fashion-dataset/images/9966.jpg   Tshirts Topwear Women
6 : archive/fashion-dataset/images/9967.jpg   Track Pants Bottomwear Men
7 : archive/fashion-dataset/images/9968.jpg   Tshirts Topwear Men
8 : archive/fashion-dataset/images/9969.jpg   Tshirts Topwear Men
9 : archive/fashion-dataset/images/9970.jpg   Sweatshirts Topwear Men
10 : archive/fashion-dataset/images/9971.jpg   Sweatshirts Topwear Men
11 : archive/fashion-dataset/images/9972.jpg   Tshirts Topwear Women
12 : archive/fashion-dataset/images/9973.jpg   Sweatshirts Topwear Men
13 : archive/fashion-dataset/images/9974.jpg   Tshirts Topwear Men
14 : archive/fashion-dataset/i

In [None]:
faiss_db.docstore._dict

[Document(id='a60a4a2a-5b8c-4834-afb0-f8fbebb5a254', metadata={'category': 'Tshirts'}, page_content='archive/fashion-dataset/images/9962.jpg'),
 Document(id='303246d8-55c4-4261-94df-26fcea1114a9', metadata={'category': 'Tshirts'}, page_content='archive/fashion-dataset/images/9963.jpg')]