In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
file_path = '/content/drive/MyDrive/'

In [3]:
import pandas as pd
import os

In [4]:
data = pd.read_csv(os.path.join(file_path, 'Amazon Product Dataset 2020.csv'))

In [5]:
# Adjust Pandas display options to show the full string
pd.set_option('display.max_colwidth', None)

In [6]:
data.shape

(10002, 28)

In [7]:
data.head(3)

Unnamed: 0,Uniq Id,Product Name,Brand Name,Asin,Category,Upc Ean Code,List Price,Selling Price,Quantity,Model Number,...,Product Url,Stock,Product Details,Dimensions,Color,Ingredients,Direction To Use,Is Amazon Seller,Size Quantity Variant,Product Description
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fiberglass Longboard Complete",,,"Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards",,,$237.68,,,...,https://www.amazon.com/DB-Longboards-CoreFlex-Fiberglass-Longboard/dp/B07KMVJJK7,,,,,,,Y,,
1,66d49bbed043f5be260fa9f7fbff5957,"Electronic Snap Circuits Mini Kits Classpack, FM Radio, Motion Detector, Music Box (Set of 5)",,,Toys & Games | Learning & Education | Science Kits & Toys,,,$99.95,,55324.0,...,https://www.amazon.com/Electronic-Circuits-Classpack-Motion-Detector/dp/B008AK6DAS,,,,,,,Y,,
2,2c55cae269aebf53838484b0d7dd931a,"3Doodler Create Flexy 3D Printing Filament Refill Bundle (X5 Pack, Over 1000'. of Extruded Plastics! - Innovate",,,Toys & Games | Arts & Crafts | Craft Kits,,,$34.99,,,...,https://www.amazon.com/3Doodler-Plastic-Innovate-Filament-Refills/dp/B07D36747F,,,,,,,Y,,


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Uniq Id                10002 non-null  object 
 1   Product Name           10002 non-null  object 
 2   Brand Name             0 non-null      float64
 3   Asin                   0 non-null      float64
 4   Category               9172 non-null   object 
 5   Upc Ean Code           34 non-null     object 
 6   List Price             0 non-null      float64
 7   Selling Price          9895 non-null   object 
 8   Quantity               0 non-null      float64
 9   Model Number           8230 non-null   object 
 10  About Product          9729 non-null   object 
 11  Product Specification  8370 non-null   object 
 12  Technical Details      9212 non-null   object 
 13  Shipping Weight        8864 non-null   object 
 14  Product Dimensions     479 non-null    object 
 15  Im

In [9]:
null_columns = [2,3,6,8,
               17,19,20,21,22,
                23,24,26,27]

data.drop(data.columns[null_columns], axis =1, inplace=True)

In [16]:
data['Selling Price']

Unnamed: 0,Selling Price
0,$237.68
1,$99.95
2,$34.99
3,$28.91
4,$17.49
...,...
9997,$9.31
9998,$6.99
9999,$37.95
10000,$3.58


In [17]:
dataset = data.copy()

### Cleaning Selling Price

In [18]:
# Clean 'Selling Price' column by removing special characters and extracting numeric values
dataset['Selling Price_Cleaned'] = (
    dataset['Selling Price']
    .str.extract(r'([0-9]+\.?[0-9]*)')  # Extract the first valid numeric pattern
    .astype(float)                      # Convert to float
)

# Fill missing values in 'Selling Price_Cleaned' with 'NA'
dataset['Selling Price_Cleaned'] = dataset['Selling Price_Cleaned'].fillna("NA")

### Cleaning Shipping Weight

In [19]:
def clean_shipping_weight(weight):
    if isinstance(weight, str):
        weight = weight.lower().strip()
        # Extract numeric value and convert to pounds if necessary
        try:
            if "pounds" in weight:
                return float(weight.split()[0])
            elif "ounces" in weight:
                return round(float(weight.split()[0]) / 16, 2)  # Convert ounces to pounds
        except ValueError:
            return "NA"  # Return 'NA' for invalid entries
    return "NA"

# Apply the updated function
dataset['Shipping Weight_Cleaned'] = dataset['Shipping Weight'].apply(clean_shipping_weight)

In [20]:
# Verify the cleaned column
dataset[['Shipping Weight', 'Shipping Weight_Cleaned']].head()

Unnamed: 0,Shipping Weight,Shipping Weight_Cleaned
0,10.7 pounds,10.7
1,4 pounds,4.0
2,12.8 ounces,0.8
3,13.4 ounces,0.84
4,13.4 ounces,0.84


### Cleaning Is Amazon Seller

In [21]:
# Clean 'Is Amazon Seller' column by converting 'Y'/'N' to boolean
dataset['Is Amazon Seller_Cleaned'] = dataset['Is Amazon Seller'].apply(
    lambda x: True if x == 'Y' else False if x == 'N' else "NA"
)

In [22]:
# Verify the cleaned columns
print(dataset[['Is Amazon Seller', 'Is Amazon Seller_Cleaned']].head())

  Is Amazon Seller  Is Amazon Seller_Cleaned
0                Y                      True
1                Y                      True
2                Y                      True
3                Y                      True
4                Y                      True


### Cleaning Text Attributes

In [23]:
# Create cleaned columns while preserving the original
def clean_text(text):
    if isinstance(text, str):
        return (
            text.lower()                           # Convert to lowercase
            .replace("\n", " ")                    # Remove newline characters
            .replace("|", " ")                     # Remove unwanted characters
            .strip()                               # Remove leading/trailing whitespace
        )
    return text

# Apply cleaning to text-based columns and store them in new columns
text_columns = ['Product Name', 'Category', 'About Product', 'Product Specification', 'Technical Details']
for col in text_columns:
    dataset[f'{col}_Cleaned'] = dataset[col].apply(clean_text)

### Verifying Image URLs

In [24]:
# Check if image URLs are valid by ensuring they start with "http"
valid_image_urls = dataset['Image'].str.startswith("http")

# Confirm all image URLs are valid
if valid_image_urls.all():
    print("All image URLs are valid.")

All image URLs are valid.


In [28]:
final_dataset = dataset[['Product Name_Cleaned', 'Category_Cleaned', 'Selling Price_Cleaned',
                         'About Product_Cleaned', 'Product Specification_Cleaned', 'Technical Details_Cleaned',
                         'Shipping Weight_Cleaned', 'Image', 'Product Url', 'Is Amazon Seller_Cleaned']]

To generate embeddings using the CLIP model, we need to carefully choose columns for text and image embeddings:

### **Columns for Text Embeddings**
These columns contain relevant product descriptions and attributes:
1. **`Product Name_Cleaned`**: Core identifier for the product.
2. **`Category_Cleaned`**: Provides context about the product category.
3. **`About Product_Cleaned`**: Detailed textual description of the product.
4. **`Product Specification_Cleaned`**: Technical details for products that may aid retrieval.
5. **`Technical Details_Cleaned`**: Additional details where available.

We will concatenate these columns into a single text representation for each product.

---

### **Columns for Image Embeddings**
1. **`Image`**: Contains URLs for product images. These will be used directly to generate image embeddings.

---

### **Plan**
1. Combine the selected text columns into a single field for text embeddings.
2. Use the `Image` column for image embeddings, downloading the images if necessary.
3. Generate embeddings for both text and images using CLIP's pre-trained model.

In [31]:
# Combine text columns in final_dataset for embedding generation
final_dataset['Combined_Text'] = (
    final_dataset['Product Name_Cleaned'].fillna('') + ' ' +
    final_dataset['Category_Cleaned'].fillna('') + ' ' +
    final_dataset['About Product_Cleaned'].fillna('') + ' ' +
    final_dataset['Product Specification_Cleaned'].fillna('') + ' ' +
    final_dataset['Technical Details_Cleaned'].fillna('')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset['Combined_Text'] = (


In [32]:
final_dataset['Combined_Text']

Unnamed: 0,Combined_Text
0,"db longboards coreflex crossbow 41"" bamboo fiberglass longboard complete sports & outdoors outdoor recreation skates, skateboards & scooters skateboarding standard skateboards & longboards longboards make sure this fits by entering your model number. responsive flex: the crossbow features a bamboo core encased in triaxial fiberglass and hd plastic for a responsive flex pattern that’s second to none. pumping & carving have never been so satisfying! flex 2 is recommended for people 120 to 170 pounds. coreflex tech: coreflex construction is water resistant, impact resistant, scratch resistant and has a flex like you won’t believe. these boards combine fiberglass, epoxy, hd plastic and bamboo to create a perfect blend of performance and strength. inspired by the northwest: our founding ideal is chasing adventure & riding the best boards possible, inspired by the hills, waves, beaches & mountains all around our headquarters in the northwest best in the world: db was founded out of sheer love of longboarding with a mission to create the best custom longboards in the world, to do it sustainably, & to treat customers & employees like family beyond compare: try our skateboards & accessories if you've tried similar products by sector 9, landyachtz, arbor, loaded, globe, orangatang, hawgs, powell-peralta, blood orange, caliber or gullwing shipping weight: 10.7 pounds (view shipping rates and policies) asin: b07kmvjjk7 #474 in longboards skateboard"
1,"electronic snap circuits mini kits classpack, fm radio, motion detector, music box (set of 5) toys & games learning & education science kits & toys make sure this fits by entering your model number. snap circuits mini kits classpack provides basic electronic circuitry activities for students in grades 2-6 includes 5 separate mini building kits- an fm radio, a motion detector, music box, space battle sound effects, and a flying saucer each kit includes separate components and instructions to build each component represents one function in a circuit; components snap together to create working models of everyday electronic devices activity guide provides additional projects to teach students how circuitry works product dimensions: 14.7 x 11.1 x 10.2 inches ; 4.06 pounds shipping weight: 4 pounds (view shipping rates and policies) domestic shipping: item can be shipped within u.s. international shipping: this item can be shipped to select countries outside of the u.s. learn more asin: b008ak6das item model number: 55324 #3032 in science kits & toys the snap circuits mini kits classpack provides basic electric circuitry information for students in grades 2-6. this classpack includes 5 snap-together building kits. components snap together to create working models of everyday electronic devices. kits included are an fm radio, a motion detector, a music box, space battle sound effects, and a flying saucer. each mini kit comes with individual components, and an activity guide which includes instructions and additional project ideas. each primary-colored component represents one function in a circuit. activity kits are used by teachers and students in classroom and homeschool settings for educational and research applications in science, math, and for a variety of additional disciplines. science education products and manipulatives incorporate applied math and science principles into classroom or homeschool projects. teachers in pre-k, elementary, and secondary classrooms use science education kits, manipualtives, and products alongside science, technology, engineering, and math (stem) curriculum to demonstrate stem concepts and real-world applications through hands-on activities. science education projects include a broad range of activities, such as practical experiments in engineering, aeronautics, robotics, chemistry, physics, biology, and geology."
2,"3doodler create flexy 3d printing filament refill bundle (x5 pack, over 1000'. of extruded plastics! - innovate toys & games arts & crafts craft kits make sure this fits by entering your model number. ✅【smooth 3d drawing experienced the best 3d drawing experience by only using 3doodler create plastics with 3doodler create+ and create 3d printing pen. ✅【safe to use】the 3doodler create plastics, conforms to the health requirements of astm-d-4236 & require no additional labelling in accordance with the us consumer product safety commission’s regulations as mandated by labeling of hazardous art materials act (lhama). 👍【3doodler very own type of plastic】the flexy plastic takes creativity to new levels! you can make flexible fun 3d creations! 🌍【environmentally friendly】3doodler create plastic is made of corn and are 100% compostable! ✅【125 strands of drawing fun】this bundle includes 5 refill filament packs, that's a total of 1043 ft. of 3d drawing and doodling fun! 📱【the 3doodler app】get an interactive experience! the app is packed with dedicated easy to follow stencil section and step by step interactive instructions, receive badges for completed projects and photograph & share your creations directly on social media. the app is fully built on ios & android. ✅【all your favorite colors】this pack includes: green, blue, pink, orange & yellow productdimensions:10.3x3.4x0.8inches itemweight:12.8ounces shippingweight:12.8ounces(viewshippingratesandpolicies) asin:b07d36747f manufacturerrecommendedage:14yearsandup show up to 2 reviews by default no longer are you bound by the rigid constraints of hard plastic! our flexy line you can now squeeze, stretch, and twist your creations providing a truly dynamic doodling experience. do you want to take your creativity to new levels? explore the wide variety of flexy plastic refill colors for your 3doodler create 3d pen! flexy plastics are compatible with the 3doodler v.1, 2.0, and create 3d printing pens. available in single & mixed color pack containing 25 strands each, and single colors tubes containing 100 strands. 12.8 ounces (view shipping rates and policies)"
3,"guillow airplane design studio with travel case building kit toys & games hobbies models & model kits model kits airplane & jet kits make 8 different planes at one time. experiment with different designs and learn about flight. all contained in part trays inside an attractive box with carry handle! guillow's - your one source for flying toys since 1926. made in the usa. productdimensions:3.5x6.2x13inches itemweight:13.4ounces shippingweight:13.4ounces(viewshippingratesandpolicies) domesticshipping:itemcanbeshippedwithinu.s. internationalshipping:thisitemcanbeshippedtoselectcountriesoutsideoftheu.s.learnmore asin:b076y2snhm itemmodelnumber:142 manufacturerrecommendedage:8yearsandup go to your orders and start the return select the ship method ship it! go to your orders and start the return select the ship method ship it! 13.4 ounces (view shipping rates and policies) show up to 2 reviews by default kit contains parts to let your imagination soar and build flying planes! comes with over 35 parts (balsa wood wings, tails & bodies, plastic propellers and landing gear, rubber motors, etc) to let you build your own unique flying toy designs. simple to build, easy to fly and hours of fun!"
4,woodstock- collage 500 pc puzzle toys & games puzzles jigsaw puzzles make sure this fits by entering your model number. puzzle has 500 pieces completed puzzle measure 14 x 19 100% officially licensed merchandise great for fans & puzzlers alike productdimensions:1.9x8x10inches itemweight:13.4ounces shippingweight:13.4ounces(viewshippingratesandpolicies) asin:b07mx21wwx itemmodelnumber:62151 manufacturerrecommendedage:14yearsandup show up to 2 reviews by default 100% officially licensed merchandise; complete puzzle measures 14 x 19 in. 13.4 ounces (view shipping rates and policies)
...,...
9997,"remedia publications rem536b money activity book, grade: 3 to 4, 8.5"" wide, 11"" length, 0.4"" height toys & games learning & education counting & math toys product dimensions: 11 x 8.5 x 0.4 inches ; 1.6 ounces shipping weight: 4.8 ounces (view shipping rates and policies) asin: b000f8xiz6 item model number: rem536b #2593 in counting & math toys #10623 in preschool learning toys show up to 2 reviews by default activities include identifying coins, learning their value, counting coins, matching coins to prices, different ways of showing an amount, and simple addition of money. also includes making change and word problems. this book is ideal for introductory-level teaching or enrichment! grades 3-4."
9998,"trends international nfl la chargers hg - mobile wallet toys & games arts & crafts make sure this fits by entering your model number. dimensions: 6.375''x3''x0.25'' strong, reliable 3m adhesive that allows for easy peel and stick application and repositioning wallet function carries cards or cash keep cords organized with a snap snap out stand to watch videos with ease productdimensions:3x6.5x0.3inches itemweight:0.96ounces shippingweight:0.96ounces(viewshippingratesandpolicies) asin:b07pj181tc manufacturerrecommendedage:4-13years the mobile wallet is made of silicone with strong adhesive on the back. it fits any moblie phone, and holds credit cards, business cards, id's or cash. great for personalization and customization. 0.96 ounces (view shipping rates and policies)"
9999,"newpath learning 10 piece science owls and owl pellets curriculum mastery flip chart set, grade 5-9 office products office & school supplies education & crafts classroom science supplies make sure this fits by entering your model number. ""write-on/wipe-off"" activities perfect for hands-on review of standards-based skills each set contains 10 double-sided charts mounted on durable easel perfect for learning center or independent use includes activity guide featuring black-line copy-masters and exercises product dimensions: 18 x 11.9 x 0.7 inches ; 2.75 pounds shipping weight: 2.8 pounds (view shipping rates and policies) domestic shipping: item can be shipped within u.s. international shipping: this item can be shipped to select countries outside of the u.s. learn more asin: b00dog823y item model number: 34-6015 #2441 in school science supplies #7429 in science kits & toys #10696 in science education supplies show up to 2 reviews by default newpath learning standards-based science owls and owl pellets curriculum mastery flip chart set. each curriculum mastery flip chart is mounted on a sturdy easel. 10 double-sided, laminated 18-inches length by 12-inches width charts. side 1 features a colorful, graphic overview of the topic. side 2 serves as a ""write-on/wipe-off"" activity chart featuring questions, labeling exercises, vocabulary review and more. activity guide featuring blackline copymasters and exercises. set includes the following 10 charts: owls-birds of prey; the barn owl; owl food web; owl pellets; dissecting an owl pellet; rodent skeleton; vole skeleton; mole skeleton; shrew skeleton; bird skeleton. grade 5 - 9. academic standard: california, florida, national, new york, texas."
10000,disney princess do it yourself braid set toys & games arts & crafts craft kits productdimensions:7x7x2inches itemweight:6.1ounces shippingweight:6.1ounces(viewshippingratesandpolicies) asin:b076d3p6sw itemmodelnumber:2888prst manufacturerrecommendedage:0monthsandup disney princess diy braid set 6.1 ounces (view shipping rates and policies)


### Text Embedding Generation

In [35]:
from transformers import CLIPProcessor, CLIPModel
import torch

In [36]:
# Load CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [37]:
# Function to generate text embeddings using CLIP
def generate_text_embeddings(texts):
    inputs = processor(text=texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        embeddings = model.get_text_features(**inputs)
    return embeddings

In [38]:
# Generate embeddings for the combined text
texts = final_dataset['Combined_Text'].tolist()
text_embeddings = generate_text_embeddings(texts)

In [39]:
# Verify embeddings shape
print(text_embeddings.shape)  # Output: (num_samples, embedding_dim)

torch.Size([10002, 512])


### Image Embedding Generation

### **Explanation**

1. **Image Preprocessing**:
   - Downloads images from the URLs provided in the `Image` column.
   - Uses the first image URL when multiple URLs are separated by `|`.

2. **Embedding Generation**:
   - Passes the preprocessed images through the CLIP model to generate image embeddings.

3. **Fallback Handling**:
   - For invalid or missing image URLs, it adds a zero vector as a placeholder.

This code will generate embeddings for all valid images in the dataset.

In [40]:
from PIL import Image
from io import BytesIO
import requests
import torch

In [41]:
def preprocess_image(image_url, timeout=5):
    try:
        response = requests.get(image_url, timeout=timeout)
        if response.status_code == 200:
            image = Image.open(BytesIO(response.content)).convert("RGB")
            inputs = processor(images=image, return_tensors="pt")
            return inputs
        else:
            print(f"Invalid response for URL: {image_url}")
            return None
    except Exception as e:
        print(f"Error processing image {image_url}: {e}")
        return None

In [42]:
def generate_image_embeddings(image_urls, max_retries=3):
    embeddings = []
    embedding_dim = model.config.projection_dim  # Get the embedding dimension from the model
    for url in image_urls:
        main_url = url.split('|')[0]  # Use the first image URL
        inputs = None
        for _ in range(max_retries):  # Retry up to max_retries times
            inputs = preprocess_image(main_url)
            if inputs:  # If successful, break retry loop
                break
        if inputs:
            with torch.no_grad():
                embedding = model.get_image_features(**inputs)
                embeddings.append(embedding)
        else:
            # Append a zero vector with the same shape as embeddings
            print(f"Failed to process image: {main_url}")
            embeddings.append(torch.zeros(1, embedding_dim))  # Ensure correct shape
    return torch.cat(embeddings, dim=0)  # Concatenate tensors along the batch dimension

In [43]:
# Generate image embeddings with improved handling
image_urls = final_dataset['Image'].tolist()
image_embeddings = generate_image_embeddings(image_urls)

Invalid response for URL: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Invalid response for URL: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Invalid response for URL: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Failed to process image: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Invalid response for URL: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Invalid response for URL: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Invalid response for URL: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Failed to process image: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Invalid response for URL: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Invalid response for URL: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Invalid response for URL: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Failed to process image: https://m.media-amazon.com/images/I/A13usaonutL.jpg
Invalid response for URL: https://m.media-amazon.com/images/I/A1nYN

In [44]:
# Verify embeddings shape
print(image_embeddings.shape)  # Output: (num_samples, embedding_dim)

torch.Size([10002, 512])


### Store the embeddings for both text and images in a vector database for efficient retrieval.

In [45]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [46]:
import faiss
import numpy as np

# Combine embeddings into a single matrix
# Combination of Text and Image: Using both modalities together ensures the retrieval system considers both textual and visual similarities.
combined_embeddings = torch.cat((text_embeddings, image_embeddings), dim=1).numpy()

# Initialize FAISS index
d = combined_embeddings.shape[1]  # Dimension of embeddings
# Initializes a FAISS index for exact nearest neighbor search using the L2 (Euclidean) distance metric.
index = faiss.IndexFlatL2(d)  # L2 distance metric

# Add embeddings to the index
index.add(combined_embeddings)

# Save the FAISS index for future use
# his allows you to avoid recomputing the index every time you restart your program
faiss.write_index(index, "multimodal_embeddings.index")

In [47]:
combined_embeddings.shape

(10002, 1024)

### Build a robust system to retrieve the most relevant items based on user queries (text or image).

### Text Query:

Use only the text part of the combined_embeddings (first 512 dimensions).
Search within a FAISS index built for the text embeddings.

### Image Query:

Use only the image part of the combined_embeddings (last 512 dimensions).
Search within a FAISS index built for the image embeddings.

### Multimodal Query:

Combine the text and image embeddings into a single query (1024 dimensions).
Search within the original FAISS index built for the full combined_embeddings.

### Create Separate FAISS Indices

In [49]:
# Text-only FAISS index
text_embeddings = np.ascontiguousarray(combined_embeddings[:, :512])
text_index = faiss.IndexFlatL2(512)
text_index.add(text_embeddings)

# Image-only FAISS index
image_embeddings = np.ascontiguousarray(combined_embeddings[:, 512:])
image_index = faiss.IndexFlatL2(512)
image_index.add(image_embeddings)

# Full multimodal FAISS index
full_embeddings = np.ascontiguousarray(combined_embeddings)
full_index = faiss.IndexFlatL2(1024)
full_index.add(full_embeddings)

In [55]:
# Example query: Generate query embedding
query_embedding = generate_text_embeddings(["DB Longboards CoreFlex Crossbow 41 Bamboo Fiberglass Longboard Complete"]).numpy()

# Perform search in FAISS index
distances, indices = text_index.search(query_embedding, k=10)  # Using text-only FAISS index

# Define ground-truth indices for this query
ground_truth = {0}  # Indices of relevant items

# Evaluate for different cutoff levels
accuracy, recall_at_1 = evaluate_retrieval(indices.flatten(), ground_truth, k=1)
_, recall_at_5 = evaluate_retrieval(indices.flatten(), ground_truth, k=5)
_, recall_at_10 = evaluate_retrieval(indices.flatten(), ground_truth, k=10)

print(f"Accuracy: {accuracy:.2f}")
print(f"Recall@1: {recall_at_1:.2f}")
print(f"Recall@5: {recall_at_5:.2f}")
print(f"Recall@10: {recall_at_10:.2f}")

Accuracy: 0.10
Recall@1: 1.00
Recall@5: 1.00
Recall@10: 1.00


In [56]:
pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.5
    Uninstalling openai-1.54.5:
      Successfully uninstalled openai-1.54.5
Successfully installed openai-0.28.0


In [57]:
pip show openai

Name: openai
Version: 0.28.0
Summary: Python client library for the OpenAI API
Home-page: https://github.com/openai/openai-python
Author: OpenAI
Author-email: support@openai.com
License: 
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, requests, tqdm
Required-by: 


### This function interacts with GPT-4 to generate context-aware responses based on retrieved items:

### Extend for Image Queries: support for image queries by using the image FAISS index

In [59]:
# Assuming embeddings are stored in `combined_embeddings`, `text_embeddings`, and `image_embeddings`

# Text-only FAISS index
text_index = faiss.IndexFlatL2(512)
text_embeddings = np.ascontiguousarray(combined_embeddings[:, :512])  # Text dimensions
text_index.add(text_embeddings)
faiss.write_index(text_index, "text_only.index")

# Image-only FAISS index
image_index = faiss.IndexFlatL2(512)
image_embeddings = np.ascontiguousarray(combined_embeddings[:, 512:])  # Image dimensions
image_index.add(image_embeddings)
faiss.write_index(image_index, "image_only.index")

# Full multimodal FAISS index
full_index = faiss.IndexFlatL2(1024)
full_index.add(combined_embeddings)
faiss.write_index(full_index, "multimodal_embeddings.index")

In [61]:
from transformers import CLIPProcessor, CLIPModel
import torch
import faiss
import openai
from PIL import Image
import requests
from io import BytesIO

# Load the CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load the FAISS indices
text_index = faiss.read_index("text_only.index")
image_index = faiss.read_index("image_only.index")
full_index = faiss.read_index("multimodal_embeddings.index")

In [62]:
# Save to Google Drive
drive_path = "/content/drive/My Drive/FAISS_Indices/"
os.makedirs(drive_path, exist_ok=True)

faiss.write_index(text_index, os.path.join(drive_path, "text_only.index"))
faiss.write_index(image_index, os.path.join(drive_path, "image_only.index"))
faiss.write_index(full_index, os.path.join(drive_path, "multimodal_embeddings.index"))

In [65]:
print(f"Text Index: {text_index.ntotal} entries")
print(f"Image Index: {image_index.ntotal} entries")
print(f"Multimodal Index: {full_index.ntotal} entries")

Text Index: 10002 entries
Image Index: 10002 entries
Multimodal Index: 10002 entries


In [None]:
# Save final_dataset to Google Drive
drive_path = "/content/drive/My Drive/FAISS_Indices/"
os.makedirs(drive_path, exist_ok=True)

final_dataset_path = os.path.join(drive_path, "final_dataset.csv")
final_dataset.to_csv(final_dataset_path, index=False)

print(f"Final dataset saved to Google Drive at: {final_dataset_path}")

Features:
- Accept user input (text or image).
- Display retrieved results and generated answers.