In [6]:
!pip install annoy



In [1]:
# Sanity check: print the installed annoy package metadata (version, install location).
!pip show annoy

Name: annoy
Version: 1.17.3
Summary: Approximate Nearest Neighbors in C++/Python optimized for memory usage and loading/saving to disk.
Home-page: https://github.com/spotify/annoy
Author: Erik Bernhardsson
Author-email: mail@erikbern.com
License: Apache License 2.0
Location: /root/.local/lib/python3.10/site-packages
Requires: 
Required-by: 


In [2]:
!pip install tensorflow-text

Collecting tensorflow-text
  Downloading tensorflow_text-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading tensorflow_text-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow-text
Successfully installed tensorflow-text-2.17.0


In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from annoy import AnnoyIndex

# 1. Load the dataset
# The location is kept in one named constant so the notebook can be pointed at
# a different copy of the CSV without hunting through the code. The default is
# the Colab upload directory.
DATA_PATH = '/content/amazon.csv'
data = pd.read_csv(DATA_PATH)

# 2. NLP Preprocessing
def preprocess_text(text):
    """Normalise a raw text field to lower case.

    Args:
        text: The value to normalise. Normally a ``str``, but cells read from
            a CSV can also be ``None`` or a float NaN (missing value), or a
            non-string scalar such as a number.

    Returns:
        The lower-cased string. Missing values (``None`` / NaN) become the
        empty string; other non-string values are converted with ``str()``
        before lower-casing.
    """
    if isinstance(text, str):
        return text.lower()
    # NaN is the only float that is not equal to itself; treat it (and None)
    # as "no text" instead of crashing on a missing ``.lower`` attribute.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return str(text).lower()

# Normalise, in place, every free-text column that feeds the tokenizer.
for text_column in ('product_name', 'category', 'about_product'):
    data[text_column] = data[text_column].apply(preprocess_text)

# 3. Tokenization and Embedding
# Fit one shared vocabulary over all three text fields so that a given word
# maps to the same token id regardless of the column it appears in.
tokenizer = Tokenizer()
texts = [
    entry
    for field in ('product_name', 'category', 'about_product')
    for entry in data[field].tolist()
]
tokenizer.fit_on_texts(texts)

# Encode each field as token ids and bring every row to a fixed length of
# `max_len` (padding is appended at the end of the sequence).
max_len = 50
for field in ('product_name', 'category', 'about_product'):
    token_ids = tokenizer.texts_to_sequences(data[field])
    data[field + '_seq'] = pad_sequences(token_ids, maxlen=max_len, padding='post').tolist()

# Embedding layer
# +1 leaves room for id 0, which pad_sequences uses as its fill value.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50  # You can adjust this dimension

# The layer is never trained in the code shown here, so its weights are just
# the random initial values. Seeding the initializer makes those values (and
# therefore every embedding and search result derived from them) reproducible
# across kernel restarts. RandomUniform(-0.05, 0.05) is the same distribution
# the layer uses by default; only the seed is new.
EMBEDDING_SEED = 42
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.05, maxval=0.05, seed=EMBEDDING_SEED
    ),
)

def get_embeddings(seq_list):
    """Mean-pool the embedding vectors of one token-id sequence.

    Args:
        seq_list: A single sequence of integer token ids.

    Returns:
        A 1-D NumPy array: the average of the embedding vectors of all
        positions in ``seq_list``. Every position counts equally, so any
        padding ids contained in the sequence are part of the average.
    """
    # Wrap the sequence in a list so the layer sees a batch of size one.
    batch = tf.convert_to_tensor([seq_list], dtype=tf.int32)
    token_vectors = embedding_layer(batch).numpy()
    # Average over the sequence axis, then unwrap the single batch entry.
    pooled = token_vectors.mean(axis=1)
    return pooled[0]

# Turn every padded id sequence into one pooled embedding vector per row,
# field by field.
for field_name in ('product_name', 'category', 'about_product'):
    data[field_name + '_emb'] = data[field_name + '_seq'].apply(get_embeddings)

# 4. Combine embeddings into a single vector for each product
def _mean_of_field_embeddings(row):
    """Average the three per-field embedding vectors of one product row."""
    field_vectors = [
        row['product_name_emb'],
        row['category_emb'],
        row['about_product_emb'],
    ]
    return np.mean(field_vectors, axis=0)

data['combined_emb'] = data.apply(_mean_of_field_embeddings, axis=1)

# 5. Build the Annoy Index
dimension = embedding_dim  # The dimension used in the embedding layer
annoy_index = AnnoyIndex(dimension, 'angular')

# Annoy builds its trees with random splits; fix the seed (before any item is
# added) so the same data always yields the same index and search results.
annoy_index.set_seed(42)

# Add items to Annoy index.
# Items are keyed by row *position* (0..n-1), not by the DataFrame index label;
# lookups must therefore go through data.iloc.
for i, embedding in enumerate(data['combined_emb']):
    annoy_index.add_item(i, embedding)

annoy_index.build(n_trees=10)  # Build the index with 10 trees for efficiency

# 6. Query for k=20 similar products
def find_similar_products(query_text, k=20):
    """Return the ``k`` catalogue rows whose embedding is closest to a query.

    The query goes through exactly the same pipeline as the indexed products
    (``preprocess_text`` -> tokenizer -> padding -> ``get_embeddings``), so the
    query vector is always comparable to the vectors stored in the index.

    Args:
        query_text: Free-text search string.
        k: Maximum number of neighbours to return (default 20).

    Returns:
        A DataFrame slice of ``data`` holding the matching rows, in the order
        returned by the Annoy index. If no word of the query is in the
        tokenizer vocabulary the result is an empty slice of ``data``: such a
        query would otherwise be embedded as pure padding and return
        arbitrary products.
    """
    token_ids = tokenizer.texts_to_sequences([preprocess_text(query_text)])
    if not token_ids[0]:
        # Nothing recognisable in the query: report "no matches" rather than
        # neighbours of the padding vector.
        return data.iloc[[]]

    query_seq = pad_sequences(token_ids, maxlen=max_len, padding='post')
    # Reuse the shared pooling helper instead of duplicating its logic here.
    query_emb = get_embeddings(query_seq[0].tolist())
    similar_indices = annoy_index.get_nns_by_vector(query_emb, k)
    # Annoy ids are row positions, hence iloc rather than loc.
    return data.iloc[similar_indices]

# Demo: look up the nearest catalogue entries for a free-text query
query = "smartphone with high-resolution camera"
similar_products = find_similar_products(query)

# Show only the identifying and descriptive columns of the matches.
result_columns = ['product_id', 'product_name', 'category', 'about_product']
print(similar_products[result_columns])


      product_id                                       product_name  \
1382  B07TTSS5MP  lifelong llmg74 750 watt mixer grinder with 3 ...   
1235  B095XCRDQW  esquire laundry basket brown, 50 ltr capacity(...   
1111  B071VNHMX2  philips daily collection hd2582/00 830-watt 2-...   
217   B08PPHFXG3  posh 1.5 meter high speed gold plated hdmi mal...   
904   B00LY1FN1K  camel fabrica acrylic ultra color - 15ml each,...   
1118  B09CGLY5CX  crompton insta comfort heater 2000 watts heat ...   
1322  B08ZHYNTM1  havells festiva 1200mm dust resistant ceiling ...   
55    B01N90RZ4M                          tata sky universal remote   
1372  B07F1T31ZZ  raffles premium stainless steel south indian c...   
431   B0116MIKKC  goldmedal curve plus 202042 plastic spice 3-pi...   
1312  B09VL9KFDB   havells gatik neo 400mm pedestal fan (aqua blue)   
468   B0B244R4KB  spigen ez fit tempered glass screen protector ...   
1453  B07K19NYZ8               usha hc 812 t thermo fan room heater   
447   