In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import faiss
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
user_text = "Interested in crypto, passive income, and wealth growth. Feeling optimistic about long-term investments."
items_df = pd.read_csv("user_data/products.csv")  # or merge products + content

In [3]:
full_texts = items_df["title"] + ": " + items_df["description"]
print("what all texts do we have - ")
print(full_texts)
print(type(full_texts))
print(full_texts.shape)


what all texts do we have - 
0    Premium Travel Credit Card: Credit card with a...
1    Basic Cashback Card: Credit card with 5% cashb...
2    Robo-Advisory Portfolio: AI-based investment a...
3    HNI Wealth Management: Dedicated relationship ...
4    Crypto Investment Assistant: Tools and tips fo...
5    Senior Citizen FD Plan: Fixed deposit plan wit...
6    Startup Business Loan: Flexible collateral-fre...
7    Gold Loan: Quick loan against gold with low in...
8    Student Education Loan: Low-interest education...
9    Luxury Lifestyle Credit Card: Invite-only cred...
dtype: object
<class 'pandas.core.series.Series'>
(10,)


In [4]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [5]:
item_list = full_texts.tolist()
# print("item_list - ")
# print(item_list)
# print(type(item_list))
# print(len(item_list))
user_embedding = model.encode([user_text])
print("user_embedding - ")
print(user_embedding)
print(type(user_embedding))
print(len(user_embedding))

  

user_embedding - 
[[ 6.99986741e-02 -7.21919015e-02 -3.70294601e-02  6.41100928e-02
   2.07584724e-02 -3.68936360e-02  2.82651577e-02  2.85966378e-02
  -3.80057171e-02  3.60609870e-03 -3.47582549e-02  2.37385146e-02
   3.51367984e-03 -1.70040093e-02  1.52261439e-03 -2.74121631e-02
  -3.46212350e-02 -7.63243139e-02  2.32359022e-03  1.22945267e-03
  -3.41229290e-02 -9.34793800e-02 -2.66011013e-03 -4.17934954e-02
   8.10140595e-02 -8.87544453e-03  3.69502380e-02 -4.91283741e-03
  -4.58917320e-02 -6.32423256e-03 -2.09200516e-04  3.83826159e-02
   8.43814239e-02  7.04247598e-03 -1.99295431e-02  6.61532134e-02
   2.86933295e-02  7.10645365e-03  2.50050481e-02 -7.58625865e-02
   2.15780325e-02 -1.04237430e-01  7.24957231e-03 -5.58718815e-02
  -1.50146754e-02 -8.33128467e-02  4.77273948e-02  1.54195516e-03
   1.48805673e-03 -5.15261665e-02 -4.19148095e-02 -1.08789289e-02
   2.35588811e-02  8.04358646e-02 -8.26791003e-02  7.00417068e-03
  -4.35795449e-02 -2.89481711e-02  2.93540265e-02 -8.94043

In [6]:
user_embedding_curr = user_embedding[0]
print("user_embedding_curr - ")
print(user_embedding_curr)
print(type(user_embedding_curr))
print(len(user_embedding_curr))

user_embedding_curr - 
[ 6.99986741e-02 -7.21919015e-02 -3.70294601e-02  6.41100928e-02
  2.07584724e-02 -3.68936360e-02  2.82651577e-02  2.85966378e-02
 -3.80057171e-02  3.60609870e-03 -3.47582549e-02  2.37385146e-02
  3.51367984e-03 -1.70040093e-02  1.52261439e-03 -2.74121631e-02
 -3.46212350e-02 -7.63243139e-02  2.32359022e-03  1.22945267e-03
 -3.41229290e-02 -9.34793800e-02 -2.66011013e-03 -4.17934954e-02
  8.10140595e-02 -8.87544453e-03  3.69502380e-02 -4.91283741e-03
 -4.58917320e-02 -6.32423256e-03 -2.09200516e-04  3.83826159e-02
  8.43814239e-02  7.04247598e-03 -1.99295431e-02  6.61532134e-02
  2.86933295e-02  7.10645365e-03  2.50050481e-02 -7.58625865e-02
  2.15780325e-02 -1.04237430e-01  7.24957231e-03 -5.58718815e-02
 -1.50146754e-02 -8.33128467e-02  4.77273948e-02  1.54195516e-03
  1.48805673e-03 -5.15261665e-02 -4.19148095e-02 -1.08789289e-02
  2.35588811e-02  8.04358646e-02 -8.26791003e-02  7.00417068e-03
 -4.35795449e-02 -2.89481711e-02  2.93540265e-02 -8.94043669e-02
  

In [7]:
item_embeddings = model.encode(item_list)
print("item_embeddings - ")
print(item_embeddings)
print(type(item_embeddings))  
print(len(item_embeddings))
print(len(item_embeddings[0]))

item_embeddings - 
[[ 0.07280532  0.06593621 -0.03189581 ... -0.04568769  0.00407486
   0.00138527]
 [-0.04366967  0.05792338 -0.03763994 ...  0.02010446  0.0140496
  -0.01410989]
 [ 0.02041199 -0.05910041 -0.07165001 ... -0.1014402  -0.06723908
   0.04628965]
 ...
 [ 0.00789054  0.04559878  0.00719975 ... -0.0789618  -0.05096518
  -0.04313812]
 [ 0.01271158  0.08958375  0.02617336 ... -0.05110588 -0.10428416
  -0.02923345]
 [-0.0136659   0.10419497 -0.01316837 ... -0.05099129 -0.05288475
   0.00391587]]
<class 'numpy.ndarray'>
10
384


In [8]:
# Cosine similarity
start_cosine = time.time()
cosine_sim = cosine_similarity(user_embedding, item_embeddings)
print("cosine_sim - ")
print(cosine_sim)
print(type(cosine_sim))
print(len(cosine_sim))
print(len(cosine_sim[0]))

cosine_sim - 
[[0.24890749 0.17858744 0.4240221  0.35233927 0.6448125  0.29076505
  0.25948638 0.2585975  0.26541302 0.24607527]]
<class 'numpy.ndarray'>
1
10


In [9]:
cosine_sim_final = cosine_sim[0]
print("cosine_sim_final - ")    
print(cosine_sim_final)
print(type(cosine_sim_final))
print(len(cosine_sim_final))

cosine_sim_final - 
[0.24890749 0.17858744 0.4240221  0.35233927 0.6448125  0.29076505
 0.25948638 0.2585975  0.26541302 0.24607527]
<class 'numpy.ndarray'>
10


In [10]:
cosine_sim_sorted_desc = np.argsort(cosine_sim_final)[::-1]
print("cosine_sim_sorted_desc - ")
print(cosine_sim_sorted_desc)
print(type(cosine_sim_sorted_desc))
print(len(cosine_sim_sorted_desc))

cosine_sim_sorted_desc - 
[4 2 3 5 8 6 7 0 9 1]
<class 'numpy.ndarray'>
10


In [11]:
cos_top_idx = cosine_sim_sorted_desc[:3]
print("the top 3 indices - ", cos_top_idx)

the top 3 indices -  [4 2 3]


In [12]:
print(items_df.iloc[cos_top_idx])

  product_id                        title  \
4       P005  Crypto Investment Assistant   
2       P003      Robo-Advisory Portfolio   
3       P004        HNI Wealth Management   

                                         description  \
4      Tools and tips for high-risk crypto investors   
2  AI-based investment advisor with risk-managed ...   
3  Dedicated relationship manager for high net wo...   

                                 tags     type  
4           crypto, investment, risky  product  
2  investment, mutual fund, automated  product  
3              wealth, HNI, exclusive  product  


In [13]:
cosine_results = items_df.iloc[cos_top_idx].copy()
print("cosine results - ", cosine_results)

cosine results -    product_id                        title  \
4       P005  Crypto Investment Assistant   
2       P003      Robo-Advisory Portfolio   
3       P004        HNI Wealth Management   

                                         description  \
4      Tools and tips for high-risk crypto investors   
2  AI-based investment advisor with risk-managed ...   
3  Dedicated relationship manager for high net wo...   

                                 tags     type  
4           crypto, investment, risky  product  
2  investment, mutual fund, automated  product  
3              wealth, HNI, exclusive  product  


In [14]:
print("the chosen indices - ", cos_top_idx)
cosine_results["score"] = cosine_sim_final[cos_top_idx]
print("cosine results with score - ")
print(cosine_results)
print("columns in cosine results with score -", cosine_results.columns)

the chosen indices -  [4 2 3]
cosine results with score - 
  product_id                        title  \
4       P005  Crypto Investment Assistant   
2       P003      Robo-Advisory Portfolio   
3       P004        HNI Wealth Management   

                                         description  \
4      Tools and tips for high-risk crypto investors   
2  AI-based investment advisor with risk-managed ...   
3  Dedicated relationship manager for high net wo...   

                                 tags     type     score  
4           crypto, investment, risky  product  0.644813  
2  investment, mutual fund, automated  product  0.424022  
3              wealth, HNI, exclusive  product  0.352339  
columns in cosine results with score - Index(['product_id', 'title', 'description', 'tags', 'type', 'score'], dtype='object')


In [15]:
cosine_results["method"] = "Cosine Similarity"
cosine_time = time.time() - start_cosine
print("final cosine results - ", cosine_results)
print("time taken for cosine similarity - ", cosine_time)

final cosine results -    product_id                        title  \
4       P005  Crypto Investment Assistant   
2       P003      Robo-Advisory Portfolio   
3       P004        HNI Wealth Management   

                                         description  \
4      Tools and tips for high-risk crypto investors   
2  AI-based investment advisor with risk-managed ...   
3  Dedicated relationship manager for high net wo...   

                                 tags     type     score             method  
4           crypto, investment, risky  product  0.644813  Cosine Similarity  
2  investment, mutual fund, automated  product  0.424022  Cosine Similarity  
3              wealth, HNI, exclusive  product  0.352339  Cosine Similarity  
time taken for cosine similarity -  13.80237889289856


In [16]:
# FAISS
start_faiss = time.time()
print("item_embeddings - ")
print(item_embeddings)
print(type(item_embeddings))
print(len(item_embeddings))
print(len(item_embeddings[0]))
d = item_embeddings.shape[1]
print("d here in faiss - ", d)

item_embeddings - 
[[ 0.07280532  0.06593621 -0.03189581 ... -0.04568769  0.00407486
   0.00138527]
 [-0.04366967  0.05792338 -0.03763994 ...  0.02010446  0.0140496
  -0.01410989]
 [ 0.02041199 -0.05910041 -0.07165001 ... -0.1014402  -0.06723908
   0.04628965]
 ...
 [ 0.00789054  0.04559878  0.00719975 ... -0.0789618  -0.05096518
  -0.04313812]
 [ 0.01271158  0.08958375  0.02617336 ... -0.05110588 -0.10428416
  -0.02923345]
 [-0.0136659   0.10419497 -0.01316837 ... -0.05099129 -0.05288475
   0.00391587]]
<class 'numpy.ndarray'>
10
384
d here in faiss -  384


In [17]:
index = faiss.IndexFlatL2(d)
print(index)

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x1056016f0> >


In [18]:
index.add(np.array(item_embeddings).astype("float32"))
print("index - ", index)

index -  <faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x1056016f0> >


In [19]:
print(user_embedding.shape)
print(type(user_embedding))
user_encoded_embedding = np.array(user_embedding).astype('float32')
# print("user_encoded_embedding - ", user_encoded_embedding)
print(user_encoded_embedding.shape)
print(type(user_encoded_embedding))

(1, 384)
<class 'numpy.ndarray'>
(1, 384)
<class 'numpy.ndarray'>


In [20]:
print(index.search(user_encoded_embedding, 3))

(array([[0.710375 , 1.1519558, 1.2953216]], dtype=float32), array([[4, 2, 3]]))


In [21]:
_, faiss_top_idx = index.search(user_encoded_embedding, 3)
print("faiss_top_idx - ", faiss_top_idx)

faiss_top_idx -  [[4 2 3]]


In [None]:
faiss_top_idx = faiss_top_idx[0]
print("faiss_top_idx - ")
print(faiss_top_idx)
faiss_results = items_df.iloc[faiss_top_idx].copy()
faiss_results["score"] = "N/A"
faiss_results["method"] = "FAISS (L2 Distance)"
print(" faiss results - ")
print(faiss_results)
print("columns in faiss results - ", faiss_results.columns)
print("final faiss results - ", faiss_results)

# Combine
combined_df = pd.concat([cosine_results, faiss_results], ignore_index=True)

# return combined_df, {
# "cosine_similarity_time": round(cosine_time, 3),
# "faiss_search_time": round(faiss_time, 3)
# }


faiss_top_idx - 
[4 2 3]
 faiss results - 
  product_id                        title  \
4       P005  Crypto Investment Assistant   
2       P003      Robo-Advisory Portfolio   
3       P004        HNI Wealth Management   

                                         description  \
4      Tools and tips for high-risk crypto investors   
2  AI-based investment advisor with risk-managed ...   
3  Dedicated relationship manager for high net wo...   

                                 tags     type score               method  
4           crypto, investment, risky  product   N/A  FAISS (L2 Distance)  
2  investment, mutual fund, automated  product   N/A  FAISS (L2 Distance)  
3              wealth, HNI, exclusive  product   N/A  FAISS (L2 Distance)  
columns in faiss results -  Index(['product_id', 'title', 'description', 'tags', 'type', 'score',
       'method'],
      dtype='object')
final faiss results -    product_id                        title  \
4       P005  Crypto Investment Assistant

NameError: name 'faiss_time' is not defined