Similarity Search


In [9]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os

# Define the path to the alfio_dev folder
alfio_dev_path = "/Users/dadidelux/Desktop/alfio_dev/"

# Construct the path to the CSV file
csv_file_path = os.path.join(alfio_dev_path, "data", "mabuhay_price.csv")
output_file_path = os.path.join(alfio_dev_path, "pkl_output", "mabuhay_price.pkl")


def create_faiss_index(input_csv, output_pkl):
    # Load the CSV file
    df = pd.read_csv(input_csv)

    # Extract titles from the 'mergedata' column
    titles = df["mergedata"].tolist()

    # Load a pre-trained Sentence Transformer model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Encode the titles to get sentence embeddings
    embeddings = model.encode(titles, convert_to_tensor=True)

    # Convert embeddings to numpy array
    embeddings_np = embeddings.cpu().detach().numpy()

    # Create a FAISS index
    dimension = embeddings_np.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_np)

    # Save the FAISS index to a pickle file
    faiss.write_index(index, output_pkl)

    print(f"FAISS index created and saved to {output_pkl}")


def search_similar_titles(query_title, faiss_pkl, top_k=5):
    # Load the FAISS index
    index = faiss.read_index(faiss_pkl)

    # Load the Sentence Transformer model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Encode the query title
    query_embedding = model.encode([query_title], convert_to_tensor=True)
    query_embedding_np = query_embedding.cpu().detach().numpy()

    # Search for similar titles
    distances, indices = index.search(query_embedding_np, top_k)

    return indices[0], distances[0]


if __name__ == "__main__":
    # Example usage
    create_faiss_index(csv_file_path, output_file_path)

    query = "Example title"
    similar_indices, distances = search_similar_titles(query, "titles_index.pkl")
    print(f"Similar titles for '{query}':")
    for i, dist in zip(similar_indices, distances):
        print(f"Index: {i}, Distance: {dist}")

modules.json: 100%|██████████| 349/349 [00:00<00:00, 412kB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 479kB/s]
README.md: 100%|██████████| 10.7k/10.7k [00:00<00:00, 21.5MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 314kB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 1.87MB/s]
pytorch_model.bin:  46%|████▌     | 41.9M/90.9M [02:03<02:08, 382kB/s]Error while downloading from https://cdn-lfs.huggingface.co/sentence-transformers/all-MiniLM-L6-v2/c3a85f238711653950f6a79ece63eb0ea93d76f6a6284be04019c53733baf256?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1709108190&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwOTEwODE5MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9zZW50ZW5jZS10cmFuc2Zvcm1lcnMvYWxsLU1pbmlMTS1MNi12Mi9jM2E4NWYyMz

FAISS index created and saved to /Users/dadidelux/Desktop/alfio_dev/pkl_output/mabuhay_price.pkl


RuntimeError: Error in faiss::FileIOReader::FileIOReader(const char *) at /Users/runner/work/faiss-wheels/faiss-wheels/faiss/faiss/impl/io.cpp:68: Error: 'f' failed: could not open titles_index.pkl for reading: No such file or directory

In [10]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os

# Define the path to the alfio_dev folder
alfio_dev_path = "/Users/dadidelux/Desktop/alfio_dev/"

# Construct the path to the CSV file and the output PKL file
csv_file_path = os.path.join(alfio_dev_path, "data", "mabuhay_price.csv")
output_file_path = os.path.join(alfio_dev_path, "pkl_output", "mabuhay_price.pkl")


def create_faiss_index(input_csv, output_pkl):
    # Load the CSV file
    df = pd.read_csv(input_csv)

    # Extract titles from the 'mergedata' column
    titles = df["mergedata"].tolist()

    # Load a pre-trained Sentence Transformer model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Encode the titles to get sentence embeddings
    embeddings = model.encode(titles, convert_to_tensor=True)

    # Convert embeddings to numpy array
    embeddings_np = embeddings.cpu().detach().numpy()

    # Create a FAISS index
    dimension = embeddings_np.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_np)

    # Save the FAISS index to a pickle file
    faiss.write_index(index, output_pkl)

    print(f"FAISS index created and saved to {output_pkl}")


def search_similar_titles(query_title, faiss_pkl, top_k=5):
    # Load the FAISS index
    index = faiss.read_index(faiss_pkl)

    # Load the Sentence Transformer model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Encode the query title
    query_embedding = model.encode([query_title], convert_to_tensor=True)
    query_embedding_np = query_embedding.cpu().detach().numpy()

    # Search for similar titles
    distances, indices = index.search(query_embedding_np, top_k)

    return indices[0], distances[0]


if __name__ == "__main__":
    # Check if the PKL file exists
    if not os.path.exists(output_file_path):
        print("Creating FAISS index...")
        create_faiss_index(csv_file_path, output_file_path)
    else:
        print("Using existing FAISS index.")

    # Example usage
    query = "Example title"
    similar_indices, distances = search_similar_titles(query, output_file_path)
    print(f"Similar titles for '{query}':")
    for i, dist in zip(similar_indices, distances):
        print(f"Index: {i}, Distance: {dist}")

Using existing FAISS index.
Similar titles for 'Example title':
Index: 351, Distance: 1.605818748474121
Index: 314, Distance: 1.6727250814437866
Index: 270, Distance: 1.674799919128418
Index: 257, Distance: 1.6962765455245972
Index: 386, Distance: 1.697189211845398


In [14]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os

# Define the path to the alfio_dev folder
alfio_dev_path = "/Users/dadidelux/Desktop/alfio_dev/"

# Construct the path to the CSV file and the output PKL file
csv_file_path = os.path.join(alfio_dev_path, "data", "mabuhay_price.csv")
output_file_path = os.path.join(alfio_dev_path, "pkl_output", "mabuhay_price.pkl")

# Load the CSV file
df = pd.read_csv(csv_file_path)


def create_faiss_index(dataframe, output_pkl):
    # Extract titles from the 'mergedata' column
    titles = dataframe["mergedata"].tolist()

    # Load a pre-trained Sentence Transformer model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Encode the titles to get sentence embeddings
    embeddings = model.encode(titles, convert_to_tensor=True)

    # Convert embeddings to numpy array
    embeddings_np = embeddings.cpu().detach().numpy()

    # Create a FAISS index
    dimension = embeddings_np.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_np)

    # Save the FAISS index to a pickle file
    faiss.write_index(index, output_pkl)

    print(f"FAISS index created and saved to {output_pkl}")


def search_similar_titles(query_title, dataframe, faiss_pkl, top_k=5):
    # Load the FAISS index
    index = faiss.read_index(faiss_pkl)

    # Load the Sentence Transformer model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Encode the query title
    query_embedding = model.encode([query_title], convert_to_tensor=True)
    query_embedding_np = query_embedding.cpu().detach().numpy()

    # Search for similar titles
    distances, indices = index.search(query_embedding_np, top_k)

    # Convert distances to similarity scores (range 1.0 to 0.0)
    similarities = 1 - distances[0]

    # Get the similar titles and their shipping prices
    similar_titles = dataframe.iloc[indices[0]]["mergedata"]
    shipping_prices = dataframe.iloc[indices[0]]["shippingfee"]

    return similar_titles, shipping_prices, similarities


if __name__ == "__main__":
    # Check if the PKL file exists
    if not os.path.exists(output_file_path):
        print("Creating FAISS index...")
        create_faiss_index(df, output_file_path)
    else:
        print("Using existing FAISS index.")

    # Example usage
    query = input("Please input the type of shipping")
    similar_titles, shipping_prices, distances = search_similar_titles(
        query, df, output_file_path
    )
    print(f"Similar titles for '{query}':")
    for title, price, dist in zip(similar_titles, shipping_prices, distances):
        print(f"Title: {title}, Shipping Price: {price}, Distance: {dist}")

Using existing FAISS index.
Similar titles for 'Frozen Food Medium Box 15kg 2000':
Title: Frozen Food Medium Box 15kg 2000, Shipping Price: 120, Distance: 1.0
Title: Frozen Food Medium Box 9kg 1500, Shipping Price: 100, Distance: 0.9005154967308044
Title: Frozen Food small box 5kg 2000, Shipping Price: 150, Distance: 0.8528404831886292
Title: FROZEN PRODUCTS medium box 8kg 1500, Shipping Price: 80, Distance: 0.7883222699165344
Title: FROZEN PRODUCTS Medium Box 20kg 1500, Shipping Price: 250, Distance: 0.7873414754867554
