In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load a pre-trained model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to generate embeddings
def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings.numpy()

# Example data
texts = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Generate embeddings for the texts
embeddings = get_embeddings(texts)

In [2]:
embeddings.shape

(4, 384)

In [3]:
import faiss

# Create a Faiss index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
index.add(embeddings)

# Optionally, you can save the index to disk for later use
faiss.write_index(index, "faiss_index.bin")

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.docstore import InMemoryDocstore
from langchain.schema import Document

# Custom embedding class
class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts):
        return get_embeddings(texts).tolist()

    def embed_query(self, text):
        return get_embeddings([text]).tolist()[0]

# Create a list of Documents
documents = [Document(page_content=text) for text in texts]

# Create the FAISS vector store
faiss_store = FAISS.from_documents(documents, CustomEmbeddings())

# Save the FAISS index
faiss_store.save_local("faiss_index")

# Load the FAISS index with dangerous deserialization allowed
loaded_faiss_store = FAISS.load_local("faiss_index", CustomEmbeddings(), allow_dangerous_deserialization=True)

# Perform a similarity search
query = "Is this the first document?"
docs = loaded_faiss_store.similarity_search(query, k=2)

print("Top 2 most similar documents:")
for doc in docs:
    print(doc.page_content)


In [2]:
import datasets # hugging face datasets
datasets.utils.logging.disable_progress_bar() 

In [3]:
# import dataset to hugging face dataset
ds = datasets.load_dataset("xiyuez/red-dot-design-award-product-description") 
print(ds)

DatasetDict({
    train: Dataset({
        features: ['product', 'category', 'description', 'text'],
        num_rows: 21183
    })
})


In [5]:
import datasets
import pandas as pd

# Load the dataset
ds = datasets.load_dataset("xiyuez/red-dot-design-award-product-description")

# Print the dataset structure to inspect the available splits
print(ds)

# Convert the available splits to pandas DataFrames
df_dict = {split: pd.DataFrame(ds[split]) for split in ds.keys()}

# Print the DataFrames
for split, df in df_dict.items():
    print(f"{split} split DataFrame:")
    print(df.head(), "\n")


DatasetDict({
    train: Dataset({
        features: ['product', 'category', 'description', 'text'],
        num_rows: 21183
    })
})
train split DataFrame:
                                           product  \
0                              Biamp Rack Products   
1                                              V33   
2  HP LaserJet 5000-6000 and E700-E800 Series MFPs   
3                 Meaco Arete One 20L Dehumidifier   
4       théATRE Glass Container for Loose Leaf Tea   

                                  category  \
0                 Digital Audio Processors   
1                             Video Camera   
2                  Multi-Function Printers   
3  Heating and Air Conditioning Technology   
4                          Food Containers   

                                         description  \
0  “High recognition value, uniform aesthetics an...   
1  The V33 livestreaming video camera ensures hig...   
2  The HP LaserJet 5000 to 6000 Series and E700 t...   
3  The Meaco Are

In [7]:
df.shape

(21183, 4)

In [8]:
import datasets
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset
ds = datasets.load_dataset("xiyuez/red-dot-design-award-product-description")

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(ds['train'])

# Split the DataFrame into train and test sets with a 70/30 ratio
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Convert the pandas DataFrames back to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Print the dataset splits
print("Train dataset:")
print(train_dataset)
print("\nTest dataset:")
print(test_dataset)


Train dataset:
Dataset({
    features: ['product', 'category', 'description', 'text', '__index_level_0__'],
    num_rows: 14828
})

Test dataset:
Dataset({
    features: ['product', 'category', 'description', 'text', '__index_level_0__'],
    num_rows: 6355
})


In [16]:
import datasets
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset
ds = datasets.load_dataset("xiyuez/red-dot-design-award-product-description")

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(ds['train'])

# Take 20% of the whole data
df_sampled = df.sample(frac=0.2, random_state=42)

# Split the sampled DataFrame into train and test sets with a 70/30 ratio
train_df, test_df = train_test_split(df_sampled, test_size=0.3, random_state=42)

# Reset the index for both train and test DataFrames
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Print the DataFrames
print("Sampled Train DataFrame:")
print(train_df.head())
print("\nSampled Test DataFrame:")
print(test_df.head())


Sampled Train DataFrame:
                   product              category  \
0         Harman Kardon BT  Bluetooth Headphones   
1               ME02B Plus     Bluetooth Earbuds   
2                IROAD X10               Dashcam   
3                      Zoo                 Chair   
4  GHOST AMR RIOT Lector 9         Mountain Bike   

                                         description  \
0  The new BT headphones offer enhanced wearing c...   
1  The intelligent Bluetooth neckband earbuds ME0...   
2  The IROAD X10 is designed so that it does not ...   
3  The Zoo collection is suited for large confere...   
4  This bike combines detailed innovative solutio...   

                                                text  
0  Product Name: Harman Kardon BT;\n\nProduct Cat...  
1  Product Name: ME02B Plus;\n\nProduct Category:...  
2  Product Name: IROAD X10;\n\nProduct Category: ...  
3  Product Name: Zoo;\n\nProduct Category: Chair;...  
4  Product Name: GHOST AMR RIOT Lector 9;\n\nProd.

In [17]:
train_df.head()

Unnamed: 0,product,category,description,text
0,Harman Kardon BT,Bluetooth Headphones,The new BT headphones offer enhanced wearing c...,Product Name: Harman Kardon BT;\n\nProduct Cat...
1,ME02B Plus,Bluetooth Earbuds,The intelligent Bluetooth neckband earbuds ME0...,Product Name: ME02B Plus;\n\nProduct Category:...
2,IROAD X10,Dashcam,The IROAD X10 is designed so that it does not ...,Product Name: IROAD X10;\n\nProduct Category: ...
3,Zoo,Chair,The Zoo collection is suited for large confere...,Product Name: Zoo;\n\nProduct Category: Chair;...
4,GHOST AMR RIOT Lector 9,Mountain Bike,This bike combines detailed innovative solutio...,Product Name: GHOST AMR RIOT Lector 9;\n\nProd...


In [18]:
# Print the first row of the 'text' column for both DataFrames
print("First row of the 'text' column in the sampled Train DataFrame:")
print(train_df.loc[0, 'text'])

print("\nFirst row of the 'text' column in the sampled Test DataFrame:")
print(test_df.loc[0, 'text'])

First row of the 'text' column in the sampled Train DataFrame:
Product Name: Harman Kardon BT;

Product Category: Bluetooth Headphones;

Product Description: The new BT headphones offer enhanced wearing comfort thanks to soft ear cups that adapt to the anatomy of the listener’s ear. They use Bluetooth for wireless audio transmission and also feature an integrated microphone. The side-mounted control buttons allow phone calls to be taken at any time. The rechargeable batteries power the headphones for up to 40 hours and can be easily recharged via a USB port.

First row of the 'text' column in the sampled Test DataFrame:
Product Name: GMX7 X1-PRO;

Product Category: Aquatic Resistance Training System;

Product Description: The X1-PRO is an aquatic resistance training system for swimmers of all skill-levels. A compact, anodised mechanism creates a bidirectional, variable resistance, its range spanning from free movability to immobility. It can be used up to distances of 50 metres and als

In [19]:
# Print the first row of the 'text' column for both DataFrames
print("First row of the 'description' column in the sampled Train DataFrame:")
print(train_df.loc[0, 'description'])

print("\nFirst row of the 'description' column in the sampled Test DataFrame:")
print(test_df.loc[0, 'description'])

First row of the 'description' column in the sampled Train DataFrame:
The new BT headphones offer enhanced wearing comfort thanks to soft ear cups that adapt to the anatomy of the listener’s ear. They use Bluetooth for wireless audio transmission and also feature an integrated microphone. The side-mounted control buttons allow phone calls to be taken at any time. The rechargeable batteries power the headphones for up to 40 hours and can be easily recharged via a USB port.

First row of the 'description' column in the sampled Test DataFrame:
The X1-PRO is an aquatic resistance training system for swimmers of all skill-levels. A compact, anodised mechanism creates a bidirectional, variable resistance, its range spanning from free movability to immobility. It can be used up to distances of 50 metres and also offers stroke correction to deliver maximum training results. Performance is greatly enhanced due to its streamlined functionality and compact size, which also improves handling and p

In [22]:
from langchain.embeddings import HuggingFaceEmbeddings

In [23]:
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", 
                                           model_kwargs={'device': 'cpu'})



In [25]:
def embed_text(text):
    return embedding_function.embed_query(text)

# Apply the embedding function to the 'text' column
train_df['embeddings'] = train_df['text'].apply(embed_text)

In [26]:
train_df.head()

Unnamed: 0,product,category,description,text,embeddings
0,Harman Kardon BT,Bluetooth Headphones,The new BT headphones offer enhanced wearing c...,Product Name: Harman Kardon BT;\n\nProduct Cat...,"[-0.06693495810031891, 0.05769552290439606, 0...."
1,ME02B Plus,Bluetooth Earbuds,The intelligent Bluetooth neckband earbuds ME0...,Product Name: ME02B Plus;\n\nProduct Category:...,"[-0.04429144784808159, 0.02171323075890541, 0...."
2,IROAD X10,Dashcam,The IROAD X10 is designed so that it does not ...,Product Name: IROAD X10;\n\nProduct Category: ...,"[-0.05515052005648613, 0.04215111583471298, -0..."
3,Zoo,Chair,The Zoo collection is suited for large confere...,Product Name: Zoo;\n\nProduct Category: Chair;...,"[0.036150287836790085, 0.04856966808438301, -0..."
4,GHOST AMR RIOT Lector 9,Mountain Bike,This bike combines detailed innovative solutio...,Product Name: GHOST AMR RIOT Lector 9;\n\nProd...,"[-0.08179118484258652, 0.10800202935934067, 0...."


In [27]:
test_df['embeddings'] = train_df['text'].apply(embed_text)

In [28]:
from langchain.vectorstores import FAISS
import numpy as np

In [29]:
# Prepare the embeddings and texts for FAISS
train_embeddings = np.vstack(train_df['embeddings'].values)
train_texts = train_df['text'].values

In [33]:
# Create the FAISS index
from langchain.docstore import InMemoryDocstore
from langchain.docstore.document import Document
import numpy as np
# Create documents for the FAISS index
documents = [Document(page_content=text) for text in train_texts]

In [36]:
# Create an InMemoryDocstore
# Create an InMemoryDocstore
docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(documents)})

In [37]:
# Create an index_to_docstore_id mapping
index_to_docstore_id = {i: str(i) for i in range(len(documents))}

In [45]:
# Create the FAISS index
import faiss
dimension = train_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(train_embeddings)

In [47]:
# Wrap the FAISS index with LangChain's FAISS class
vectorstore = FAISS(embedding_function.embed_query, faiss_index, docstore, index_to_docstore_id)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [48]:
# Define a function for similarity search
def similarity_search(query, k=5):
    query_embedding = np.array(embed_text(query)).reshape(1, -1)
    results = vectorstore.similarity_search_by_vector(query_embedding[0], k=k)
    return results

In [49]:
# Example similarity search
query = "innovative product design"
results = similarity_search(query, k=5)

# Print the results
print("Similarity search results for query:", query)
for result in results:
    print(f"Text: {result.page_content}")

Similarity search results for query: innovative product design
Text: Product Name: Fresh Sense of Humanity;

Product Category: Private Living Space;

Product Description: The concept of this interior design sets itself the challenge of communicating unmistakable personal taste and rich cultural literacy. The design takes Nordic style as its inspiration. The overall impression is dominated by clean lines, the use of much light wood and a colour concept based on harmonious natural tones. An arresting eye-catcher in the living area is the TV wall which is made up of different, carefully colour-coordinated panels that are reminiscent of slightly weathered wood.
Text: Product Name: Design Inspiration Kit;

Product Category: Sample Set;

Product Description: The Design Inspiration Kit demonstrates the technical capabilities of the manufacturer’s proprietary 3D printer. The central design element resembles a stone whose shape makes it pleasant to hold and which served as the starting point fo

In [1]:
import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.docstore.document import Document
import numpy as np
import faiss
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ProductDescriptionSimilaritySearch:
    def __init__(self, dataset_name, sample_frac=0.2, train_test_ratio=0.7, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
        self.dataset_name = dataset_name
        self.sample_frac = sample_frac
        self.train_test_ratio = train_test_ratio
        self.embedding_model = embedding_model
        self.embedding_function = HuggingFaceEmbeddings(model_name=self.embedding_model, model_kwargs={'device': 'cpu'})
        self.train_df = None
        self.test_df = None
        self.vectorstore = None

    def load_dataset(self):
        logger.info("Loading dataset...")
        ds = datasets.load_dataset(self.dataset_name)
        df = pd.DataFrame(ds['train'])
        df_sampled = df.sample(frac=self.sample_frac, random_state=42)
        train_df, test_df = train_test_split(df_sampled, test_size=1 - self.train_test_ratio, random_state=42)
        self.train_df = train_df.reset_index(drop=True)
        self.test_df = test_df.reset_index(drop=True)
        logger.info("Dataset loaded and split into train and test sets.")

    def embed_text(self, text):
        return self.embedding_function.embed_query(text)

    def create_embeddings(self):
        logger.info("Creating embeddings for the text column...")
        self.train_df['embeddings'] = self.train_df['text'].apply(self.embed_text)
        self.test_df['embeddings'] = self.test_df['text'].apply(self.embed_text)
        logger.info("Embeddings created.")

    def setup_faiss_index(self):
        logger.info("Setting up FAISS index...")
        train_embeddings = np.vstack(self.train_df['embeddings'].values)
        train_texts = self.train_df['text'].values
        documents = [Document(page_content=text) for text in train_texts]
        docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(documents)})
        index_to_docstore_id = {i: str(i) for i in range(len(documents))}
        dimension = train_embeddings.shape[1]
        faiss_index = faiss.IndexFlatL2(dimension)
        faiss_index.add(train_embeddings)
        self.vectorstore = FAISS(self.embedding_function.embed_query, faiss_index, docstore, index_to_docstore_id)
        logger.info("FAISS index set up.")

    def similarity_search(self, query, k=5):
        logger.info(f"Performing similarity search for query: {query}")
        query_embedding = np.array(self.embed_text(query)).reshape(1, -1)
        results = self.vectorstore.similarity_search_by_vector(query_embedding[0], k=k)
        return results

# Example usage
if __name__ == "__main__":
    searcher = ProductDescriptionSimilaritySearch(dataset_name="xiyuez/red-dot-design-award-product-description")
    searcher.load_dataset()
    searcher.create_embeddings()
    searcher.setup_faiss_index()
    
    query = "innovative product design"
    results = searcher.similarity_search(query, k=5)
    
    logger.info("Similarity search results:")
    for result in results:
        logger.info(f"Text: {result.page_content}")


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Loading dataset...
INFO:__main__:Dataset loaded and split into train and test sets.
INFO:__main__:Creating embeddings for the text column...
INFO:__main__:Embeddings created.
INFO:__main__:Setting up FAISS index...
INFO:__main__:FAISS index set up.
INFO:__main__:Performing similarity search for query: innovative product design
INFO:__main__:Similarity search results:
INFO:__main__:Text: Product Name: Fresh Sense of Humanity;

Product Category: Private Living Space;

Product Description: The concept of this interior design sets itself the challenge of communicating unmistakable personal taste and rich cultural literacy. The design takes Nordic style as its inspiration. The overall impression is dominated by clean lines, the use of much light wood and a colour concept based on harmonious natural tones. An arresting eye-catcher in the living area is the 