In [None]:
#!pip3 install sentence-transformers torch scikit-learn matplotlib pandas jupyterlab_rise adjustText

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pprint import pprint
from typing import List

import helpers

# Word Embeddings

Words are transformed into vectors so they can be embedded into a common vector space.

Let's define some words.

In [None]:
# Vocabulary for the word-embedding demo: royal and everyday person words in
# male/female pairs, three colours and one place.
words = [
    "queen",
    "king",
    "prince",
    "princess",  # counterpart of "prince"; the plural "princes" would not form a pair
    "man",
    "woman",
    "boy",
    "girl",
    "red",
    "green",
    "blue",
    "palace",
]

In [None]:
from sentence_transformers import SentenceTransformer

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_sentence_transformer(model_name):
    """Load the SentenceTransformer for ``model_name`` once and reuse it afterwards."""
    return SentenceTransformer(
        model_name,
        # The device name is chosen by the local helpers module.
        device=helpers.get_torch_device_name(),
    )


def embed(sentences, model_name="all-MiniLM-L6-v2"):
    """Encode ``sentences`` with a sentence-transformer model.

    The model is cached per ``model_name``, so repeated calls (this notebook
    embeds the same lists several times) do not reload the weights each time.

    Args:
        sentences: Text or list of texts handed to ``model.encode``.
        model_name: Name of the sentence-transformers model to use.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``sentences``.
    """
    return _load_sentence_transformer(model_name).encode(sentences)

Each word is transformed into a multidimensional array. All vectors share the same dimensionality, in this case 384 dimensions.

In [None]:
import pandas as pd
# Preview the first four words side by side with their embedding vectors.
pd.DataFrame(
    {
        "Sentence": words,
        "Encoding": [vector for vector in embed(words)],
    }
).head(4)

Let's visualize the vectors to show their relationships. Since the vectors have a high number of dimensions we use Principal Component Analysis (PCA) to reduce them to two dimensions so that they can be plotted on a computer screen.

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from adjustText import adjust_text


def plot(sentences, embeddings, color="blue", figure=None):
    """Scatter-plot embeddings in 2D (via PCA) and label each point.

    Args:
        sentences: Texts belonging to ``embeddings``, in the same order. Each
            label shows at most the first three whitespace-separated tokens.
        embeddings: One embedding vector per sentence (passed to
            ``PCA.fit_transform``).
        color: Marker colour handed to ``plt.scatter``.
        figure: Existing figure to draw into. When ``None`` a new 10x6 figure
            is created and shown; otherwise the caller has to call
            ``plt.show()`` itself.
    """
    # Perform PCA; the projection is fitted on this call's embeddings only.
    reduced_embeddings = PCA(n_components=2).fit_transform(embeddings)

    # Create the figure, or re-activate the one supplied by the caller.
    if figure is None:
        plt.figure(figsize=(10, 6))
    else:
        # figsize is not passed here: it cannot resize an existing figure and
        # recent matplotlib versions warn about the ignored argument.
        plt.figure(num=figure)
    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=color)

    # Collect the text objects so adjust_text can move them apart afterwards.
    texts = []
    for i, sentence in enumerate(sentences):
        # Local name differs from the notebook-level ``words`` list on purpose.
        tokens = sentence.split()
        annotation = " ".join(tokens[:3]) + ("..." if len(tokens) > 3 else "")
        texts.append(
            plt.text(reduced_embeddings[i, 0], reduced_embeddings[i, 1], annotation)
        )

    # Use adjust_text to avoid overlapping labels
    adjust_text(texts, arrowprops=dict(arrowstyle="->", color="red"))

    # Labeling the axes and the title
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.title("2D PCA of Sentence Embeddings")

    # Show the plot only when this call created the figure itself
    if figure is None:
        plt.show()

When visualizing the vectors, clear semantic clusters appear.

In [None]:
# Embed the word list and draw its 2D PCA projection.
plot(sentences=words, embeddings=embed(words))

# Sentence Embeddings

We can embed not only words but also entire sentences or even documents.

In [None]:
import pandas as pd

# Eight encyclopedia-style paragraphs used as the document corpus:
# four on search / data / machine-learning topics and four on food.
documents = [
    "Vector embeddings are mathematical representations of objects, often words or phrases, in a high-dimensional space. By mapping similar objects to proximate points, embeddings capture relationships and semantic meaning. Commonly used in machine learning and natural language processing tasks, methods like Word2Vec, GloVe, and FastText have popularized their application, enabling advancements in text analysis, recommendation systems, and more.",
    "Keyword search refers to the process of locating information in a database, search engine, or other data repository by specifying particular words, phrases, or symbols. In the digital realm, it's foundational to search engines like Google and Bing. The search results are typically ranked based on relevance, which is determined using various algorithms that consider factors like frequency, location, and link structures. Keyword search is integral for navigating the vast expanse of online information, aiding users in retrieving relevant data efficiently.",
    "Sandwiches are a popular type of food consisting of one or more types of food, such as vegetables, sliced meat, or cheese, placed between slices of bread. They can range from simple combinations like peanut butter and jelly to more complex gourmet creations. Originating from England in the 18th century, sandwiches have become a staple in many cultures worldwide, prized for their convenience and versatility. Variations exist based on regional preferences, ingredients, and preparation methods.",
    "Data science is an interdisciplinary field that leverages statistical, computational, and domain-specific expertise to extract insights and knowledge from structured and unstructured data. It encompasses various techniques from statistics, machine learning, data mining, and big data technologies to analyze and interpret complex data. Data science has applications across numerous sectors, including healthcare, finance, marketing, and social sciences, driving decision-making, predictive analytics, and artificial intelligence advancements. Its growing significance in today's data-driven world has led to the rise of specialized tools, methodologies, and educational programs.",
    "Neural networks are a class of machine learning models inspired by the biological neural networks of animal brains. They consist of interconnected layers of nodes, or neurons, which process input data through a series of transformations and connections to produce output. Neural networks are particularly adept at recognizing patterns, making them useful for a wide range of applications such as image and speech recognition, natural language processing, and predictive analytics. The development of deep neural networks, which contain multiple hidden layers, has been central to the field of deep learning and has significantly advanced the capabilities of artificial intelligence systems.",
    "Pasta is a staple food of traditional Italian cuisine, with the first reference dating to 1154 in Sicily. It is typically made from an unleavened dough of durum wheat flour mixed with water or eggs and formed into sheets or various shapes, then cooked by boiling or baking. Pasta is versatile and can be served with a variety of sauces, meats, and vegetables. It is categorized in two basic styles: dried and fresh. Popular around the world, pasta dishes are central to many diets and come in numerous shapes like spaghetti, penne, and ravioli.",
    "Soup is a liquid food, generally served warm or hot (but also cold), that is made by combining ingredients such as meat and vegetables with stock, juice, water, or another liquid. Soups are inherently diverse, ranging from rich, cream-based varieties to brothy and vegetable-laden concoctions. They are often regarded as comfort food and can be served as a main dish or as an appetizer, with regional and cultural variations like the Spanish gazpacho, Japanese miso soup, and Russian borscht.",
    "A casserole is a comprehensive one-dish meal baked in a deep, ovenproof dish with a glass or ceramic base. It typically includes a combination of meats, vegetables, starches like rice or potatoes, and a binding agent like a soup or sauce. Topped with cheese or breadcrumbs for a crispy crust, casseroles are appreciated for their convenience and the ability to meld flavors during the baking process. They are a fixture in many cultures and are particularly beloved as home-cooked comfort foods, often featuring in communal gatherings and family dinners.",
]

# Preview the first three documents side by side with their embedding vectors.
pd.DataFrame(
    {
        "Sentence": documents,
        "Encoding": [vector for vector in embed(documents)],
    }
).head(3)

Again, semantic clusters appear when visualizing the vectors in a 2D-space.

In [None]:
def plots(sentences_embeddings_color):
    """Draw several groups of embeddings into ONE shared 2D PCA space.

    A single PCA is fitted on the vectors of all groups together and then used
    to project every group. Fitting a separate PCA per group would give each
    group its own axes, so distances between points of different groups
    (e.g. documents vs. queries) would be meaningless.

    Args:
        sentences_embeddings_color: Iterable of ``(sentences, embeddings, color)``
            triples; ``sentences`` and ``embeddings`` are in matching order and
            ``color`` is the marker colour of the group.
    """
    groups = list(sentences_embeddings_color)
    plt.figure(figsize=(10, 6))

    if groups:
        # One projection for everything, so that all groups are comparable.
        all_vectors = [
            vector for _, embeddings, _ in groups for vector in embeddings
        ]
        pca = PCA(n_components=2).fit(all_vectors)

        texts = []
        for sentences, embeddings, color in groups:
            reduced = pca.transform(embeddings)
            plt.scatter(reduced[:, 0], reduced[:, 1], c=color)
            for (x, y), sentence in zip(reduced, sentences):
                # Label each point with at most the first three tokens.
                tokens = sentence.split()
                label = " ".join(tokens[:3]) + ("..." if len(tokens) > 3 else "")
                texts.append(plt.text(x, y, label))

        # De-overlap the labels of all groups in one pass.
        adjust_text(texts, arrowprops=dict(arrowstyle="->", color="red"))

    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.title("2D PCA of Sentence Embeddings")
    plt.show()

In [None]:
# Draw the document embeddings as a single green group.
plots(
    [
        (documents, embed(documents), "green"),
    ]
)

The vector representation can also be used for document retrieval by finding the nearest documents. Let's start by defining some search queries.

In [None]:
# Short search queries that will be matched against the documents.
queries = ["information retrieval", "machine learning", "cooking"]
plots(
    [
        (queries, embed(queries), "red"),
    ]
)

Now we visualize the documents and the queries in one space. Just by looking at the visualization and the distances between documents and queries, we can already see which documents are a good match for each query.

In [None]:
# Documents (green) and queries (red) drawn into the same figure.
plots(
    [
        (documents, embed(documents), "green"),
        (queries, embed(queries), "red"),
    ]
)

# Simply Search

## Load Data
Firstly, we need to load data. To do this, we use the product data from a [customer](https://gympluscoffee.com/).

In [None]:
# Load the pickled product table of the demo merchant and preview it.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
merchant_id = "shopify-20345599"
path_name = f"data/{merchant_id}_products.pkl"
df_products = pd.read_pickle(path_name)
df_products.head(3)

## Embed Data
The next step is to embed the data.
In a real-world scenario, we use specialised programs such as ElasticSearch to apply embeddings and assign weights to different fields.
The advantage of using embeddings is that different fields can be combined in advance to achieve a favourable result.

In [None]:
# Notebook-wide embedding model; the search functions further down read this
# global to embed their query strings.
model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device=helpers.get_torch_device_name(),  # Optional: if you want to run this on GPU
)


def combine_fields(x):
    """Merge the searchable fields of one product row into a single text."""
    return (
        f"Product name = {x['name']}\n"
        f"Description = {x['description']}\n"
        f"Categories = {x['category']}\n"
        f"Brand = {x['brand']}"
    )


# Every column is assigned positionally (``.values``). Mixing positional and
# index-aligned assignment would misalign rows (or produce NaN) whenever
# df_products does not carry a default 0..n-1 index.
df_for_search = pd.DataFrame()
df_for_search["base_string"] = df_products.apply(combine_fields, axis=1).values
df_for_search["embeddings"] = list(
    model.encode(
        df_for_search.loc[:, "base_string"].values,
    )
)
df_for_search["name"] = df_products["name"].values
df_for_search["productId"] = df_products["productId"].values
df_for_search["brand"] = df_products["brand"].values
df_for_search.head(3)

In [None]:

# Visualise the product embeddings in 2D, coloured by brand.
helpers.color_embedings_df(
    df_for_search,
    "brand",
    dimensions=2,
    hover_data=["name"],
)

## Calculate Similarity
The most frequently employed method for assessing similarity is through [cosine_similarity](https://en.wikipedia.org/wiki/Cosine_similarity) or cosine distance.
  
![cosinus_similarity](https://wikimedia.org/api/rest_v1/media/math/render/svg/15d11df2d48da4787ee86a4b8c14551fbf0bc96a)
  
We can leverage the pre-existing functionality provided by sklearn for this purpose.

**TODO:** plot cosine similarity

### define query and get top n results
**TODO:** maybe delete

In [None]:
from IPython.display import display, HTML

query_easy = "Hoodie"  # simple one-word query reused by the search cells below
top_n = 5  # number of hits requested from each search function
from sklearn.metrics.pairwise import cosine_similarity


def get_similar_products_sklearn(
    df: pd.DataFrame, query: str, top_n: int = -1
) -> pd.DataFrame:
    """Rank the rows of ``df`` by cosine distance to ``query``.

    Args:
        df: Frame with an ``"embeddings"`` column holding one vector per row.
            It is not modified.
        query: Search text; embedded with the notebook-level ``model``.
        top_n: Number of best matches to return. A negative value (the
            default) returns all rows.

    Returns:
        A new frame with an added ``"dis_cos"`` column (1 - cosine similarity),
        sorted ascending so the closest match comes first.
    """
    query_embedding = model.encode(query)
    cos_simi = cosine_similarity([query_embedding], df["embeddings"].tolist())[0]
    results = df.assign(dis_cos=1 - cos_simi).sort_values("dis_cos")
    if top_n < 0:
        # Slicing with iloc[0:-1] would silently drop the last row.
        return results
    return results.iloc[:top_n, :]


# Exact cosine search for the simple query, then render the hits.
sim_prod = get_similar_products_sklearn(
    df=df_for_search, query=query_easy, top_n=top_n
)

helpers.display_images_and_names(
    sim_prod,
    merchant_id=merchant_id,
    header_text=f"VectorSearch for:<br>'{query_easy}'",
)

### more complex query

In [None]:
# A noisy query with a misspelling and mixed German/English words.
query_complex = "I Need a new hody for my Frau. It soll be green."
sim_prod = get_similar_products_sklearn(
    df=df_for_search, query=query_complex, top_n=top_n
)
helpers.display_images_and_names(
    sim_prod,
    merchant_id=merchant_id,
    header_text=f"VectorSearch for:<br>'{query_complex}'",
)

## Approximate calculation of similarity
With [ANNOY](https://github.com/spotify/annoy) (Approximate Nearest Neighbors Oh Yeah) we can significantly increase the efficiency of our search processes. To achieve this, we create an index that is not only very powerful, but also compact.

In [None]:
from annoy import AnnoyIndex

def get_annoy_index(df: pd.DataFrame, n_trees: int = 100) -> AnnoyIndex:
    """Build an angular Annoy index over ``df["embeddings"]``.

    Items are stored under their row POSITION (0..n-1), not under the index
    label, so the ids are always contiguous integers regardless of the index
    ``df`` carries. Map returned ids back with ``.iloc``.

    Args:
        df: Frame with an ``"embeddings"`` column holding one vector per row.
        n_trees: Number of trees to build; more trees gives higher precision
            when querying.

    Returns:
        The built ``AnnoyIndex``.

    Raises:
        ValueError: If ``df`` has no rows (the vector length is unknown then).
    """
    embeddings = df["embeddings"]
    if len(embeddings) == 0:
        raise ValueError("Cannot build an Annoy index from an empty DataFrame")
    # .iloc[0] reads the first row by position; embeddings[0] would be a label
    # lookup and fail when the index has no label 0.
    index_ann = AnnoyIndex(
        len(embeddings.iloc[0]), "angular"
    )  # Length of item vector that will be indexed
    for position, vector in enumerate(embeddings):
        index_ann.add_item(position, vector)
    index_ann.build(n_trees=n_trees)
    return index_ann


ann_index: AnnoyIndex = get_annoy_index(df_for_search, n_trees=20)

#### Show entries of annoy index

In [None]:
# Read every stored vector back from the index and preview the first three.
n_items = ann_index.get_n_items()
pd.DataFrame(list(map(ann_index.get_item_vector, range(n_items)))).head(3)

In [None]:
# Attach the cluster assignment computed by the helpers module from the Annoy
# index, then colour the embedding plot by that assignment.
df_for_search["annoy_cluster"] = helpers.calc_cluster(ann_index)

helpers.color_embedings_df(
    df=df_for_search,
    color_col="annoy_cluster",
    hover_data=["name"],
)

### Query with Annoy

In [None]:
def get_similar_products_annoy(
    ann_index: AnnoyIndex, query: str, top_n: int = -1
) -> List[int]:
    """Return the ids of the approximate nearest neighbours of ``query``.

    Args:
        ann_index: Built Annoy index to search.
        query: Search text; embedded with the notebook-level ``model``.
        top_n: Number of neighbours to return. A negative value (the default)
            requests as many neighbours as the index has items.

    Returns:
        Item ids as returned by ``ann_index.get_nns_by_vector``.
    """
    if top_n < 0:
        # NOTE(review): a negative n has no documented meaning in Annoy as far
        # as I can tell, so it is translated into "all items" - confirm.
        top_n = ann_index.get_n_items()
    query_embedding = model.encode(query)
    return ann_index.get_nns_by_vector(query_embedding, top_n)

# The Annoy item ids are row positions (0..n-1) of the indexed frame, so the
# matching products are selected by position; a label-based .loc lookup only
# works when df_products happens to carry a default 0..n-1 index.
sim_prod = df_products.iloc[get_similar_products_annoy(ann_index, query_easy, top_n), :]
helpers.display_images_and_names(
    sim_prod, merchant_id=merchant_id, header_text=f"ANNOY VectorSearch for \n'{query_easy}'"
)

### Complex Query with Annoy

In [None]:
# The Annoy item ids are row positions (0..n-1) of the indexed frame, so the
# matching products are selected by position; a label-based .loc lookup only
# works when df_products happens to carry a default 0..n-1 index.
sim_prod = df_products.iloc[
    get_similar_products_annoy(ann_index, query_complex, top_n), :
]
helpers.display_images_and_names(
    sim_prod, merchant_id=merchant_id, header_text=f"ANNOY VectorSearch for \n'{query_complex}'"
)

Pros and Cons

### time comparison
Only if there is enough time
![](data/time_to_calculate_similarity_100_queries_all-MiniLM-L6-v2.png)