In [2]:
import os

import openai
from dotenv import load_dotenv

# Load variables from a local .env file into the environment before any cell
# reads os.environ (the OpenAI client below looks up OPENAI_API_KEY there).
load_dotenv()

True

# List of input sentences

In [3]:
# Two question/statement pairs on unrelated topics: indices 0-1 are about the
# king, indices 2-3 about the red panda. List order defines the pair indices
# used in the similarity results.
sentences = [
    'What did the King wear?',
    'The king wore black clothes',
    'What is a red panda?',
    'The red panda is a small mammal.',
]

# Generate embeddings

In [4]:
# Get an embedding for each sentence using the OpenAI embedding model.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
embedding_model = 'text-embedding-3-large'

# The embeddings endpoint accepts a list of inputs, so a single batched request
# replaces one network round-trip per sentence.
response = client.embeddings.create(
    model=embedding_model,
    input=sentences,
)

# Every returned item carries the index of the input it belongs to; ordering on
# that index keeps openai_embeddings aligned position-by-position with sentences.
openai_embeddings = []
for item in sorted(response.data, key=lambda entry: entry.index):
    openai_embeddings.append(item.embedding)
    print(f'Received embedding for sentence: {sentences[item.index]}')

Received embedding for sentence: What did the King wear?
Received embedding for sentence: The king wore black clothes
Received embedding for sentence: What is a red panda?
Received embedding for sentence: The red panda is a small mammal.


In [5]:
# Sanity-check the response. Assertions fail loudly with a message, where the
# previous `if` printed nothing on a mismatch and raised a bare IndexError when
# the list was empty.
assert len(openai_embeddings) == len(sentences), (
    f'expected {len(sentences)} embeddings, got {len(openai_embeddings)}'
)
assert openai_embeddings and len(openai_embeddings[0]) != 0, (
    'no embeddings received, or the first embedding is empty'
)
print(f'Received embeddings for {len(openai_embeddings)} sentences, embedding size: {len(openai_embeddings[0])}')

Received embeddings for 4 sentences, embedding size: 3072


## Verify unit length

In [29]:
import numpy as np

# The L2 norm is computed in float64; casting the result down to float32 hides
# the tiny rounding noise float64 can leave in the last digits, so a unit-length
# vector displays as exactly 1.0.
np.float32(np.linalg.norm(openai_embeddings[0]))

np.float32(1.0)

# Get cosine similarity between every pair

In [6]:
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances, cosine_similarity

# Pairwise metric used by calculate_similarities (read as a module-level name),
# together with a human-readable label for it.
metric, metric_name = cosine_similarity, "cosine_similarity"

## Function to calculate similarities

In [7]:
def calculate_similarities(openai_embeddings_list: list, metric_fn=None) -> dict:
    """Score every unordered pair of embeddings with a pairwise metric.

    Args:
        openai_embeddings_list: Sequence of embedding vectors. Position ``i`` in
            this sequence is reported as index ``i`` in the result keys.
        metric_fn: Callable that takes two single-row 2-D inputs and returns a
            matrix whose ``[0][0]`` entry is the score. Defaults to the
            notebook-level ``metric``.

    Returns:
        Dict mapping ``'pair_{i}_{j}'`` (with ``i < j``) to the score between
        embeddings ``i`` and ``j``. Empty when fewer than two embeddings are given.
    """
    if metric_fn is None:
        metric_fn = metric  # notebook-level default selected in an earlier cell

    # Iterate over the embeddings that were passed in rather than the global
    # `sentences`, so the pairs always match the argument even when its length
    # differs from the sentence list.
    count = len(openai_embeddings_list)
    output_dict = {}
    for idx1 in range(count):
        for idx2 in range(idx1 + 1, count):
            # The metric expects 2-D inputs, hence the single-row wrappers.
            score_matrix = metric_fn(
                [openai_embeddings_list[idx1]],
                [openai_embeddings_list[idx2]],
            )
            output_dict[f'pair_{idx1}_{idx2}'] = score_matrix[0][0]
    return output_dict

In [8]:
# Baseline: pairwise similarities computed from the full, untruncated embeddings.
similarities_full = calculate_similarities(openai_embeddings_list=openai_embeddings)
similarities_full

{'pair_0_1': np.float64(0.7421513181181323),
 'pair_0_2': np.float64(0.058596090251280694),
 'pair_0_3': np.float64(0.04514082833929774),
 'pair_1_2': np.float64(0.019115193577802427),
 'pair_1_3': np.float64(0.06825793808363682),
 'pair_2_3': np.float64(0.7238462230267932)}

# Matryoshka embeddings

## Now we keep only the first 64, 128, 256, and 512 dimensions of each embedding to compare

In [34]:
# Keep only the leading `dims` components of every full embedding, once per
# target size; the four results are unpacked in the same order as the sizes.
(
    openai_embeddings_64,
    openai_embeddings_128,
    openai_embeddings_256,
    openai_embeddings_512,
) = (
    [vector[:dims] for vector in openai_embeddings]
    for dims in (64, 128, 256, 512)
)

### Just to verify, let's look at the number of dimensions of one truncated embedding

In [35]:
# Number of components kept in the first 128-dimension embedding.
len(openai_embeddings_128[0])

128

## Normalizing
OpenAI embeddings are normalized to unit length by default, but if you truncate them yourself the normalization is lost, so the truncated vectors must be re-normalized.

In [49]:
from sklearn.preprocessing import normalize

# Re-normalize each truncated vector individually. `normalize` works on 2-D
# input, so every vector is wrapped as a single row and unwrapped again.
(
    openai_embeddings_64,
    openai_embeddings_128,
    openai_embeddings_256,
    openai_embeddings_512,
) = (
    [normalize([vector])[0] for vector in truncated]
    for truncated in (
        openai_embeddings_64,
        openai_embeddings_128,
        openai_embeddings_256,
        openai_embeddings_512,
    )
)

## Verify the norm (unit length) of one sample

In [50]:
# L2 norm of the first 64-dimension embedding.
np.linalg.norm(openai_embeddings_64[0])

np.float64(1.0)

In [38]:
# Shape as (number of embeddings, components per embedding).
np.array(openai_embeddings_64).shape

(4, 64)

## Let us recalculate similarities using fewer dimensions

### 64

In [39]:
# Pairwise similarities from the 64-dimension embeddings.
similarities_64 = calculate_similarities(openai_embeddings_64)
similarities_64

{'pair_0_1': np.float64(0.6850311633462725),
 'pair_0_2': np.float64(0.20489690628065071),
 'pair_0_3': np.float64(0.16382403008593263),
 'pair_1_2': np.float64(0.13412743954637257),
 'pair_1_3': np.float64(0.15776509987706538),
 'pair_2_3': np.float64(0.697904445069117)}

### 128

In [40]:
# Pairwise similarities from the 128-dimension embeddings.
similarities_128 = calculate_similarities(openai_embeddings_128)
similarities_128

{'pair_0_1': np.float64(0.7359116060062697),
 'pair_0_2': np.float64(0.17322708849706525),
 'pair_0_3': np.float64(0.09730477629606829),
 'pair_1_2': np.float64(0.10296365205092117),
 'pair_1_3': np.float64(0.1119801447897345),
 'pair_2_3': np.float64(0.6969662547018949)}

### 256

In [41]:
# Pairwise similarities from the 256-dimension embeddings.
similarities_256 = calculate_similarities(openai_embeddings_256)
similarities_256

{'pair_0_1': np.float64(0.7671354537635768),
 'pair_0_2': np.float64(0.05499389997816803),
 'pair_0_3': np.float64(0.09790841617710405),
 'pair_1_2': np.float64(0.025739163936461393),
 'pair_1_3': np.float64(0.12542754519516558),
 'pair_2_3': np.float64(0.7158468083896092)}

### 512

In [42]:
# Pairwise similarities from the 512-dimension embeddings.
similarities_512 = calculate_similarities(openai_embeddings_512)
similarities_512

{'pair_0_1': np.float64(0.7534843824718538),
 'pair_0_2': np.float64(0.028904831027034256),
 'pair_0_3': np.float64(0.039964305381076794),
 'pair_1_2': np.float64(0.024245788101245204),
 'pair_1_3': np.float64(0.08292599551954621),
 'pair_2_3': np.float64(0.7433827317392555)}

# Build a dataframe to compare

In [52]:
import pandas as pd

# One column per embedding size; each value is a dict keyed by pair label, so
# the pair labels become the row index of the frame.
combined_data = dict(
    [
        ('full', similarities_full),
        ('64', similarities_64),
        ('128', similarities_128),
        ('256', similarities_256),
        ('512', similarities_512),
    ]
)
df = pd.DataFrame(combined_data).rename_axis('sentence_pair')
df

Unnamed: 0_level_0,full,64,128,256,512
sentence_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pair_0_1,0.742151,0.685031,0.735912,0.767135,0.753484
pair_0_2,0.058596,0.204897,0.173227,0.054994,0.028905
pair_0_3,0.045141,0.163824,0.097305,0.097908,0.039964
pair_1_2,0.019115,0.134127,0.102964,0.025739,0.024246
pair_1_3,0.068258,0.157765,0.11198,0.125428,0.082926
pair_2_3,0.723846,0.697904,0.696966,0.715847,0.743383


# Plot on a chart

In [54]:
import plotly.express as px

# Wide-form line chart: the index (sentence pair) is the x axis and every column
# becomes one line. Explicit labels replace plotly's generic 'value' and
# 'variable' titles so the figure can be read on its own.
fig = px.line(
    df,
    title='Cosine Similarity vs. Embedding Dimensions',
    markers=True,
    labels={
        'sentence_pair': 'sentence pair',
        'value': 'cosine similarity',
        'variable': 'embedding dimensions',
    },
)
fig.show()

# Find signed errors (full minus truncated) relative to the full-dimension measurements

## full vs 64 dims

In [45]:
# Signed error of the 64-dimension similarity: full minus truncated.
df['error_full_64'] = df['full'].sub(df['64'])
df

Unnamed: 0_level_0,full,64,128,256,512,error_full_64
sentence_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
pair_0_1,0.742151,0.685031,0.735912,0.767135,0.753484,0.05712
pair_0_2,0.058596,0.204897,0.173227,0.054994,0.028905,-0.146301
pair_0_3,0.045141,0.163824,0.097305,0.097908,0.039964,-0.118683
pair_1_2,0.019115,0.134127,0.102964,0.025739,0.024246,-0.115012
pair_1_3,0.068258,0.157765,0.11198,0.125428,0.082926,-0.089507
pair_2_3,0.723846,0.697904,0.696966,0.715847,0.743383,0.025942


In [46]:
# Same signed error (full minus truncated) for the remaining sizes, added in
# ascending size order so the column order stays 128, 256, 512.
for dims in ('128', '256', '512'):
    df[f'error_full_{dims}'] = df['full'] - df[dims]
df

Unnamed: 0_level_0,full,64,128,256,512,error_full_64,error_full_128,error_full_256,error_full_512
sentence_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
pair_0_1,0.742151,0.685031,0.735912,0.767135,0.753484,0.05712,0.00624,-0.024984,-0.011333
pair_0_2,0.058596,0.204897,0.173227,0.054994,0.028905,-0.146301,-0.114631,0.003602,0.029691
pair_0_3,0.045141,0.163824,0.097305,0.097908,0.039964,-0.118683,-0.052164,-0.052768,0.005177
pair_1_2,0.019115,0.134127,0.102964,0.025739,0.024246,-0.115012,-0.083848,-0.006624,-0.005131
pair_1_3,0.068258,0.157765,0.11198,0.125428,0.082926,-0.089507,-0.043722,-0.05717,-0.014668
pair_2_3,0.723846,0.697904,0.696966,0.715847,0.743383,0.025942,0.02688,0.007999,-0.019537


In [47]:
# Copy just the error columns into an independent frame.
error_columns = [f'error_full_{dims}' for dims in (64, 128, 256, 512)]
errors_df = df[error_columns].copy()
errors_df

Unnamed: 0_level_0,error_full_64,error_full_128,error_full_256,error_full_512
sentence_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pair_0_1,0.05712,0.00624,-0.024984,-0.011333
pair_0_2,-0.146301,-0.114631,0.003602,0.029691
pair_0_3,-0.118683,-0.052164,-0.052768,0.005177
pair_1_2,-0.115012,-0.083848,-0.006624,-0.005131
pair_1_3,-0.089507,-0.043722,-0.05717,-0.014668
pair_2_3,0.025942,0.02688,0.007999,-0.019537


## Plot the absolute value of errors

In [48]:
# Plot error magnitudes (sign dropped) per sentence pair, one line per error
# column. Explicit labels replace plotly's generic 'value' and 'variable' titles.
fig = px.line(
    errors_df.abs(),
    title='Errors vs. Embedding Dimensions',
    markers=True,
    labels={
        'sentence_pair': 'sentence pair',
        'value': 'absolute error',
        'variable': 'error column',
    },
)
fig.show()