In [3]:
import os

import openai
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so that
# OPENAI_API_KEY (read via os.environ further down) is not hardcoded here.
load_dotenv()

True

# List of input sentences

In [4]:
# Two question/answer pairs on unrelated topics: items 0-1 concern a king's
# clothing, items 2-3 concern red pandas.
sentences = [
    "What did the King wear?",
    "The king wore black clothes",
    "What is a red panda?",
    "The red panda is a small mammal.",
]

# Generate embeddings

In [5]:
# Get an embedding for each sentence using the OpenAI embedding model.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
embedding_model = 'text-embedding-3-large'

openai_embeddings = []
if sentences:  # skip the request entirely when there is nothing to embed
    # The embeddings endpoint accepts a list of inputs, so every sentence is
    # sent in a single request instead of one network round trip per sentence.
    response = client.embeddings.create(
        model=embedding_model,
        input=sentences,
    )
    # Each returned item carries the position of its input in `index`; sort on
    # it so that openai_embeddings[i] always corresponds to sentences[i].
    openai_embeddings = [
        item.embedding
        for item in sorted(response.data, key=lambda item: item.index)
    ]

# Report only the sentences for which an embedding actually came back.
for sentence, _ in zip(sentences, openai_embeddings):
    print(f'Received embedding for sentence: {sentence}')

Received embedding for sentence: What did the King wear?
Received embedding for sentence: The king wore black clothes
Received embedding for sentence: What is a red panda?
Received embedding for sentence: The red panda is a small mammal.


In [6]:
# Sanity-check the embeddings before they are used. These are assertions so a
# bad response stops the notebook instead of silently printing nothing.
assert len(openai_embeddings) == len(sentences), (
    f'Expected {len(sentences)} embeddings, got {len(openai_embeddings)}'
)
assert openai_embeddings and len(openai_embeddings[0]) != 0, (
    'Received no embeddings, or an empty embedding vector'
)
assert len({len(vector) for vector in openai_embeddings}) == 1, (
    'Embeddings do not all have the same number of dimensions'
)
print(f'Received embeddings for {len(openai_embeddings)} sentences, embedding size: {len(openai_embeddings[0])}')

Received embeddings for 4 sentences, embedding size: 3072


# Get cosine similarity between every pair

In [7]:
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances, cosine_similarity

# Pairwise metric used for every comparison below; calculate_similarities
# reads `metric` as a global each time it is called.
metric = cosine_similarity
# Human-readable label for the chosen metric (not referenced by the other
# cells visible in this notebook).
metric_name = "cosine_similarity"

## Function to calculate similarities

In [8]:
def calculate_similarities(openai_embeddings_list: list, metric_fn=None) -> dict:
    """Score every unordered pair of embeddings in ``openai_embeddings_list``.

    The pairs are derived from the argument itself rather than from the
    notebook-level ``sentences`` list, so the function works for any number
    of embeddings.

    Args:
        openai_embeddings_list: Sequence of embedding vectors.
        metric_fn: Optional pairwise metric, called as ``metric_fn([a], [b])``
            and expected to return a 2-D result whose ``[0][0]`` entry is the
            score. When omitted, the notebook-level ``metric`` is used.

    Returns:
        Dict mapping ``'pair_<i>_<j>'`` (with ``i < j`` being positions in the
        input list) to the score of that pair, in ascending ``(i, j)`` order.
        Empty when fewer than two embeddings are given.
    """
    if metric_fn is None:
        # Resolved at call time so the notebook-level choice is honoured.
        metric_fn = metric

    count = len(openai_embeddings_list)
    output_dict = {}
    for first in range(count):
        for second in range(first + 1, count):
            # The metric takes 2-D inputs, hence the single-row wrappers;
            # [0][0] extracts the one score from the 1x1 result.
            score = metric_fn(
                [openai_embeddings_list[first]],
                [openai_embeddings_list[second]],
            )[0][0]
            output_dict[f'pair_{first}_{second}'] = score
    return output_dict

In [9]:
# Baseline: pairwise similarities computed on the untruncated embeddings.
similarities_full = calculate_similarities(openai_embeddings_list=openai_embeddings)
similarities_full

{'pair_0_1': np.float64(0.7421513181181323),
 'pair_0_2': np.float64(0.058596090251280694),
 'pair_0_3': np.float64(0.045106643742737304),
 'pair_1_2': np.float64(0.019115193577802427),
 'pair_1_3': np.float64(0.06823949306808158),
 'pair_2_3': np.float64(0.7238424520758902)}

# Matryoshka embeddings

## Now we take only the first 64, 128, 256, and 512 dimensions of each embedding to compare

In [10]:
# Keep only the leading N values of every embedding, for each N below.
# The metric in use is cosine similarity, which is scale-invariant, so the
# shortened vectors are compared without re-normalising them.
(
    openai_embeddings_64,
    openai_embeddings_128,
    openai_embeddings_256,
    openai_embeddings_512,
) = (
    [vector[:dims] for vector in openai_embeddings]
    for dims in (64, 128, 256, 512)
)

### Just to verify, let's look at the length of one

In [11]:
# Length of the first embedding after slicing to its first 128 values.
len(openai_embeddings_128[0])

128

## Let us recalculate similarities using fewer dimensions

### 64

In [12]:
# Pairwise similarities using only the first 64 values of each embedding.
similarities_64 = calculate_similarities(openai_embeddings_64)
similarities_64

{'pair_0_1': np.float64(0.6850311633462725),
 'pair_0_2': np.float64(0.20489690628065066),
 'pair_0_3': np.float64(0.1638284932131196),
 'pair_1_2': np.float64(0.13412743954637252),
 'pair_1_3': np.float64(0.15762862864144578),
 'pair_2_3': np.float64(0.6978508046247627)}

### 128

In [13]:
# Pairwise similarities using only the first 128 values of each embedding.
similarities_128 = calculate_similarities(openai_embeddings_128)
similarities_128

{'pair_0_1': np.float64(0.7359116060062696),
 'pair_0_2': np.float64(0.17322708849706533),
 'pair_0_3': np.float64(0.09728688309319741),
 'pair_1_2': np.float64(0.10296365205092117),
 'pair_1_3': np.float64(0.11192099262223268),
 'pair_2_3': np.float64(0.6969381043835294)}

### 256

In [14]:
# Pairwise similarities using only the first 256 values of each embedding.
similarities_256 = calculate_similarities(openai_embeddings_256)
similarities_256

{'pair_0_1': np.float64(0.7671354537635773),
 'pair_0_2': np.float64(0.05499389997816802),
 'pair_0_3': np.float64(0.09791882880966658),
 'pair_1_2': np.float64(0.02573916393646139),
 'pair_1_3': np.float64(0.12542713301935848),
 'pair_2_3': np.float64(0.7158187173229467)}

### 512

In [15]:
# Pairwise similarities using only the first 512 values of each embedding.
similarities_512 = calculate_similarities(openai_embeddings_512)
similarities_512

{'pair_0_1': np.float64(0.7534843824718543),
 'pair_0_2': np.float64(0.02890483102703427),
 'pair_0_3': np.float64(0.03995788579986332),
 'pair_1_2': np.float64(0.024245788101245218),
 'pair_1_3': np.float64(0.08291184763028255),
 'pair_2_3': np.float64(0.7433601892068534)}

# Build a dataframe to compare

In [16]:
import pandas as pd

# Gather the per-size similarity dicts into one table: one row per sentence
# pair, one column per embedding size.
combined_data = dict(
    zip(
        ('full', '64', '128', '256', '512'),
        (
            similarities_full,
            similarities_64,
            similarities_128,
            similarities_256,
            similarities_512,
        ),
    )
)
df = pd.DataFrame(combined_data).rename_axis('sentence_pair')
df

Unnamed: 0_level_0,full,64,128,256,512
sentence_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pair_0_1,0.742151,0.685031,0.735912,0.767135,0.753484
pair_0_2,0.058596,0.204897,0.173227,0.054994,0.028905
pair_0_3,0.045107,0.163828,0.097287,0.097919,0.039958
pair_1_2,0.019115,0.134127,0.102964,0.025739,0.024246
pair_1_3,0.068239,0.157629,0.111921,0.125427,0.082912
pair_2_3,0.723842,0.697851,0.696938,0.715819,0.74336


# Plot on a chart

In [17]:
import plotly.express as px

# Plot the similarity of each sentence pair, one line per embedding size.
# The similarity columns are selected explicitly because later cells add
# error_* columns to `df` in place; re-running this cell afterwards would
# otherwise draw those columns as well.
similarity_columns = ['full', '64', '128', '256', '512']
fig = px.line(
    df[similarity_columns],
    title='Cosine Similarity vs. Embedding Dimensions',
    markers=True,
    # Replace the default wide-form axis/legend captions with readable ones.
    labels={
        'sentence_pair': 'sentence pair',
        'value': 'cosine similarity',
        'variable': 'embedding dimensions',
    },
)
fig.show()

# Find errors (signed differences) from full measurements

## full vs 64 dims

In [18]:
# Signed difference between the full-size similarity and the 64-value one;
# a negative entry means the truncated embedding scored the pair higher.
df = df.assign(error_full_64=df['full'] - df['64'])
df

Unnamed: 0_level_0,full,64,128,256,512,error_full_64
sentence_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
pair_0_1,0.742151,0.685031,0.735912,0.767135,0.753484,0.05712
pair_0_2,0.058596,0.204897,0.173227,0.054994,0.028905,-0.146301
pair_0_3,0.045107,0.163828,0.097287,0.097919,0.039958,-0.118722
pair_1_2,0.019115,0.134127,0.102964,0.025739,0.024246,-0.115012
pair_1_3,0.068239,0.157629,0.111921,0.125427,0.082912,-0.089389
pair_2_3,0.723842,0.697851,0.696938,0.715819,0.74336,0.025992


In [19]:
# Same signed difference (full minus truncated) for the remaining sizes.
df = df.assign(
    **{
        f'error_full_{dims}': df['full'] - df[dims]
        for dims in ('128', '256', '512')
    }
)
df

Unnamed: 0_level_0,full,64,128,256,512,error_full_64,error_full_128,error_full_256,error_full_512
sentence_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
pair_0_1,0.742151,0.685031,0.735912,0.767135,0.753484,0.05712,0.00624,-0.024984,-0.011333
pair_0_2,0.058596,0.204897,0.173227,0.054994,0.028905,-0.146301,-0.114631,0.003602,0.029691
pair_0_3,0.045107,0.163828,0.097287,0.097919,0.039958,-0.118722,-0.05218,-0.052812,0.005149
pair_1_2,0.019115,0.134127,0.102964,0.025739,0.024246,-0.115012,-0.083848,-0.006624,-0.005131
pair_1_3,0.068239,0.157629,0.111921,0.125427,0.082912,-0.089389,-0.043681,-0.057188,-0.014672
pair_2_3,0.723842,0.697851,0.696938,0.715819,0.74336,0.025992,0.026904,0.008024,-0.019518


In [20]:
# Pull the error columns out into an independent frame.
error_columns = [f'error_full_{dims}' for dims in (64, 128, 256, 512)]
errors_df = df.loc[:, error_columns].copy()
errors_df

Unnamed: 0_level_0,error_full_64,error_full_128,error_full_256,error_full_512
sentence_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pair_0_1,0.05712,0.00624,-0.024984,-0.011333
pair_0_2,-0.146301,-0.114631,0.003602,0.029691
pair_0_3,-0.118722,-0.05218,-0.052812,0.005149
pair_1_2,-0.115012,-0.083848,-0.006624,-0.005131
pair_1_3,-0.089389,-0.043681,-0.057188,-0.014672
pair_2_3,0.025992,0.026904,0.008024,-0.019518


## Plot the absolute value of errors

In [24]:
# Plot the magnitude of each error per sentence pair, one line per error
# column of errors_df.
fig = px.line(
    errors_df.abs(),
    title='Errors vs. Embedding Dimensions',
    markers=True,
    # Replace the default wide-form axis/legend captions with readable ones.
    labels={
        'sentence_pair': 'sentence pair',
        'value': 'absolute error',
        'variable': 'comparison',
    },
)
fig.show()