In [1]:
import os

import numpy as np
import openai
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment; the
# OpenAI client cell later reads OPENAI_API_KEY from os.environ.
load_dotenv()

True

In [2]:
# Three short phrases (king / queen / joker "in black"), one list per language.
sentences_en = [
    'The king in black',
    'The queen in black',
    'The joker in black',
]
sentences_hi = [
    'काले रंग में राजा',
    'काले कपड़ों में रानी',
    'काले कपड़ों में जोकर',
]
sentences_jp = [
    '黒衣の王',
    '黒衣の女王',
    '黒のジョーカー',
]

# Group order matters: later cells index the flattened embeddings by position
# (English first, then Hindi, then Japanese).
sentences = [sentences_en, sentences_hi, sentences_jp]
sentences

[['The king in black', 'The queen in black', 'The joker in black'],
 ['काले रंग में राजा', 'काले कपड़ों में रानी', 'काले कपड़ों में जोकर'],
 ['黒衣の王', '黒衣の女王', '黒のジョーカー']]

In [3]:
# Get an embedding for every sentence using the OpenAI embedding model.
# The API key is taken from the OPENAI_API_KEY environment variable.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
embedding_model = 'text-embedding-3-large'

# Flatten the per-language groups while keeping their order, so that position i
# in `output_sentences` and `openai_embeddings` always refers to the same sentence.
output_sentences = [
    sentence
    for sentence_group in sentences
    for sentence in sentence_group
]

openai_embeddings = []
if output_sentences:  # skip the request entirely when there is nothing to embed
    # The embeddings endpoint accepts a list of inputs, so all sentences are sent
    # in one request instead of one network round trip per sentence.
    response = client.embeddings.create(
        model=embedding_model,
        input=output_sentences,
    )
    # Every returned item carries the index of the input it belongs to; sorting
    # on it guarantees the embeddings line up with `output_sentences`.
    openai_embeddings = [
        item.embedding
        for item in sorted(response.data, key=lambda item: item.index)
    ]

for sentence in output_sentences:
    print(f'Received embedding for sentence: {sentence}')

Received embedding for sentence: The king in black
Received embedding for sentence: The queen in black
Received embedding for sentence: The joker in black
Received embedding for sentence: काले रंग में राजा
Received embedding for sentence: काले कपड़ों में रानी
Received embedding for sentence: काले कपड़ों में जोकर
Received embedding for sentence: 黒衣の王
Received embedding for sentence: 黒衣の女王
Received embedding for sentence: 黒のジョーカー


In [4]:
print(openai_embeddings[0])

[0.004411753267049789, -0.0095773134380579, -5.3157014917815104e-05, 0.011949821375310421, 0.032040756195783615, 0.004387948662042618, 0.001402477384544909, -0.008442635647952557, 0.00028044587816111743, 0.02796226367354393, 0.007125457748770714, 0.02074158750474453, -0.055384960025548935, -0.0008009197190403938, 0.009728075005114079, 0.01871027797460556, -0.027930524200201035, 0.00950590055435896, 0.005502789281308651, -0.010489816777408123, -0.009902640245854855, -0.014917438849806786, -0.020630501210689545, 0.007331762928515673, -0.02318551018834114, -0.0005306403036229312, 0.008097471669316292, 0.011069057509303093, -0.03259619325399399, 0.006125671789050102, 0.016647227108478546, 0.018884843215346336, 0.06620804220438004, 0.04567275568842888, -0.035389244556427, -0.009188507683575153, -0.012822650372982025, 0.011687972582876682, 0.003993192221969366, -0.04326057434082031, -0.005205234047025442, -0.04132448136806488, -0.07471415400505066, 0.016012443229556084, 0.041737090796232224,

In [5]:
# Sanity check: there should be one non-empty embedding per input sentence.
# `sentences` is a list of per-language groups, so len(sentences) is the number
# of groups, not the number of sentences; count the sentences inside the groups.
expected_count = sum(len(sentence_group) for sentence_group in sentences)

# The leading truthiness test keeps the [0] access safe when the list is empty.
if openai_embeddings and len(openai_embeddings) == expected_count and len(openai_embeddings[0]) != 0:
    print(f'Received embeddings for {len(openai_embeddings)} sentences, embedding size: {len(openai_embeddings[0])}')
else:
    # Surface the problem instead of silently printing nothing.
    print(f'Expected {expected_count} non-empty embeddings, got {len(openai_embeddings)}')

## Run PCA

In [6]:
from sklearn.decomposition import PCA

# Project the embeddings onto their first two principal components for plotting.
# Note that `pca` holds the transformed 2-D coordinates, not the fitted estimator.
#
# random_state is pinned because scikit-learn's default svd_solver="auto" may pick
# the randomized solver for a matrix with few rows and many columns (assumed here
# from the embedding model's vector size), which would otherwise make the
# projection vary slightly between kernel restarts.
pca = PCA(n_components=2, random_state=42).fit_transform(openai_embeddings)

In [7]:
# Display the 2-D projection: one row per sentence (same order as
# output_sentences), one column per principal component.
pca

array([[ 0.147459  , -0.24169702],
       [ 0.1482626 , -0.24342193],
       [ 0.14670906,  0.46376368],
       [-0.39228294, -0.35631172],
       [-0.59794374, -0.11644365],
       [-0.53040933,  0.4037939 ],
       [ 0.36705605, -0.21652281],
       [ 0.39004021, -0.15357688],
       [ 0.32110908,  0.46041642]])

## Visualize using plotly

In [8]:
import plotly.express as px

# Scatter plot of the two principal components; each point is labelled with the
# sentence it represents.
fig = px.scatter(
    title=f'2D PCA of sentences using {embedding_model}',
    text=output_sentences,
    x=pca[:, 0],
    y=pca[:, 1],
)

In [12]:
# Polish the figure: place each label above its marker and name both axes.
fig.update_traces(textposition='top center').update_layout(
    xaxis_title="Principal Component 1",
    yaxis_title="Principal Component 2",
)

# Save a static copy next to the notebook, then render the interactive chart.
# NOTE(review): static image export presumably needs an extra engine such as
# kaleido to be installed; confirm in the target environment.
fig.write_image('embedding.png')
fig.show()

## Analysis of distances

In [10]:
from sklearn.metrics.pairwise import cosine_distances

# Cosine distance is 1 - cosine similarity.
# Distance between the first English sentence (position 0) and the first Hindi
# sentence (position 3) in the flattened embedding list.
true_embedding_distances_en_hi = cosine_distances(
    X=[openai_embeddings[0]],
    Y=[openai_embeddings[3]],
)
true_embedding_distances_en_hi

array([[0.4823945]])

In [11]:
# Distance between the first English sentence (position 0) and the first
# Japanese sentence (position 6) in the flattened embedding list.
true_embedding_distances_en_jp = cosine_distances(
    X=[openai_embeddings[0]],
    Y=[openai_embeddings[6]],
)
true_embedding_distances_en_jp

array([[0.39802545]])