In [1]:
import os

import openai
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
sentences_en=['The king wore black clothes', 'The queen wore black clothes', 'The joker wore black clothes']
sentences_hi=['राजा ने काले कपड़े पहने थे', 'रानी ने काले कपड़े पहने थे', 'जोकर ने काले कपड़े पहने थे']
sentences_jp=['王は黒い服を着ていた', '女王は黒い服を着ていた', 'ジョーカーは黒い服を着ていた']
sentences = [sentences_en, sentences_hi, sentences_jp]
sentences

[['The king wore black clothes',
  'The queen wore black clothes',
  'The joker wore black clothes'],
 ['राजा ने काले कपड़े पहने थे',
  'रानी ने काले कपड़े पहने थे',
  'जोकर ने काले कपड़े पहने थे'],
 ['王は黒い服を着ていた', '女王は黒い服を着ていた', 'ジョーカーは黒い服を着ていた']]

In [3]:
# get embeddings for each sentence using openai embedding model
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
embedding_model = 'text-embedding-3-large'

output_sentences=[]
openai_embeddings = []
for sentence_group in sentences:
    for sentence in sentence_group:
        output_sentences.append(sentence)
        response = client.embeddings.create(
            model=embedding_model,
            input=sentence
        )
        embedding = response.data[0].embedding
        openai_embeddings.append(embedding)
        print(f'Received embedding for sentence: {sentence}')

Received embedding for sentence: The king wore black clothes
Received embedding for sentence: The queen wore black clothes
Received embedding for sentence: The joker wore black clothes
Received embedding for sentence: राजा ने काले कपड़े पहने थे
Received embedding for sentence: रानी ने काले कपड़े पहने थे
Received embedding for sentence: जोकर ने काले कपड़े पहने थे
Received embedding for sentence: 王は黒い服を着ていた
Received embedding for sentence: 女王は黒い服を着ていた
Received embedding for sentence: ジョーカーは黒い服を着ていた


In [4]:
print(openai_embeddings[0])

[0.018917368724942207, 0.0010748504428192973, 0.0046965498477220535, -0.036843687295913696, 0.005884350743144751, 0.001779879559762776, 0.004944312386214733, 0.0024247898254543543, 0.01378723420202732, -0.014515946619212627, -0.015244659036397934, 0.010362287051975727, -0.015113490633666515, -0.002124195918440819, 0.02983347699046135, 0.013736224733293056, -0.02945454604923725, 0.005476272199302912, 0.011754127219319344, -0.028915299102663994, 0.01372893713414669, -0.023814314976334572, -0.026583420112729073, -0.010894247330725193, -0.00891214981675148, -0.020462237298488617, 0.012118483893573284, 0.02463047206401825, -0.015215510502457619, 0.024761639535427094, 0.026306509971618652, 0.0010821375763043761, 0.01539040170609951, 0.019252575933933258, -0.038884080946445465, -0.01655634120106697, 0.010260267183184624, 0.030489318072795868, -0.003814808325842023, -0.013175115920603275, -0.00032313831616193056, -0.01306580938398838, -0.0756111741065979, 0.003191759344190359, 0.01534667890518

In [5]:
if len(openai_embeddings) == len(sentences) and len(openai_embeddings[0]) != 0:
    print(f'Received embeddings for {len(openai_embeddings)} sentences, embedding size: {len(openai_embeddings[0])}')

## Run PCA

In [6]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3).fit_transform(openai_embeddings)

In [7]:
pca

array([[-0.21926605, -0.13283007,  0.36733636],
       [-0.2619433 , -0.35505411,  0.29644513],
       [-0.09927948,  0.48160333,  0.34937205],
       [ 0.49279014, -0.10528916, -0.03501184],
       [ 0.51087419, -0.26476916, -0.07584559],
       [ 0.49781721,  0.20470532, -0.04057411],
       [-0.31943343, -0.07435037, -0.28382969],
       [-0.38401104, -0.24444471, -0.31656965],
       [-0.21754823,  0.49042893, -0.26132266]])

## Visualize using plotly

## Build a dataframe

In [26]:
import pandas as pd

languages = ['en'] * len(sentences_en) + \
            ['hi'] * len(sentences_hi) + \
            ['jp'] * len(sentences_jp)

df = pd.DataFrame()
df['sentence'] = output_sentences
df['language']=languages
df['openai_embeddings'] = openai_embeddings
df['pca_x'] = pca[:,0]
df['pca_y'] = pca[:,1]
df['pca_z'] = pca[:,2]
df.head()

Unnamed: 0,sentence,language,openai_embeddings,pca_x,pca_y,pca_z
0,The king wore black clothes,en,"[0.018917368724942207, 0.0010748504428192973, ...",-0.219266,-0.13283,0.367336
1,The queen wore black clothes,en,"[0.004186331760138273, -0.00143697508610785, 0...",-0.261943,-0.355054,0.296445
2,The joker wore black clothes,en,"[0.023841356858611107, -0.02213025465607643, -...",-0.099279,0.481603,0.349372
3,राजा ने काले कपड़े पहने थे,hi,"[-0.013421924784779549, -0.009738419204950333,...",0.49279,-0.105289,-0.035012
4,रानी ने काले कपड़े पहने थे,hi,"[-0.029133357107639313, -0.021089918911457062,...",0.510874,-0.264769,-0.075846


In [27]:
import plotly.express as px

fig = px.scatter_3d(
    data_frame=df,
    x='pca_x',
    y='pca_y',
    z='pca_z',
    text=output_sentences,
    color='language',
    title='3D PCA of sentences using {}'.format(embedding_model),
)

In [28]:
# Improve layout
fig.update_traces(textposition='top center')
fig.write_image('embedding_3d.png')
fig.show()

## Analysis of distances

## True distance

In [10]:
from sklearn.metrics.pairwise import cosine_distances

# cosine distance = 1 - cosine_similarity
# find true distance between english and hindi first statements
true_embedding_distances_en_hi = cosine_distances([openai_embeddings[0]], [openai_embeddings[3]])
true_embedding_distances_en_hi

array([[0.42366081]])

In [11]:
# find true distance between english and japanese first statements
true_embedding_distances_en_jp = cosine_distances([openai_embeddings[0]], [openai_embeddings[6]])
true_embedding_distances_en_jp

array([[0.26631291]])

## PCA distances

In [12]:
# find pca distance between english and hindi first statements
pc_embedding_distances_en_hi = cosine_distances([pca[0]], [pca[3]])
pc_embedding_distances_en_hi

array([[1.47256509]])

In [13]:
# find pca distance between english and japanese first statements
pca_embedding_distances_en_jp = cosine_distances([pca[0]], [pca[6]])
pca_embedding_distances_en_jp

array([[1.1252976]])