In [1]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [11]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [12]:
embeddings = embedding.embed_query("life")

In [13]:
print(f"Length = {len(embeddings)}")
print(embeddings[:2])

Length = 1536
[0.01641558564532278, -0.013453375283007222]


In [14]:
embeddings = embedding.embed_query("What is the meaning of life?")
print(f"Length = {len(embeddings)}")
print(embeddings[:2])

Length = 1536
[0.004346078746023493, -0.029653417073637556]


#### Similarity

- Calculate the similarity between two sentences as a number between 0 and 1.
- Try out your own sentences and check if the similarity calculations match your intuition.

In [21]:
import numpy as np

In [24]:
emb_1 = embedding.embed_query("What is the meaning of life?") # 42!

emb_2 = embedding.embed_query("I need to know why I exist")

emb_3 = embedding.embed_query("Walking is slower than running")



In [25]:
print(np.dot(emb_1,emb_2)) 
print(np.dot(emb_2,emb_3))
print(np.dot(emb_1,emb_3))

0.816630077906561
0.7200559422559096
0.7231732527315546


#### From word to sentence embeddings
- One possible way to calculate sentence embeddings from word embeddings is to take the average of the word embeddings.
- This ignores word order and context, so two sentences with different meanings, but the same set of words will end up with the same sentence embedding.

In [26]:
in_1 = "The kids play in the park."
in_2 = "The play was for kids in the park."

- Remove stop words like ["the", "in", "for", "an", "is"] and punctuation.

In [27]:
in_pp_1 = ["kids", "play", "park"]
in_pp_2 = ["play", "kids", "park"]

Generate one embedding for each word. So this is a list of three lists.

In [34]:
embeddings_1 = [emb for emb in embedding.embed_query(" ".join(in_pp_1))]

In [35]:
print(embeddings_1)

[0.005602288064978442, -0.002243516503076898, 0.011100528771826344, -0.019703926536455172, -0.02804070341153578, 0.034309543700384776, -0.0008262806230278626, -0.013103436690548942, 0.001472917270321769, -0.027338385606799784, 0.0013891918364490826, -0.0001284331812059961, 0.014618623083695922, -0.008798486561123593, -0.0007364586351608844, -0.014046363545300526, 0.017297836036299308, -0.02936730536505502, 0.027936656053405243, -0.005534007219035919, -0.0005596597916873018, 0.01654349455249804, 0.0033977893212004968, -0.028482904683590576, -0.007335323381380596, -0.009812945508928634, 0.009084615864660002, -0.009773927749629681, 0.014371511539458464, -0.0064249108603835195, 0.018624437989818544, -0.004425254654433144, 0.00989098102752654, -0.01585418359885094, -0.025387501367142455, -0.005716089630103077, -0.008427818313444831, -0.01027465519497162, 0.010762375789224664, 0.007523908752330913, -0.0005486860759883017, -0.010976972999707615, -0.02061433905745225, -0.01763598995022356, 0.0

In [38]:
embeddings_1 = embedding.embed_query(" ".join(in_pp_1))
print(embeddings_1)
embeddings_2 = embedding.embed_query(" ".join(in_pp_2))
print(embeddings_2)

[0.005617694580932645, -0.0022025392850890837, 0.011033828600766175, -0.019752959202100656, -0.028114481485086537, 0.034252331376339984, -0.0008135578997882055, -0.013120957924336747, 0.0014848853771632065, -0.027360253607568916, 0.0015182079831637248, -0.00011734008393897819, 0.014616408690668386, -0.008797154014297924, -0.0007757652363744595, -0.014135264155506713, 0.01729521750280048, -0.029362857955146836, 0.028010449026038484, -0.005533169139278697, -0.0006128098594447556, 0.016566997740044873, 0.0033972748333600017, -0.02837455890741629, -0.007399231815678644, -0.009791953693735082, 0.009076737988360482, -0.009785452130705877, 0.014343326279635032, -0.006404431670580187, 0.01862161645450163, -0.004502608069213122, 0.009934996462280965, -0.015734745518241215, -0.025526699211976298, -0.005757486567963924, -0.008439545695949326, -0.01026659666586755, 0.010819263982286055, 0.007542275049885823, -0.0005567304440657867, -0.010949302693450931, -0.020559201446497113, -0.01758130303989224