In [1]:
import os
import azure.identity
import dotenv
import openai

In [4]:
# Pull settings from a local .env file (if one exists) into the process environment.
dotenv.load_dotenv()
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_ADA_DEPLOYMENT = os.getenv("AZURE_OPENAI_ADA_DEPLOYMENT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")

# Keyless alternative, kept for reference: authenticate with an Azure identity
# credential and a dedicated client object instead of an API key.
# azure_credential = azure.identity.DefaultAzureCredential()
# token_provider = azure.identity.get_bearer_token_provider(azure_credential,
#     "https://cognitiveservices.azure.com/.default")
# openai_client = openai.AzureOpenAI(
#     api_version="2024-06-01",
#     azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
#     azure_ad_token_provider=token_provider)

# Configure the module-level client with API-key authentication.
openai.api_type = "azure"
# The 1.x SDK (needed for `openai.embeddings.create` below) takes the resource
# URL from `azure_endpoint`; the 0.x-era `api_base` attribute is not read by it.
# NOTE(review): when AZURE_OPENAI_SERVICE is unset the endpoint is left alone so
# the SDK can fall back to AZURE_OPENAI_ENDPOINT - confirm this is the intent.
if AZURE_OPENAI_SERVICE:
    openai.azure_endpoint = f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com"
openai.api_version = "2024-06-01"
openai.api_key = AZURE_OPENAI_API_KEY


Any word or sentence can be transformed into an embedding vector. These vectors are positioned in a high-dimensional space so that vectors that are close together are semantically similar, and vectors that are far apart are dissimilar.

The OpenAI `text-embedding-ada-002` model produces embedding vectors with 1536 dimensions.

In [6]:
# Sample text whose embedding we want to inspect.
sentence = "A dog just walked past my house and yipped yipped like a Martian"

# The endpoint takes a batch of inputs; send a one-element batch and pull the
# embedding out of the first (and only) result.
response = openai.embeddings.create(input=[sentence], model=AZURE_OPENAI_ADA_DEPLOYMENT)
vector = response.data[0].embedding
vector

[-0.014236196,
 -0.007189021,
 -0.027208673,
 0.01232772,
 -0.0037782658,
 0.022089316,
 0.009626196,
 -0.018027358,
 -0.00064797903,
 -0.025016503,
 0.016015721,
 -0.0027934022,
 0.010593329,
 -0.0055835806,
 0.011515329,
 0.011657176,
 0.02394621,
 0.010135553,
 0.016699161,
 0.027053932,
 -0.011025315,
 0.022463273,
 0.019303972,
 -0.023546463,
 -0.015409651,
 -0.0029529792,
 0.021392979,
 -0.015074378,
 0.0009292535,
 -0.009465007,
 0.014081455,
 -0.00596721,
 -0.038711105,
 0.0053611402,
 -0.021689568,
 -0.020722434,
 0.016531525,
 -0.010748071,
 0.009323161,
 -0.023198294,
 -0.0050838953,
 0.007479161,
 0.005751217,
 -0.025816001,
 0.0072599445,
 0.014803581,
 -0.0048743496,
 -0.0034075317,
 -0.03071614,
 0.017021539,
 0.022128,
 0.0050548813,
 -0.031180365,
 -0.0016417081,
 -0.012753259,
 0.013978294,
 0.0064636716,
 0.005802797,
 0.011528224,
 -0.017601818,
 -0.007621007,
 0.0011637833,
 -0.01586098,
 0.0077048256,
 -0.003069035,
 -0.011753889,
 -0.0050806715,
 -0.008143259,
 -

In [12]:
# Number of dimensions in the embedding computed for `sentence`.
len(vector)

1536

### Document similarity

In [7]:
import numpy as np
import pandas as pd


def cosine_similarity(a, b):
    """Return the cosine of the angle between two vectors.

    Parameters
    ----------
    a, b : array-like
        One-dimensional numeric sequences of equal length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        cosine is undefined; ``0.0`` is returned instead of NaN so that
        downstream sorting and formatting keep working.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Avoid 0/0 (NaN plus a RuntimeWarning) for degenerate all-zero vectors.
        return 0.0
    return np.dot(a, b) / norm_product

In [8]:
# Left-hand side of every comparison: the same reference sentence three times.
sentences1 = ['The new movie is awesome'] * 3

# Right-hand side: an exact copy, a paraphrase, and random gibberish.
sentences2 = [
    'The new movie is awesome',
    'This recent movie is so good',
    'djkshsjdkhfsjdfkhsd',
]

In [9]:
def get_embeddings(sentences):
    """Embed a batch of texts with the configured Azure OpenAI deployment.

    Parameters
    ----------
    sentences : list[str]
        Texts to embed, sent to the service in a single request.

    Returns
    -------
    list[list[float]]
        One embedding per input, in the same order as ``sentences``. An empty
        input yields an empty list without contacting the service.
    """
    # Nothing to embed: skip the network round trip entirely.
    if isinstance(sentences, (str, list, tuple)) and len(sentences) == 0:
        return []

    embeddings_response = openai.embeddings.create(model=AZURE_OPENAI_ADA_DEPLOYMENT, input=sentences)
    # Each result carries the position of its input; order by it so callers can
    # pair results with inputs positionally.
    ordered_results = sorted(embeddings_response.data, key=lambda result: result.index)
    return [result.embedding for result in ordered_results]

In [10]:
# Embed both sentence lists (one request per list); results stay index-aligned
# with their source lists.
embeddings1, embeddings2 = get_embeddings(sentences1), get_embeddings(sentences2)

In [11]:
# Compare the sentences pairwise (same position in both lists) and report the
# cosine similarity of their embeddings.
for i, _ in enumerate(sentences1):
    score = cosine_similarity(embeddings1[i], embeddings2[i])
    print(f"{sentences1[i]} \t\t {sentences2[i]} \t\t Score: {score:.4f}")

The new movie is awesome 		 The new movie is awesome 		 Score: 1.0000
The new movie is awesome 		 This recent movie is so good 		 Score: 0.9191
The new movie is awesome 		 djkshsjdkhfsjdfkhsd 		 Score: 0.7461


Identical sentences produce identical embedding vectors, so their cosine similarity is 1. Note that even the least similar pair still scores fairly high: with OpenAI embeddings, the lowest similarities seem to stay above roughly 0.65.

We can use OpenAI embeddings and cosine similarity to do semantic document retrieval.

The file below contains pre-computed OpenAI ada embeddings for several movie titles. We can search for similar titles by embedding a query title and fetching the movies with the highest similarity scores.

In [14]:
import json

# Load the pre-computed embedding vector for each movie title.
# JSON is read explicitly as UTF-8 so that titles with non-ASCII characters
# decode the same way regardless of the platform's default encoding.
with open('openai_movies.json', encoding='utf-8') as json_file:
    movie_vectors = json.load(json_file)

# Sanity check: length of one stored vector.
len(movie_vectors['Snow White and the Seven Dwarfs'])

1536

In [18]:
# Title to use as the search query.
curr_movie = "Aladdin"

# Reuse the notebook's batch helper rather than repeating the request and
# response-unpacking logic; a one-element batch yields a one-element result.
movie_embedding = get_embeddings([curr_movie])[0]
len(movie_embedding)

1536

In [19]:
# Score every stored title against the query embedding.
scores = [
    (title, cosine_similarity(movie_embedding, title_vector))
    for title, title_vector in movie_vectors.items()
]

# Rank from most to least similar and show the ten best matches.
df = pd.DataFrame(scores, columns=['Movie', 'Score']).sort_values('Score', ascending=False)
df.head(10)

Unnamed: 0,Movie,Score
135,Aladdin,0.999997
4,Cinderella,0.898171
490,Alice in Wonderland,0.885146
458,Enchanted,0.883453
283,Mulan,0.882532
7,Sleeping Beauty,0.880627
114,Beauty and the Beast,0.8799
304,Tarzan,0.876643
1,Pinocchio,0.872049
179,The Lion King,0.870428


Note that the OpenAI ada embedding model accepts at most 8192 tokens per input.

In [20]:
# `df` is sorted by descending score, so the tail holds the ten least similar titles.
df.tail(10)

Unnamed: 0,Movie,Score
182,In the Army Now,0.757628
253,Metro,0.757487
250,The War at Home,0.756588
505,Mars Needs Moms,0.756197
353,Frank McKlusky C.I.,0.754254
269,Washington Square,0.753238
141,The Cemetery Club,0.753162
532,The Fifth Estate,0.752791
188,Terminal Velocity,0.742
76,Disorganized Crime,0.741852


As we can see, even the least similar title has a score of about 0.74.

Keep in mind that this similarity search uses the titles alone. Even so, the model has been trained on so much data that it has some sense of which movies are similar (though the results are far from perfect).

To find movies that are actually similar, we could also embed their descriptions and compare those vectors.