### Set up

In [1]:
import json
import os

import dotenv
import numpy as np
import openai
import pandas as pd

# Set up OpenAI client based on environment variables
dotenv.load_dotenv()

AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_ADA_DEPLOYMENT = os.getenv("AZURE_OPENAI_ADA_DEPLOYMENT")

# Fail fast on missing settings: otherwise the endpoint below would silently
# become "https://None.openai.azure.com" and the deployment name would be
# None, with the error only surfacing on the first embeddings request.
for _setting in ("AZURE_OPENAI_SERVICE", "AZURE_OPENAI_ADA_DEPLOYMENT"):
    if not os.getenv(_setting):
        raise RuntimeError(
            f"Environment variable {_setting} is not set; "
            "define it in your .env file or shell environment."
        )

openai_client = openai.AzureOpenAI(
    api_version="2023-07-01-preview",
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    # Not validated here; the value is passed through unchanged (possibly None).
    # NOTE(review): presumably the client reports missing credentials itself - confirm.
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

def get_embedding(text):
    """Request an embedding for a single input and return it.

    Sends `text` as the `input` of one embeddings request against the
    deployment named by AZURE_OPENAI_ADA_DEPLOYMENT, then returns the
    `embedding` attribute of the first item in the response data.
    """
    response = openai_client.embeddings.create(
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
        input=text,
    )
    first_item = response.data[0]
    return first_item.embedding
    
def get_embeddings(sentences):
    """Request embeddings for several inputs in one call.

    Sends `sentences` as the `input` of a single embeddings request against
    the deployment named by AZURE_OPENAI_ADA_DEPLOYMENT and returns a list
    holding the `embedding` attribute of every item in the response data,
    in the order the response provides them.
    """
    response = openai_client.embeddings.create(
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
        input=sentences,
    )
    return [item.embedding for item in response.data]


### Vector representations

In [2]:
# Embed one short sample string; the resulting `vector` is inspected in the
# following cells.
# Author's guidance: optimal size to embed is ~512 tokens (not verified here).
vector = get_embedding("Microsoft AI Tour 2024!") # 8192 tokens limit (author's note - confirm for the deployed model)

In [3]:
# Report the Python type of the embedding and how many elements it contains.
print(f"type: {type(vector)} | length: {len(vector)}")

type: <class 'list'> | length: 1536


In [4]:
# Preview the first 20 components of the embedding.
vector[:20]

[-0.0075791445560753345,
 -0.03737623617053032,
 0.002948740031570196,
 0.010529549792408943,
 -0.0012054701801389456,
 0.01567777246236801,
 -0.02546805702149868,
 0.0060706413350999355,
 -0.014425682835280895,
 -0.004016014281660318,
 0.027412792667746544,
 0.015810973942279816,
 -0.01714298501610756,
 -0.018461676314473152,
 -7.055497553665191e-05,
 -0.009110957384109497,
 0.012474286369979382,
 -0.023843001574277878,
 0.023843001574277878,
 -0.020726095885038376]

### Document similarity measured by cosine similarity

In [5]:
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors `a` and `b`.

    Computed as the dot product of the two vectors divided by the product
    of their Euclidean norms.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# The same reference sentence, repeated once per comparison.
sentences1 = ['The new movie is awesome'] * 3

# Candidates: a string of random characters, a paraphrase, and an exact copy.
sentences2 = [
    'djkshsjdkhfsjdfkhsd',
    'This recent movie is so good',
    'The new movie is awesome',
]

# One batched embeddings request per list.
embeddings1 = get_embeddings(sentences1)
embeddings2 = get_embeddings(sentences2)

# Print each sentence pair alongside the cosine similarity of their embeddings.
for i, (reference, candidate) in enumerate(zip(sentences1, sentences2)):
    score = cosine_similarity(embeddings1[i], embeddings2[i])
    print(f"{reference} \t\t {candidate} \t\t Score: {score:.4f}")

The new movie is awesome 		 djkshsjdkhfsjdfkhsd 		 Score: 0.7467
The new movie is awesome 		 This recent movie is so good 		 Score: 0.9191
The new movie is awesome 		 The new movie is awesome 		 Score: 1.0000


### Vector search

In [6]:
# Load in vectors for movie titles.
# Decode explicitly as UTF-8 (the standard encoding for JSON text) instead of
# relying on the platform's default encoding, which is not UTF-8 everywhere
# and can mis-decode or reject non-ASCII characters.
with open('openai_movies.json', encoding='utf-8') as json_file:
    movie_vectors = json.load(json_file)

In [7]:
# Compute vector for query
query = "Zootopia"

embeddings_response = openai_client.embeddings.create(
    model=AZURE_OPENAI_ADA_DEPLOYMENT,
    input=[query],
)
vector = embeddings_response.data[0].embedding

# Score every stored movie vector against the query vector
scores = [
    (title, cosine_similarity(vector, title_vector))
    for title, title_vector in movie_vectors.items()
]

# Rank by descending similarity and display the ten best matches
df = pd.DataFrame(scores, columns=['Movie', 'Score']).sort_values('Score', ascending=False)
df.head(10)

Unnamed: 0,Movie,Score
561,Zootopia,0.999998
304,Tarzan,0.866327
13,The Jungle Book,0.865939
375,The Jungle Book 2,0.86335
221,Toy Story,0.855762
529,Monsters University,0.852388
179,The Lion King,0.850464
293,A Bug's Life,0.850421
28,The Fox and the Hound,0.848254
490,Alice in Wonderland,0.845954
