# What in the world is a Vector Embedding?

In [None]:
! pip install azure-identity
! pip install openai
! pip install python-dotenv
! pip install numpy pandas

In [None]:
import os
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from dotenv import load_dotenv
from openai import AzureOpenAI

# Set up OpenAI client based on environment variables
load_dotenv()
AZURE_OPENAI_ENDPOINT: str = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY: str = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_API_VERSION: str = "2023-05-15"
AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME: str = os.getenv(
    "AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME"
)


credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    credential, "https://cognitiveservices.azure.com/.default"
)

# Set this flag to True if you are using Azure Active Directory
use_aad_for_aoai = True

if use_aad_for_aoai:
    # Use Azure Active Directory (AAD) authentication
    client = AzureOpenAI(
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_version=AZURE_OPENAI_API_VERSION,
        azure_ad_token_provider=token_provider,
    )
else:
    # Use API key authentication
    client = AzureOpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        api_version=AZURE_OPENAI_API_VERSION,
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
    )


# Example function to generate document embedding
def generate_embedding(text: str):
    # Generate embeddings for the provided text using the specified model
    embeddings_response = client.embeddings.create(
        model=AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME, input=text
    )
    # Extract the embedding data from the response
    return embeddings_response.data[0].embedding

In [3]:
# Generate an embedding for the provided text
generate_embedding("The Brooklyn Nets are better than the NY Knicks.")

[-0.007678062,
 -0.022477992,
 0.018143497,
 -0.023449736,
 -0.0050441227,
 -0.020854155,
 -0.0047308626,
 -0.01244089,
 0.010823447,
 -0.008528339,
 -0.0038997652,
 0.009436153,
 0.0035001992,
 -0.010989666,
 -0.0019275069,
 -0.021608535,
 0.026032532,
 -0.020598432,
 0.029484782,
 -0.017095037,
 -0.018808376,
 0.027566865,
 0.017389117,
 -0.047973506,
 -0.005312631,
 0.015982645,
 0.0068277856,
 -0.0068917163,
 0.027362287,
 -0.021531818,
 0.0154711995,
 0.005421313,
 -0.015573489,
 -0.0337042,
 -0.026390543,
 -0.0099987425,
 -0.0072369413,
 -0.022324558,
 0.0126390755,
 -0.0066999244,
 0.022797644,
 -0.02050893,
 0.0059135784,
 -0.023066152,
 -0.023347447,
 0.0079593565,
 0.010689192,
 -0.0027394253,
 -0.0052550933,
 0.011315713,
 0.00882242,
 0.0027026653,
 -0.015778067,
 -0.023577597,
 -0.020138131,
 -0.016404586,
 -0.018680515,
 -0.028359605,
 0.0057313764,
 -0.009301899,
 -0.011430787,
 -0.0090014255,
 -0.022094408,
 0.0007200181,
 0.0063738786,
 -0.0032892283,
 0.014103086,
 -0

# Document similarity modeled as cosine distance

In [9]:
import numpy as np


def cosine_similarity(vec_a, vec_b):
    """Calculate the cosine similarity between two vectors."""
    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))


# Example sentences to compare.   
ny_sentences = [  
    "The Empire State Building lights up the Manhattan skyline.",  
    "The Empire State Building lights up the Manhattan skyline.",  
    "The Empire State Building lights up the Manhattan skyline.",  
]  
  
# Sentences to compare against, with a mix of related and unrelated topics.  
comparison_sentences = [  
    "A kangaroo hops across the Australian Outback.",  
    "Cherry blossoms bring spring to life in Kyoto, Japan.",  
    "The Empire State Building lights up the Manhattan skyline.",  
]  

embeddings_ny = [generate_embedding(sentence) for sentence in ny_sentences]  
embeddings_comparison = [generate_embedding(sentence) for sentence in comparison_sentences] 

# Calculating and printing the cosine similarity scores.  
for i, (ny, comparison) in enumerate(zip(ny_sentences, comparison_sentences)):  
    similarity_score = cosine_similarity(embeddings_ny[i], embeddings_comparison[i])  
    print(f"NY: {ny}\nComparison: {comparison}\nScore: {similarity_score:.4f}\n") 

NY: The Empire State Building lights up the Manhattan skyline.
Comparison: A kangaroo hops across the Australian Outback.
Score: 0.7595

NY: The Empire State Building lights up the Manhattan skyline.
Comparison: Cherry blossoms bring spring to life in Kyoto, Japan.
Score: 0.8015

NY: The Empire State Building lights up the Manhattan skyline.
Comparison: The Empire State Building lights up the Manhattan skyline.
Score: 1.0000



# Vector Search

In [15]:
import json  
  
# Initialize an empty list to hold all the vectors  
dbpedia_vectors = []  
  
# Load in vectors for Wikipedia articles from a .jsonl file  
with open('dbpedia_samples_embeddings.jsonl', 'r', encoding='utf-8') as json_file:  
    for line in json_file:  
        # Each line is a separate JSON object  
        dbpedia_vectors.append(json.loads(line))  
  


In [38]:
import numpy as np  
import pandas as pd  
  
# Sample query  
query = "notable educational instituions"  
  
# Generate embedding for the query  
query_vector = generate_embedding(query)  
query_vector = np.array(query_vector)  
  
# Calculate the cosine similarity between the query and each vector  
similarity_scores = [cosine_similarity(query_vector, np.array(article['vector'])) for article in dbpedia_vectors]  
  
# Assuming you want to also capture the text associated with each vector  
results = [(article['text'], score) for article, score in zip(dbpedia_vectors, similarity_scores)]  
  
# Create a DataFrame to display the results  
df = pd.DataFrame(results, columns=['Text', 'Similarity Score'])  
  
# Sort the DataFrame by the similarity score in descending order  
df_sorted = df.sort_values(by='Similarity Score', ascending=False)  
  
# Display the top results  
print(df_sorted.head())  


                                                  Text  Similarity Score
123   Blackburn High School is a public secondary s...          0.770751
157   Michael E. DeBakey High School for Health Pro...          0.769030
121   Grand Canyon Preparatory Academy is a public ...          0.757726
101   The Chiltern School is a coeducational specia...          0.753215
155   The University of Nevada School of Medicine i...          0.748609
