# What in the world is a Vector Embedding?

In [2]:
! pip install azure-identity
! pip install openai
! pip install python-dotenv
! pip install numpy pandas



In [3]:
import os
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from dotenv import load_dotenv
from openai import AzureOpenAI

# Read the Azure OpenAI settings from environment variables (load_dotenv() is
# called first so values can also be supplied that way).
# NOTE: os.getenv returns None for a variable that is not set, so the values
# below may be None despite the `str` annotations.
load_dotenv()
AZURE_OPENAI_ENDPOINT: str = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY: str = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_API_VERSION: str = "2023-05-15"
AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME: str = os.getenv(
    "AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME"
)

# Bearer-token provider for Azure AD authentication. It is always constructed,
# but only handed to the client when `use_aad_for_aoai` is True.
credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    credential, "https://cognitiveservices.azure.com/.default"
)

# True  -> authenticate with Azure Active Directory (token provider)
# False -> authenticate with the API key from AZURE_OPENAI_API_KEY
use_aad_for_aoai = True

# Exactly one authentication keyword is passed to the client; the endpoint and
# API version are the same for both modes.
auth_kwargs = (
    {"azure_ad_token_provider": token_provider}
    if use_aad_for_aoai
    else {"api_key": AZURE_OPENAI_API_KEY}
)
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_OPENAI_API_VERSION,
    **auth_kwargs,
)


# Example function to generate document embedding
def generate_embedding(text: str):
    """Return the embedding vector for ``text``.

    The request is sent through the module-level ``client`` using the
    deployment named by ``AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME``.

    Args:
        text: The string to embed.

    Returns:
        The ``embedding`` attribute of the first entry in the response's
        ``data`` list.
    """
    response = client.embeddings.create(
        input=text,
        model=AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME,
    )
    # A single input was sent, so only the first result is read.
    first_result = response.data[0]
    return first_result.embedding

In [4]:
# Generate an embedding for the provided text
generate_embedding("The NY Knicks are going to win the NBA Finals this year!")

[-0.0022307680919766426,
 -0.015993472188711166,
 -0.0035352003760635853,
 -0.02250933274626732,
 -0.018211638554930687,
 -0.0002172084350604564,
 -0.015199471265077591,
 -0.01590524986386299,
 -0.005400475580245256,
 0.0036832881160080433,
 0.03296999633312225,
 0.007851799950003624,
 0.003491088980808854,
 0.0002471410552971065,
 0.003935352433472872,
 -0.025017371401190758,
 0.03304561600089073,
 -0.012691432610154152,
 0.029264653101563454,
 -0.029995638877153397,
 -0.0029050398152321577,
 0.0216901246458292,
 0.005879397504031658,
 -0.026895249262452126,
 -0.004209471866488457,
 0.011134935542941093,
 -0.0026057136710733175,
 -0.011147539131343365,
 0.01590524986386299,
 -0.03327247500419617,
 0.01015818677842617,
 0.010624505579471588,
 -0.027096901088953018,
 -0.05005994811654091,
 -0.021715329959988594,
 -0.003929050639271736,
 -0.005696650594472885,
 -0.025408070534467697,
 0.006257493514567614,
 -0.008450452238321304,
 0.017342016100883484,
 -0.016447188332676888,
 -0.0021945

# Document similarity measured with cosine similarity

In [9]:
import numpy as np


def cosine_similarity(vec_a, vec_b):
    """Calculate the cosine similarity between two vectors.

    Args:
        vec_a: First vector (a sequence or array of numbers).
        vec_b: Second vector, the same length as ``vec_a``.

    Returns:
        ``dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b))``.
        If either vector has zero magnitude the cosine is undefined; ``0.0``
        is returned in that case instead of ``nan``, so the result can still
        be compared and sorted.
    """
    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    # Guard against 0/0, which would otherwise yield nan and a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return np.dot(vec_a, vec_b) / denominator


# Reference sentence, listed once per comparison so the two lists pair up
# element by element.
ny_sentences = [
    "The Empire State Building lights up the Manhattan skyline.",
    "The Empire State Building lights up the Manhattan skyline.",
    "The Empire State Building lights up the Manhattan skyline.",
]

# Sentences to score against the reference: two unrelated topics and one
# exact duplicate of the reference sentence.
comparison_sentences = [
    "A kangaroo hops across the Australian Outback.",
    "Cherry blossoms bring spring to life in Kyoto, Japan.",
    "The Empire State Building lights up the Manhattan skyline.",
]

# Embed every sentence (one generate_embedding call per list entry).
embeddings_ny = list(map(generate_embedding, ny_sentences))
embeddings_comparison = list(map(generate_embedding, comparison_sentences))

# Walk the sentences and their embeddings in lockstep and report each score.
for ny, comparison, ny_vector, comparison_vector in zip(
    ny_sentences, comparison_sentences, embeddings_ny, embeddings_comparison
):
    similarity_score = cosine_similarity(ny_vector, comparison_vector)
    print(f"NY: {ny}\nComparison: {comparison}\nScore: {similarity_score:.4f}\n")

NY: The Empire State Building lights up the Manhattan skyline.
Comparison: A kangaroo hops across the Australian Outback.
Score: 0.7595

NY: The Empire State Building lights up the Manhattan skyline.
Comparison: Cherry blossoms bring spring to life in Kyoto, Japan.
Score: 0.8015

NY: The Empire State Building lights up the Manhattan skyline.
Comparison: The Empire State Building lights up the Manhattan skyline.
Score: 1.0000



# Vector Search

In [15]:
import json  
  
# Load the precomputed vectors for Wikipedia articles from a .jsonl file.
# Each non-empty line is a separate JSON object; blank lines (for example a
# trailing empty line at the end of the file) are skipped, because passing
# them to json.loads would raise JSONDecodeError.
with open('dbpedia_samples_embeddings.jsonl', 'r', encoding='utf-8') as json_file:
    dbpedia_vectors = [json.loads(line) for line in json_file if line.strip()]
  


In [38]:
import numpy as np  
import pandas as pd  
  
# Sample query.
# NOTE(review): "instituions" is misspelled; it is left untouched here because
# the query text determines the embedding and therefore the scores shown in
# the recorded output.
query = "notable educational instituions"

# Embed the query and hold it as a NumPy array.
query_vector = np.array(generate_embedding(query))

# Score every loaded article against the query, in the order they were loaded.
article_vectors = (np.array(article['vector']) for article in dbpedia_vectors)
similarity_scores = [
    cosine_similarity(query_vector, article_vector)
    for article_vector in article_vectors
]

# Pair each article's text with its score.
results = list(
    zip((article['text'] for article in dbpedia_vectors), similarity_scores)
)

# Tabulate the results, then order them from most to least similar.
df = pd.DataFrame(results, columns=['Text', 'Similarity Score'])
df_sorted = df.sort_values(by='Similarity Score', ascending=False)

# Show the best matches (head() returns the first five rows).
print(df_sorted.head())


                                                  Text  Similarity Score
123   Blackburn High School is a public secondary s...          0.770751
157   Michael E. DeBakey High School for Health Pro...          0.769030
121   Grand Canyon Preparatory Academy is a public ...          0.757726
101   The Chiltern School is a coeducational specia...          0.753215
155   The University of Nevada School of Medicine i...          0.748609
