In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import requests
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import cdist
from scipy.stats import zscore
import pandas as pd

load_dotenv()

client = OpenAI(api_key=os.getenv('OPENAI_APIKEY'))

# Load a pre-trained text embedding model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")


# List of Models to compare
models =    { 
                "gpt-3.5-turbo", 
                "gpt-4", 
                "gpt-4o" 
            }

# Define the prompts to compare
prompts =   {
                "Explain who are the best cloud infrastructure providers and what they offer as differentiating features.", 
                "Is Azure better than AWS? If so, Why?", 
                "Is GCP better than Azure? If so, Why?",
            }


model_prompt_key = [f"'{model}': '{prompt}'" for model in models for prompt in prompts]

def get_response(prompt, model="gpt-4o"):
    response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ]
        )
    return response.choices[0].message.content.strip()

def get_text_vector(text):
    inputs = tokenizer([text], return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return torch.mean(outputs.last_hidden_state, dim=1).detach().numpy()



In [3]:
# Get all responses for the models and prompts
responses = [get_response(prompt, model) for model in models for prompt in prompts]

# Generate the vectors for all responses  
vectors = [get_text_vector(response) for response in responses]
vector_array = np.vstack(vectors)



In [4]:

# Calculate mean and standard deviation
median_vector = np.median(vector_array, axis=0)
mean_vector = np.mean(vector_array, axis=0)
std_deviation = np.std(vector_array, axis=0)

# Calculated euclidean distances between vectors
distances = cdist(vector_array, vector_array, metric='euclidean')

# Use z-scores to identify outliers
mean_distance = np.mean(distances)
std_distance = np.std(distances)
z_scores = zscore(distances, axis=None)


# Zscores using 2.5 standard deviations as threshold, ~98.8% [0.62%, 99.38%] of the data should be within this range
outlier_indices = np.where((z_scores > 2.5) | (z_scores < -2.5))
outlier_vector = set(outlier_indices[0])

average_distances = np.mean(distances, axis=1)

most_outlying_index = np.argmax(average_distances)
ranking_indices = np.argsort(average_distances)[::-1]  # Sort in descending order


In [33]:
# Print the results
#print("Responses:", pd.DataFrame(responses))
#print("Median Vector:", median_vector)
#print("Standard Deviation:", std_deviation)
print("Example Comparison:")
pd.DataFrame(distances, columns=model_prompt_key, index=model_prompt_key).head(1).transpose().head(4)
#print("Outlier Indices:", outlier_indices)
#print("Outlier Vector:", outlier_vector)




Distances:


Unnamed: 0,"'gpt-4': 'Is Azure better than AWS? If so, Why?'"
"'gpt-4': 'Is Azure better than AWS? If so, Why?'",0.0
"'gpt-4': 'Is GCP better than Azure? If so, Why?'",1.926646
'gpt-4': 'Explain who are the best cloud infrastructure providers and what they offer as differentiating features.',2.439336
"'gpt-4o': 'Is Azure better than AWS? If so, Why?'",3.059673


In [19]:
for index in ranking_indices:
    print(f"Vector {index}:  Avg Distance: {average_distances[index]:2.2f} \t{model_prompt_key[index]}")


Vector 5:  Avg Distance: 3.30 	'gpt-4o': 'Explain who are the best cloud infrastructure providers and what they offer as differentiating features.'
Vector 6:  Avg Distance: 2.69 	'gpt-3.5-turbo': 'Is Azure better than AWS? If so, Why?'
Vector 2:  Avg Distance: 2.54 	'gpt-4': 'Explain who are the best cloud infrastructure providers and what they offer as differentiating features.'
Vector 4:  Avg Distance: 2.49 	'gpt-4o': 'Is GCP better than Azure? If so, Why?'
Vector 1:  Avg Distance: 2.48 	'gpt-4': 'Is GCP better than Azure? If so, Why?'
Vector 3:  Avg Distance: 2.45 	'gpt-4o': 'Is Azure better than AWS? If so, Why?'
Vector 0:  Avg Distance: 2.37 	'gpt-4': 'Is Azure better than AWS? If so, Why?'
Vector 8:  Avg Distance: 2.37 	'gpt-3.5-turbo': 'Explain who are the best cloud infrastructure providers and what they offer as differentiating features.'
Vector 7:  Avg Distance: 2.36 	'gpt-3.5-turbo': 'Is GCP better than Azure? If so, Why?'
