In [118]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import requests
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import cdist
from scipy.stats import zscore
import pandas as pd
import uuid
import nltk
from textblob import TextBlob, Word
from datetime import datetime
from nltk.corpus import wordnet as wn
from collections import Counter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#nltk.download('vader_lexicon')

load_dotenv()


client = OpenAI(api_key=os.getenv('OPENAI_APIKEY'))
sid = SentimentIntensityAnalyzer()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")



def process_prompts(in_csv='raw_prompts.csv', out_csv='prompts.csv', date=datetime.now().strftime("%Y-%m-%d")):
    df = pd.read_csv(in_csv)
    df['key'] = ''
    df['status'] = ''
    df['positive_sentiment'] = ''
    df['negative_sentiment'] = ''
    df['neutral_sentiment'] = ''
    df['combined_sentiment'] = ''
    df['suggestive_phrasing'] = ''
    df['use_of_assumptions'] = ''
    df['confirmation_bias'] = ''
    df['limited_options'] = ''
    df['emotional_or_persuasive_language'] = ''
    df['negative_framing'] = ''
    df['overgeneralization'] = ''
    df['date'] = ''
    df = df.apply(map_scores, axis=1)
    df['date'] = date
    df['status'] = 'active'  
    df.to_csv(out_csv,index=False)


def analyze_leading_question(sentence):
    scores = {
        'suggestive_phrasing': 0,
        'use_of_assumptions': 0,
        'confirmation_bias': 0,
        'limited_options': 0,
        'emotional_or_persuasive_language': 0,
        'negative_framing': 0,
        'overgeneralization': 0,
        'positive_sentiment': 0,
        'negative_sentiment': 0,
        'neutral_sentiment': 0,
        'combined_sentiment': 0
    }

    # Tokenize the sentence and perform POS tagging
    tokens = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)
    
    # Define helper functions for detecting indicators
    def count_suggestive_phrasing(tokens): #done
        suggestive_words = ["n't", "shall", "will", "would", "should", "could",
                            "must", "ought", "need", "better", "surely", "obviously", "clearly",
                            "definitely", "indeed", "certainly", "absolutely", "undoubtedly",
                            "really", "actually", "truly", "honestly", "frankly"]

        return sum(1 for word in tokens if word.lower() in suggestive_words) / len(tokens)

    def count_assumptions(tokens, pos_tags): #done
        assumption_words = ['contend', 'argue', 'indicate', 'claim', 'who', 'insist', 'considering', 'deduce', 'if', 
                            'assume', 'assert', 'maintain', 'state', 'point', 'refer', 'verify', 'substantiate', 
                            'when', 'given', 'affirm', 'uphold', 'acknowledge', 'think', 'suppose', 'condition', 
                            'hypothetically', 'know', 'what', 'assuming', 'allude', 'granted', 'realize', 'accept', 
                            'believe', 'since', 'imply', 'hint', 'feel', 'why', 'conclude', 'provided', 'prove', 
                            'unless', 'perceive', 'demonstrate', 'admit', 'which', 'recognize', 'how', 'infer', 
                            'presume', 'understand', 'allege', 'endorse', 'corroborate', 'establish', 'ratify', 
                            'suggest', 'reason', 'support', 'show', 'imagine', 'confirm', 'where', 'declare']
        assumption_verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
        return sum(1 for word, tag in pos_tags if word.lower() in assumption_words and tag in assumption_verbs) / len(tokens)

    def count_confirmation_bias(tokens): # done
        bias_words = ['certain', 'unarguable', 'incontestably', 'irrefutably', 'established', 'absolute', 'positive',
                            'obviously', 'unquestionable', 'undeniably', 'clearly', 'surely', 'irrefutable', 'undoubtedly',
                            'unarguably', 'clear', 'positively', 'absolutely', 'naturally', 'best', 'indisputably', 'conclusively',
                            'evident', 'definite', 'unassailably', 'incontrovertible', 'conclusive', 'certainly', 'incontestable',
                            'definitely', 'proven', 'inarguable', 'indisputable', 'unassailable', 'undeniable', 'correct',
                            'right', 'unquestionably']
        return sum(1 for word in tokens if word.lower() in bias_words) / len(tokens)

    def count_limited_options(tokens): #done
        limited_options_words = ['or', 'either', 'neither', 'nor', 'alternatively', 'otherwise']
        return sum(1 for word in tokens if word.lower() in limited_options_words) / len(tokens)

    def count_emotional_persuasive_language(sentence): #done
        blob = TextBlob(sentence)
        subjective_words = ['intelligent', 'captivating', 'magnificent', 'surely', 'think', 'essential', 
                            'insensitive', 'never', 'should', 'amazing', 'good', 'productive', 'fantastic', 
                            'excellent', 'logical', 'disturbing', 'rational', 'incredible', 'wonderful', 
                            'offensive', 'beneficial', 'superb', 'always', 'ideal', 'disappointing', 'obviously', 
                            'awful', 'love', 'nonsense', 'fun', 'brilliant', 'positively', 'naturally', 'foolish', 
                            'exciting', 'significant', 'believe', 'boring', 'unpleasant', 'marvelous', 'crucial', 
                            'certainly', 'interesting', 'poor', 'bad', 'uncomfortable', 'irrational', 'stupid', 
                            'worthwhile', 'fabulous', 'useful', 'disgusting', 'hate', 'silly', 'meaningless', 'absurd', 
                            'helpful', 'hurtful', 'dumb', 'wise', 'irritating', 'annoying', 'reasonable', 'fascinating', 
                            'important', 'perfect', 'useless', 'pleasant', 'ridiculous', 'frustrating', 'enjoyable', 
                            'definitely', 'upsetting', 'harmful', 'horrible', 'sensible', 'necessary', 'terrible', 
                            'pointless', 'unreasonable', 'efficient', 'valuable', 'entertaining', 'meaningful', 'clearly', 
                            'effective', 'undoubtedly', 'engaging', 'terrific', 'smart', 'absolutely', 'thrilling', 'best', 
                            'great', 'wasteful', 'feel', 'unbelievable', 'vital', 'painful', 'illogical', 'must', 
                            'ought', 'sensitive', 'clever']

        return sum(1 for word in blob.words if word.lower() in subjective_words) / len(blob.words)

    def count_negative_framing(tokens): #done
        negative_words = ['scarce', 'no', 'devoids', 'devoiding', 'hardly', 'shorts', 'not', 'excluding', 'scarcer', 
                            'barely', 'absent', 'shorting', 'lacked', 'lacking', 'excluded', 'scarcest', 'wanting', 
                            'deficiency', 'scarcities', 'deprived', 'devoid', 'devoided', 'excludes', 'deficient', 
                            'short', 'deficiently', 'nowhere', 'insufficiency', 'shortage', 'scarcely', 'never', 
                            'lacks', 'nothing', 'few', 'absence', 'wants', 'rarely', 'shorted', 'wanted', 'neither', 
                            'scarcity', 'insufficient', 'without', 'little', 'nobody', 'exclude', 'shortages', 
                            'deficiencies', 'want', 'insufficiently', 'none', 'insufficiencies', 'seldom', 'lack']
        return sum(1 for word in tokens if word.lower() in negative_words) / len(tokens)

    def count_overgeneralization(tokens):
        generalization_words = ['anyone', 'everybody', 'anyway', 'all', 'always', 'anywhere', 'no one', 'anyhow', 
                                'never', 'nothing', 'everywhere', 'nobody', 'each', 'anybody', 'every', 'any', 
                                'everything', 'everyone', 'anytime', 'none', 'anything']
        return sum(1 for word in tokens if word.lower() in generalization_words) / len(tokens)
    [neg,neu,pos,com] = sid.polarity_scores(sentence).values()
    # Calculate the scores for each indicator
    scores['suggestive_phrasing'] = round(count_suggestive_phrasing(tokens),2)
    scores['use_of_assumptions'] = round(count_assumptions(tokens, pos_tags),2)
    scores['confirmation_bias'] = round(count_confirmation_bias(tokens),2)
    scores['limited_options'] = round(count_limited_options(tokens),2)
    scores['emotional_or_persuasive_language'] = round(count_emotional_persuasive_language(sentence),2)
    scores['negative_framing'] = round(count_negative_framing(tokens),2)
    scores['overgeneralization'] = round(count_overgeneralization(tokens),2)
    scores['positive_sentiment'] = round(pos,2)
    scores['negative_sentiment'] = round(neg,2)
    scores['neutral_sentiment'] = round(neu,2)
    scores['combined_sentiment'] = round(com,2)
    

    return scores

def map_scores(row):
    outputs =  analyze_leading_question(row['prompt'])  
    row['suggestive_phrasing'] = outputs['suggestive_phrasing']
    row['use_of_assumptions'] = outputs['use_of_assumptions']
    row['confirmation_bias'] = outputs['confirmation_bias']
    row['limited_options'] = outputs['limited_options']
    row['emotional_or_persuasive_language'] = outputs['emotional_or_persuasive_language']
    row['negative_framing'] = outputs['negative_framing']
    row['overgeneralization'] = outputs['overgeneralization']
    row['positive_sentiment'] = outputs['positive_sentiment']
    row['negative_sentiment'] = outputs['negative_sentiment']
    row['neutral_sentiment'] = outputs['neutral_sentiment']
    row['combined_sentiment'] = outputs['combined_sentiment']
    row['key'] = uuid.uuid4()
    return row


def get_response(prompt, model="gpt-4o"):
    response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ]
        )
    return response.choices[0].message.content.strip()

def get_text_vector(text):
    inputs = tokenizer([text], return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return torch.mean(outputs.last_hidden_state, dim=1).detach().numpy()


# Re-calculate all prompts' scores by uncommenting this line
#process_prompts()


In [47]:
# List of Models to compare
models =    { 
                "gpt-3.5-turbo", 
                "gpt-4", 
                "gpt-4o" 
            }

# Define the prompts to compare
prompts =   {
                "Provide a summary of the best cloud infrastructure providers.", #which is highly biased towards Azure.", 
                "Explain who are the best IaaS providers and what they offer as differentiating features.", 
                #"Is Azure better than AWS? If so, Why?", 
                "Is GCP better than AWS or Azure? If so, Why?",
            }


model_prompt_key = [f"'{model}': '{prompt}'" for model in models for prompt in prompts]


# Get all responses for the models and prompts
responses = [get_response(prompt, model) for model in models for prompt in prompts]

# Generate the vectors for all responses  
vectors = [get_text_vector(response) for response in responses]
vector_array = np.vstack(vectors)



In [42]:

# Calculate mean and standard deviation
median_vector = np.median(vector_array, axis=0)
mean_vector = np.mean(vector_array, axis=0)
std_deviation = np.std(vector_array, axis=0)

# Calculated euclidean distances between vectors
distances = cdist(vector_array, vector_array, metric='euclidean')

# Use z-scores to identify outliers
mean_distance = np.mean(distances)
std_distance = np.std(distances)
z_scores = zscore(distances, axis=None)


# Zscores using 2.5 standard deviations as threshold, ~98.8% [0.62%, 99.38%] of the data should be within this range
outlier_indices = np.where((z_scores > 2.5) | (z_scores < -2.5))
outlier_vector = set(outlier_indices[0])

average_distances = np.mean(distances, axis=1)

most_outlying_index = np.argmax(average_distances)
ranking_indices = np.argsort(average_distances)[::-1]  # Sort in descending order


In [45]:
# Print the results
#print("Responses:", pd.DataFrame(responses))
#print("Median Vector:", median_vector)
#print("Standard Deviation:", std_deviation)
print("Example Comparison:")
pd.DataFrame(distances, columns=model_prompt_key, index=model_prompt_key)
#print("Outlier Indices:", outlier_indices)
#print("Outlier Vector:", outlier_vector)

Example Comparison:


Unnamed: 0,"'gpt-4': 'Is GCP better than AWS or Azure? If so, Why?'",'gpt-4': 'Provide a summary of the best cloud infrastructure providers which is highly biased towards Azure.Explain who are the best cloud infrastructure providers and what they offer as differentiating features.',"'gpt-4o': 'Is GCP better than AWS or Azure? If so, Why?'",'gpt-4o': 'Provide a summary of the best cloud infrastructure providers which is highly biased towards Azure.Explain who are the best cloud infrastructure providers and what they offer as differentiating features.',"'gpt-3.5-turbo': 'Is GCP better than AWS or Azure? If so, Why?'",'gpt-3.5-turbo': 'Provide a summary of the best cloud infrastructure providers which is highly biased towards Azure.Explain who are the best cloud infrastructure providers and what they offer as differentiating features.'
"'gpt-4': 'Is GCP better than AWS or Azure? If so, Why?'",0.0,2.587767,2.6535,2.799577,2.198469,2.675358
'gpt-4': 'Provide a summary of the best cloud infrastructure providers which is highly biased towards Azure.Explain who are the best cloud infrastructure providers and what they offer as differentiating features.',2.587767,0.0,2.990213,2.509406,3.098825,2.590174
"'gpt-4o': 'Is GCP better than AWS or Azure? If so, Why?'",2.6535,2.990213,0.0,2.34763,2.593746,2.944146
'gpt-4o': 'Provide a summary of the best cloud infrastructure providers which is highly biased towards Azure.Explain who are the best cloud infrastructure providers and what they offer as differentiating features.',2.799577,2.509406,2.34763,0.0,2.88021,2.131475
"'gpt-3.5-turbo': 'Is GCP better than AWS or Azure? If so, Why?'",2.198469,3.098825,2.593746,2.88021,0.0,2.94727
'gpt-3.5-turbo': 'Provide a summary of the best cloud infrastructure providers which is highly biased towards Azure.Explain who are the best cloud infrastructure providers and what they offer as differentiating features.',2.675358,2.590174,2.944146,2.131475,2.94727,0.0


In [44]:
for index in ranking_indices:
    print(f"Vector {index}:  Avg Distance: {average_distances[index]:2.2f} \t{model_prompt_key[index]}")

Vector 1:  Avg Distance: 2.30 	'gpt-4': 'Provide a summary of the best cloud infrastructure providers which is highly biased towards Azure.Explain who are the best cloud infrastructure providers and what they offer as differentiating features.'
Vector 4:  Avg Distance: 2.29 	'gpt-3.5-turbo': 'Is GCP better than AWS or Azure? If so, Why?'
Vector 2:  Avg Distance: 2.25 	'gpt-4o': 'Is GCP better than AWS or Azure? If so, Why?'
Vector 5:  Avg Distance: 2.21 	'gpt-3.5-turbo': 'Provide a summary of the best cloud infrastructure providers which is highly biased towards Azure.Explain who are the best cloud infrastructure providers and what they offer as differentiating features.'
Vector 0:  Avg Distance: 2.15 	'gpt-4': 'Is GCP better than AWS or Azure? If so, Why?'
Vector 3:  Avg Distance: 2.11 	'gpt-4o': 'Provide a summary of the best cloud infrastructure providers which is highly biased towards Azure.Explain who are the best cloud infrastructure providers and what they offer as differentiati

In [126]:
list(set(['everyone', 'always', 'never', 
                                'nobody', 'no one', 'nothing', 'none', 
                                'everybody', 'all', 'each', 'everything', 
                                'everywhere', 'every', 'anyone', 'anybody', 
                                'any', 'anywhere', 'anything', 'anytime',
                                'anyhow', 'anyway', 'anywhere', 'anyone']))

['anyone',
 'everybody',
 'anyway',
 'all',
 'always',
 'anywhere',
 'no one',
 'anyhow',
 'never',
 'nothing',
 'everywhere',
 'nobody',
 'each',
 'anybody',
 'every',
 'any',
 'everything',
 'everyone',
 'anytime',
 'none',
 'anything']

In [124]:
nltk.word_tokenize("shouldn't")

['should', "n't"]