<a href="https://colab.research.google.com/github/cjbarrie/promptstability/blob/elli/notebooks/manisfestos%20class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries and Data

In [1]:
! pip install openai
! pip install simpledorff
! pip install torch
! pip install transformers
! pip install sentence_transformers
! pip install SentencePiece




In [2]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import pandas as pd
import openai
import numpy as np
import time
import simpledorff
from openai import OpenAI
import matplotlib.pyplot as plt


import torch
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import os

import simpledorff

from transformers import AutoModelForCausalLM, AutoTokenizer

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go


In [3]:
# count tokens
model_count_tokens = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_count_tokens)

def count_tokens(text):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    # Return the count of tokens
    return len(tokens)

# Change between None and 50 to control text truncation when pd df is displayed:
pd.set_option('display.max_colwidth', None)


In [14]:
class LLMWrapper:
    '''This is a wrapper class for LLMs, which provides a method called 'annotate' that annotates a given message using an LLM.
    '''

    def __init__(self,apikey, model, wait_time=0.8) -> None:
        self.apikey = apikey
        self.model = model

        self.client = OpenAI(
            # This is the default and can be omitted
            api_key=apikey
        )


    def annotate(self, text, prompt, parse_function = None, temperature = 0.1):
        '''
        Annotate the given text in the way the prompt instructs you to.

        Parameters:
        - text (str): text you want classified
        - prompt (str): the classification prompt/instruction
        - temperature (float): how deterministic (low number) vs. random (higher number) your results should be
        - parse_function (method): a method that parses the resulting data.

        Returns:
        - model's response to prompt (classification outcome)
        '''
        failed = True
        tries = 0
        while(failed):
            try:
                response = self.client.chat.completions.create(
                    model = self.model,
                    temperature = temperature,
                    messages = [
                        {"role": "system", "content": f"'{prompt}'"}, #The system instruction tells the bot how it is supposed to behave
                        {"role": "user", "content": f"'{text}'"} #This provides the text to be analyzed.
                    ]
                )
                failed = False

            #Handle errors.
            #If the API gets an error, perhaps because it is overwhelmed, we wait 10 seconds and then we try again.
            # We do this 10 times, and then we give up.
            except openai.APIError as e:
                print(f"OpenAI API returned an API Error: {e}")

                if tries < 10:
                    print(f"Caught an APIError: {e}. Waiting 10 seconds and then trying again...")
                    failed = True
                    tries += 1
                    time.sleep(10)
                else:
                    print(f"Caught an APIError: {e}. Too many exceptions. Giving up.")
                    raise e

            except openai.APIConnectionError as e:
                print(f"Failed to connect to OpenAI API: {e}")
                pass
            except openai.RateLimitError as e:
                print(f"OpenAI API request exceeded rate limit: {e}")
                pass

            #If the text is too long, we truncate it and try again. Note that if you get this error, you probably want to chunk your texts.
            except openai.BadRequestError as e:
                #Shorten request text
                print(f"Received a InvalidRequestError. Request likely too long. {e}")
                raise e

            except Exception as e:
                print(f"Caught unhandled error. {e}")
                raise e

        result = ''
        for choice in response.choices:
            result += choice.message.content

        # Parse the result using provided function
        if parse_function is not None:
            result = parse_function(result)

        return result


class PromptStabilityAnalysis:

    def __init__(self,llm, data, metric_fn=simpledorff.metrics.nominal_metric, parse_function=None) -> None:

        self.llm = llm

        # Get a number for the similarity between two sentences
        self.embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

        #Initiate paraphraser
        model_name = 'tuner007/pegasus_paraphrase'
        self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        self.model = PegasusForConditionalGeneration.from_pretrained(model_name).to(self.torch_device)
        self.parse_function = parse_function  #The function through which to parse the result from the LLM
        self.data = data # The data to be analyzed. Should be a list of texts.
        self.metric_fn = metric_fn #Metric function for KA. e.g., simpledorff.metrics.interval_metric or nominal_metric metric_fn=simpledorff.metrics.nominal_metric

    # Compares similarity between two sentences in sentence embedding space
    def __compare_similarity(self,sent1,sent2):
        emb1 = self.embedding_model.encode(sent1, convert_to_tensor=True)
        emb2 = self.embedding_model.encode(sent2, convert_to_tensor=True)

        return util.pytorch_cos_sim(emb1, emb2)

    # Uses Pegasus to paraphrase a sentence
    def __paraphrase_sentence(self, input_text, num_return_sequences=10, num_beams=50, temperature=1.5):
        batch = self.tokenizer([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(self.torch_device)
        translated = self.model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=temperature)
        tgt_text = self.tokenizer.batch_decode(translated, skip_special_tokens=True)
        return tgt_text

    #This generates paraphrases based on an original text and uses sentence embedding to measure how different they are from the original sentence.
    #prompt_postfix is a fixed addition that is not paraphrased
    def __generate_paraphrases(self,original_text,prompt_postfix,nr_variations,temperature=1.5):
        # Create paraphrases of sentence
        phrases = self.__paraphrase_sentence(original_text,num_return_sequences=nr_variations,temperature=temperature)

        # Measure distances between new and original
        l = [{'similarity':1.0,'phrase':f'{original_text} {prompt_postfix}','original':True}]
        for phrase in phrases:
            sim = self.__compare_similarity(original_text,phrase)
            l.append({'similarity':float(sim),'phrase':f'{phrase} {prompt_postfix}','original':False})

        # Store for future use
        self.paraphrases = pd.DataFrame(l).sort_values(['similarity'])
        display(self.paraphrases)
        return self.paraphrases

    def baseline_stochasticity(self,original_text,prompt_postfix,iterations=10, plot_type='plotly'):
        '''
        This measures the amount of stochasticity there is within the same prompt, running the original prompt <iterations> and
        measuring the KA reliability over runs.
        '''
        prompt = f'{original_text} {prompt_postfix}'
        annotated = []
        ka_scores = []
        iterrations_no = []

        # Run the LLM on the data
        for i in range(iterations):
            print(f"Iteration {i}...")
            for j,d in enumerate(self.data):
                annotation = self.llm.annotate(d, prompt,parse_function=self.parse_function)
                annotated.append({'id':j,'text':d,'annotation':annotation,'iteration':i})

            # Measure the intercoder reliability for each additional repetition (after second iteration)
            if i > 0:
                df = pd.DataFrame(annotated)
                KA = simpledorff.calculate_krippendorffs_alpha_for_df(
                    df,
                    metric_fn=self.metric_fn,
                    experiment_col='id',
                    annotator_col='iteration',
                    class_col='annotation')

                ka_scores.append(KA)
                iterrations_no.append(i + 1)

        print('Finished annotation. Analyzing reliability')

        # plot
        if plot_type == 'sns':
            sns.set()
            # turn iteration_no into integers
            #iterration_no = [int(x) for x in iterration_no]
            sns.scatterplot(x=iterrations_no, y= ka_scores)
            plt.xlabel('Number of Prompt Repetitions')
            plt.ylabel('KA Score')
            plt.title('Reliability (KA) vs. Repetitions')
            plt.ylim(0.0, 1.05)
            plt.xticks(range(0, max(iterrations_no) + 1, 5))
            plt.axhline(y=0.80, color='black', linestyle='--', linewidth=.5)
            plt.show()
        elif plot_type == 'plotly':
            data = {'Repetitions': iterrations_no, 'KA Score': ka_scores}
            df = pd.DataFrame(data)
            # interactive plot
            fig = px.scatter(df, x='Repetitions', y='KA Score', hover_data={'Repetitions': True, 'KA Score': True}, labels={'Repetitions': 'Repetitions', 'KA Score': 'KA Score'})
            # horizontal line: min KA acceptable
            fig.add_trace(go.Scatter(x=[0, max(iterrations_no)], y=[0.80, 0.80], mode='lines', name='KA Threshold', line=dict(color='black', width=.5, dash='dash')))

            fig.update_layout(
                title='Reliability (KA) vs. Repetitions',
                xaxis_title='Number of Prompt Repetitions',
                yaxis_title='KA Score',
                yaxis=dict(range=[0.0, 1.05]),
                hovermode='closest'
            )
            fig.show()

            print(f'Within-prompt KA reliability score for {i + 1} repetitions is {KA}')

        return KA, df, ka_scores, iterrations_no


    def interprompt_stochasticity(self,original_text,prompt_postfix, nr_variations=5, temperature=1.0, iterations=1, plot_type='plotly'):
        '''
        This measures the amount of stochasticity while varying the prompt.
        prompt_postfix: A fixed addition to the prompt. This is not paraphrased. Used to specify output format.
        '''

        # Generate paraphrases
        paraphrases = self.__generate_paraphrases(original_text,prompt_postfix,nr_variations=nr_variations,temperature=temperature)

        annotated = []
        # Run the LLM on the data
        for i, (paraphrase,similarity,original) in enumerate(zip(paraphrases['phrase'],paraphrases['similarity'],paraphrases['original'])):
            print(f"Iteration {i}...")
            for j,d in enumerate(self.data):
                annotation = self.llm.annotate(d, paraphrase,parse_function=self.parse_function)
                annotated.append({'id':j,'text':d,'annotation':annotation,'prompt_id':i,'prompt':paraphrase,'similarity':similarity,'original':original})

        print('Finished annotation. Analyzing reliability')
        annotated_data = pd.DataFrame(annotated)

        self.interprompt_df = annotated_data

        # Measure the interprompt reliability
        KA = simpledorff.calculate_krippendorffs_alpha_for_df(annotated_data,metric_fn=self.metric_fn,experiment_col='id', annotator_col='prompt_id', class_col='annotation')

        rel_vs_sim = self.__calculate_reliability_as_function_of_similarity(annotated_data)
        poor_prompts = rel_vs_sim.loc[rel_vs_sim['KA'] < 0.8]
        original_prompt = original_text + ' ' + prompt_postfix

        if plot_type == 'sns':
            sns.set()
            sns.scatterplot(x=rel_vs_sim['similarity'], y= rel_vs_sim['KA'])
            plt.xlabel('Prompt Similarity')
            plt.ylabel('KA Score')
            plt.title('Reliability vs similarity')
            plt.ylim(0.0, 1.05)
            plt.xlim(0,1)
            plt.axhline(y=0.80, color='black', linestyle='--', linewidth=.5)
            plt.show()
        elif plot_type == 'plotly':
            data = {'Similarity': rel_vs_sim['similarity'], 'KA Score': rel_vs_sim['KA'], 'Prompt_id': rel_vs_sim['prompt_id']}
            df = pd.DataFrame(data)
            # interactive plot
            fig = px.scatter(df, x='Similarity', y='KA Score', hover_data={'Similarity': True, 'KA Score': True, 'Prompt_id': True}, labels={'Similarity': 'Prompt Similarity', 'KA Score': 'KA Score', 'Prompt': 'Prompt_id'})
            # horizontal line: min KA acceptable
            fig.add_trace(go.Scatter(x=[0, 1], y=[0.80, 0.80], mode='lines', name='KA Threshold', line=dict(color='black', width=.5, dash='dash')))
            fig.update_traces(hoverlabel=dict(font=dict(size=12), align='left', namelength=30))

            fig.update_layout(
                title='Reliability (KA) vs. Similarity',
                xaxis_title='Prompt Similarity',
                yaxis_title='KA Score',
                yaxis=dict(range=[0.0, 1.05]),
                xaxis=dict(range=[0, 1]),  # Set x-axis limits,
                hovermode='closest'
            )
            fig.show()

        print(f'Inter-prompt KA reliability score is {KA}.')
        print('\n#################################')
        print(f'Original prompt:\n{original_prompt}')
        print('#################################')
        print('Prompts with poor performance:')
        display(poor_prompts)
        return KA, annotated_data, rel_vs_sim, poor_prompts


    #This calculates the KA-R as a function of the similarity between the prompts
    # Takes the output of the interprompt_stochasticity calculation
    def __calculate_reliability_as_function_of_similarity(self,df=None):
        if df is None:
            df = self.interprompt_df

        l = []
        # This calculates the KA separately one-on-one between the prompts, and then uses the similarity between the prompts to say something.
        for prompt_id in df['prompt_id'].loc[df['original']==False].unique():
            # Go through them one at the time
            dff = df.loc[(df['prompt_id']==prompt_id) | (df['original']==True)]
            ka = simpledorff.calculate_krippendorffs_alpha_for_df(dff,metric_fn=self.metric_fn,experiment_col='id', annotator_col='prompt_id', class_col='annotation')

            # Get the similarity of the prompt. Ugly code.
            similarity = df.loc[df['prompt_id']==prompt_id]['similarity'].values[0]
            prompt_text = df.loc[df['prompt_id']==prompt_id]['prompt'].values[0]
            l.append({'prompt_id':prompt_id, 'prompt_text': prompt_text,'similarity':similarity, 'KA':ka})

        return pd.DataFrame(l)


In [15]:
# Example use
APIKEY = os.getenv("OPENAI_API_KEY")
MODEL = 'gpt-3.5-turbo'
llm = LLMWrapper(model = MODEL, apikey=APIKEY)

data = ["This creature, often seen as a symbol of independence, gracefully navigates its environment, its eyes shining like beacons in the twilight, always landing on its feet after a calculated leap.",
"With a loyalty as steadfast as the north star, this companion greets each day with boundless enthusiasm, its wagging tail a metronome of joy as it awaits the next adventure beside its human.",
"Cloaked in fur as soft as whispers, this enigmatic being prowls the silent corridors of its domain by night, its silhouette a fleeting ghost against the moonlit windows.",
"An epitome of devotion, this guardian's bark is both a welcoming fanfare and a stern warning, its ears perpetually tuned to the sounds of its family's voices, standing watch over its home with unwavering vigilance.",
"This solitary hunter, with whiskers tuned to the slightest rustle, moves like a shadow through the underbrush, its green gaze piercing the darkness, ever in pursuit of the elusive dance of light and shadow.",
"Bearing a heart as wide as the open field it romps through, this creature's howls weave tales of ancestral wolves, yet its gentle eyes betray a soul purely devoted to the companionship of those it calls its pack.",
"As if adorned with the softest down, this silent watcher claims the highest vantage points as its throne, surveying its kingdom with a regal indifference, yet secretly reveling in the warmth of a gentle stroke.",
"In its eyes, one can see the flicker of ancient fireside companions, its presence a comforting weight at the foot of the bed, a protector of dreams and a keeper of secrets, sharing silent conversations in the stillness of night.",
"With agility that belies its delicate form, this creature commands the air itself, its movements a ballet of precision and grace, leaving only the faintest whispers in its wake as it explores realms both high and low.",
"This jovial spirit, with fur muddied from joyful romps in the rain, returns home with a trophy branch larger than itself, a testament to its undying zest for life and the simple pleasures found in a stick well-chased."]

prompt = 'Does the message describe a cat or a dog?'
prompt_postfix = '[Respond 0 for cat, and 1 for dog.]'
psa = PromptStabilityAnalysis(llm, data)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Test of wrapper class
llm.annotate(text='A small pet that says miaow and has a great self-esteem.', prompt='Does the message describe a cat or a dog? Repond 0 for cat, and 1 for dog.')#


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'0'

In [17]:
# Test variability for the same prompt.
res,df, ka_scores, iterrations_no = psa.baseline_stochasticity(prompt,prompt_postfix,iterations=10)
# Test of variability across prompts
res,df,rel_by_sim, poor_prompts = psa.interprompt_stochasticity(prompt,prompt_postfix, nr_variations=50, temperature=1.0, iterations=1)


Iteration 0...
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...
Iteration 6...
Iteration 7...
Iteration 8...
Iteration 9...
Finished annotation. Analyzing reliability


Within-prompt KA reliability score for 10 repetitions is 1.0


Unnamed: 0,similarity,phrase,original
50,0.788384,"Does it say a cat or a dog? [Respond 0 for cat, and 1 for dog.]",False
34,0.831113,"Does the message include any information about a cat or dog? [Respond 0 for cat, and 1 for dog.]",False
48,0.845084,"Do the words describe a cat or dog? [Respond 0 for cat, and 1 for dog.]",False
46,0.846404,"Is the message about cats or dogs? [Respond 0 for cat, and 1 for dog.]",False
20,0.850658,"Is the message about a cat or dog accurate? [Respond 0 for cat, and 1 for dog.]",False
16,0.863973,"The message may or may not describe a cat or dog. [Respond 0 for cat, and 1 for dog.]",False
24,0.869239,"Is the message about a cat or a dog accurate? [Respond 0 for cat, and 1 for dog.]",False
39,0.869776,"There is a message about a cat or dog. [Respond 0 for cat, and 1 for dog.]",False
25,0.870629,"Does the message include information about a cat or dog? [Respond 0 for cat, and 1 for dog.]",False
45,0.878244,"Does the message include information about a cat or a dog? [Respond 0 for cat, and 1 for dog.]",False


Iteration 0...
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...
Iteration 6...
Iteration 7...
Iteration 8...
Iteration 9...
Iteration 10...
Iteration 11...
Iteration 12...
Iteration 13...
Iteration 14...
Iteration 15...
Iteration 16...
Iteration 17...
Iteration 18...
Iteration 19...
Iteration 20...
Iteration 21...
Iteration 22...
Iteration 23...
Iteration 24...
Iteration 25...
Iteration 26...
Iteration 27...
Iteration 28...
Iteration 29...
Iteration 30...
Iteration 31...
Iteration 32...
Iteration 33...
Iteration 34...
Iteration 35...
Iteration 36...
Iteration 37...
Iteration 38...
Iteration 39...
Iteration 40...
Iteration 41...
Iteration 42...
Iteration 43...
Iteration 44...
Iteration 45...
Iteration 46...
Iteration 47...
Iteration 48...
Iteration 49...
Iteration 50...
Finished annotation. Analyzing reliability


Inter-prompt KA reliability score is 0.570491975308642.

#################################
Original prompt:
Does the message describe a cat or a dog? [Respond 0 for cat, and 1 for dog.]
#################################
Prompts with poor performance:


Unnamed: 0,prompt_id,prompt_text,similarity,KA
0,0,"Does it say a cat or a dog? [Respond 0 for cat, and 1 for dog.]",0.788384,0.604167
7,7,"There is a message about a cat or dog. [Respond 0 for cat, and 1 for dog.]",0.869776,-0.043956
9,9,"Does the message include information about a cat or a dog? [Respond 0 for cat, and 1 for dog.]",0.878244,0.791209
12,12,"Does the message say anything about a cat or a dog? [Respond 0 for cat, and 1 for dog.]",0.891563,0.791209
13,13,"Is the message about a dog or a cat? [Respond 0 for cat, and 1 for dog.]",0.896741,-0.043956
14,14,"Is the message about a dog or cat? [Respond 0 for cat, and 1 for dog.]",0.899961,-0.357143
15,15,"The message talks about a cat or dog. [Respond 0 for cat, and 1 for dog.]",0.9016,0.424242
17,17,"Does the message say something about a dog or cat? [Respond 0 for cat, and 1 for dog.]",0.907686,-0.043956
18,18,"Does the message talk about a cat or dog? [Respond 0 for cat, and 1 for dog.]",0.912018,0.62
20,20,"Is it a message about a cat or dog? [Respond 0 for cat, and 1 for dog.]",0.916441,0.208333


In [26]:
# Save similar prompts, similarity and KA for manual validation
'''
df_val = pd.DataFrame(rel_by_sim)
op = prompt + ' ' + prompt_postfix
df_val['original_prompt'] = op
df_val
df_val.to_csv('similarity_validation/data.csv', index=False)
'''

## Party manifesto example

In [32]:
# Party manifesto example
APIKEY = os.getenv("OPENAI_API_KEY")
MODEL = 'gpt-3.5-turbo'
llm = LLMWrapper(model = MODEL, apikey=APIKEY)

# Initialize the model and tokenizer (hugging face model identifier)
try:
    # Try initializing the model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(MODEL)
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    MAX_TOKENS = tokenizer.model_max_length
except OSError:
    # If the model identifier is not valid, set MAX_TOKENS to 16385
    MAX_TOKENS = 16385

# Data (upload promptstability/data/UK_Manifestos.csv)
df = pd.read_csv('UK_Manifestos.csv')
df_small = df.iloc[[41, 44]]

# Truncate
texts = [text[:MAX_TOKENS] for text in df_small['content'].values]

prompt = 'The text provided is a party manifesto for a political party in the United Kingdom. Your task is to evaluate evaluate where it is on the scale from left-wing to right-wing on economic issues. Thank you!'
prompt_postfix = '[Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.]'

psa = PromptStabilityAnalysis(llm, texts,parse_function=lambda x: float(x.strip("'")), metric_fn = simpledorff.metrics.interval_metric)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Test within-prompt reliability
KA,df, ka_scores, iterrations_no = psa.baseline_stochasticity(prompt,prompt_postfix,iterations=20)


Iteration 0...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...
Iteration 6...
Iteration 7...
Iteration 8...
Iteration 9...
Iteration 10...
Iteration 11...
Iteration 12...
Iteration 13...
Iteration 14...
Iteration 15...
Iteration 16...
Iteration 17...
Iteration 18...
Iteration 19...
Finished annotation. Analyzing reliability


Within-prompt KA reliability score for 20 repetitions is 0.9537292748826034


In [34]:
# Test inter-prompt reliability
KA2,annotated_data,rel_vs_sim, poor_prompts = psa.interprompt_stochasticity(prompt, prompt_postfix, nr_variations=50)


Unnamed: 0,similarity,phrase,original
22,0.441821,Your task is to evaluate where it is on the scale from left to right on economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],False
46,0.484995,Your task is to evaluate where it is on the scale from left-wing to right-wing. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],False
28,0.512433,Your task is to evaluate where it is on the scale from left to right-wing on economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],False
25,0.517225,Your task is to evaluate where it is on the scale from left- to right-wing on economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],False
13,0.532201,Your job is to evaluate where it is on the scale from left-wing to right-wing on economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],False
11,0.534921,Your task is to evaluate where it is on a scale from left-wing to right-wing on economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],False
31,0.540145,Your task is to evaluate where it is on the scale from left-wing to right-wing on the economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],False
33,0.541644,Your task is to evaluate where it is on the scale from left-wing to right-wing on economic issues [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],False
23,0.54251,Your task is to evaluate where it is on the scale from left-wing to right-wing when it comes to economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],False
21,0.542552,"Your task is to evaluate where it is on the scale from left-wing to right-wing, on economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.]",False


Iteration 0...
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...
Iteration 6...
Iteration 7...
Iteration 8...
Iteration 9...
Iteration 10...
Iteration 11...
Iteration 12...
Iteration 13...
Iteration 14...
Iteration 15...
Iteration 16...
Iteration 17...
Iteration 18...
Iteration 19...
Iteration 20...
Iteration 21...
Iteration 22...
Iteration 23...
Iteration 24...
Iteration 25...
Iteration 26...
Iteration 27...
Iteration 28...
Iteration 29...
Iteration 30...
Iteration 31...
Iteration 32...
Iteration 33...
Iteration 34...
Iteration 35...
Iteration 36...
Iteration 37...
Iteration 38...
Iteration 39...
Iteration 40...
Iteration 41...
Iteration 42...
Iteration 43...
Iteration 44...
Iteration 45...
Iteration 46...
Iteration 47...
Iteration 48...
Iteration 49...
Iteration 50...
Finished annotation. Analyzing reliability


Inter-prompt KA reliability score is 0.43208508175372573.

#################################
Original prompt:
The text provided is a party manifesto for a political party in the United Kingdom. Your task is to evaluate evaluate where it is on the scale from left-wing to right-wing on economic issues. Thank you! [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.]
#################################
Prompts with poor performance:


Unnamed: 0,prompt_id,prompt_text,similarity,KA
0,0,Your task is to evaluate where it is on the scale from left to right on economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],0.441821,0.210526
6,6,Your task is to evaluate where it is on the scale from left-wing to right-wing on the economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],0.540145,0.745763
8,8,Your task is to evaluate where it is on the scale from left-wing to right-wing when it comes to economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],0.54251,0.166667
11,11,You need to evaluate where it is on the scale from left-wing to right-wing on economic issues. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.],0.54817,0.0
12,12,"Your task is to evaluate where it is on the scale from left-wing to right-wing on economic issues, based on the text provided. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.]",0.587178,0.0
13,13,"To evaluate where it is on the scale from left-wing to right-wing on economic issues, you have to read the text. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.]",0.629891,-0.25
14,14,"To evaluate where it is on the scale from left-wing to right-wing on economic issues, you must read the text provided. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.]",0.630138,0.571429
15,15,"In order to evaluate where it is on the scale from left-wing to right-wing on economic issues, you have to read the text. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.]",0.631631,0.0
17,17,"In order to evaluate where it is on the scale from left-wing to right-wing on economic issues, you must read the text provided. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.]",0.635788,0.7
18,18,"In order to evaluate where it is on the scale from left-wing to right-wing on economic issues, you have to read the text provided. [Respond with a number from 1 to 10. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else.]",0.636739,0.7


In [39]:
# Save similar prompts, similarity and KA for manual validation
'''
df_val = pd.DataFrame(rel_vs_sim)
op = prompt + ' ' + prompt_postfix
df_val['original_prompt'] = op
#df_val
df_val.to_csv('similarity_validation/data.csv', mode='a', header=False, index=False)
'''

## Twitter annotation example

In [40]:
# Twitter annotation example
#APIKEY = "[KEY]"
MODEL = 'gpt-3.5-turbo'
llm = LLMWrapper(model = MODEL, apikey=APIKEY)

MAX_TOKENS = 4096

df = pd.read_csv('tweets.csv')
df = df.sample(10)

# Truncate etc
texts = list(df['text'].values)

prompt = 'The following is a Twitter message written either by a Republican or a Democrat before the 2020 election. Your task is to guess whether the author is Republican or Democrat.'
prompt_postfix = '[Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]'

psa = PromptStabilityAnalysis(llm, texts,parse_function=lambda x: float(x), metric_fn = simpledorff.metrics.nominal_metric)

# Test variability for the same prompt.
res,df, ka_scores, iterrations_no = psa.baseline_stochasticity(prompt,prompt_postfix,iterations=20)
#print(f"The KA reliability for the main message is {res}")

# Test of variability across prompts
res,df,rel_vs_sim, poor_prompts = psa.interprompt_stochasticity(prompt,prompt_postfix, nr_variations=50, temperature=1.0, iterations=1)
#print(f"The KA inter reliability for varying the prompt is {res}")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iteration 0...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...
Iteration 6...
Iteration 7...
Iteration 8...
Iteration 9...
Iteration 10...
Iteration 11...
Iteration 12...
Iteration 13...
Iteration 14...
Iteration 15...
Iteration 16...
Iteration 17...
Iteration 18...
Iteration 19...
Finished annotation. Analyzing reliability


Within-prompt KA reliability score for 20 repetitions is 1.0


Unnamed: 0,similarity,phrase,original
36,0.643809,"Do you know if the author is a Republican or a Democrat? [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",False
22,0.643842,"If you guess correctly, the author is a Republican or a Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",False
27,0.643904,"If you guess correctly, the author is either a Republican or Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",False
12,0.644552,"If you guess correctly, the author is either a Republican or a Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",False
38,0.647343,"If you guess correctly, the author is a Republican or Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",False
46,0.649457,"Do you know whether the author is a Republican or a Democrat? [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",False
49,0.656337,"Do you know whether the author is a Republican or Democrat? [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",False
45,0.665647,"The author is either a Republican or a Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",False
29,0.676579,"Your task is to figure out if the author is a Republican or Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",False
3,0.677734,"You have to guess if the author is a Republican or a Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",False


Iteration 0...
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...
Iteration 6...
Iteration 7...
Iteration 8...
Iteration 9...
Iteration 10...
Iteration 11...
Iteration 12...
Iteration 13...
Iteration 14...
Iteration 15...
Iteration 16...
Iteration 17...
Iteration 18...
Iteration 19...
Iteration 20...
Iteration 21...
Iteration 22...
Iteration 23...
Iteration 24...
Iteration 25...
Iteration 26...
Iteration 27...
Iteration 28...
Iteration 29...
Iteration 30...
Iteration 31...
Iteration 32...
Iteration 33...
Iteration 34...
Iteration 35...
Iteration 36...
Iteration 37...
Iteration 38...
Iteration 39...
Iteration 40...
Iteration 41...
Iteration 42...
Iteration 43...
Iteration 44...
Iteration 45...
Iteration 46...
Iteration 47...
Iteration 48...
Iteration 49...
Iteration 50...
Finished annotation. Analyzing reliability


Inter-prompt KA reliability score is 0.8098652786282915.

#################################
Original prompt:
The following is a Twitter message written either by a Republican or a Democrat before the 2020 election. Your task is to guess whether the author is Republican or Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]
#################################
Prompts with poor performance:


Unnamed: 0,prompt_id,prompt_text,similarity,KA
1,1,"If you guess correctly, the author is a Republican or a Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",0.643842,0.746667
2,2,"If you guess correctly, the author is either a Republican or Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",0.643904,0.791209
3,3,"If you guess correctly, the author is either a Republican or a Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",0.644552,0.791209
4,4,"If you guess correctly, the author is a Republican or Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",0.647343,0.547619
6,6,"Do you know whether the author is a Republican or Democrat? [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",0.656337,0.746667
7,7,"The author is either a Republican or a Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",0.665647,0.746667
8,8,"Your task is to figure out if the author is a Republican or Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",0.676579,0.746667
20,20,"You must guess whether the author is a Republican or Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",0.691848,0.791209
23,23,"Your task is to guess if the author is Republican or Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",0.693591,0.791209
25,25,"The following is written by either a Republican or a Democrat. [Respond 0 for Democrat, or 1 for Republican. Guess is you do not know. Respond nothing else.]",0.693917,0.547619


In [45]:
# Save similar prompts, similarity and KA for manual validation
'''
df_val = pd.DataFrame(rel_vs_sim)
op = prompt + ' ' + prompt_postfix
df_val['original_prompt'] = op
#df_val
df_val.to_csv('similarity_validation/data.csv', mode='a', header=False, index=False)
'''