# Appendix 2: Reliability to variation in the prompt

To test whether the LLM is stable under variations in the prompt instructions, we run the LLM with paraphrased versions of the original annotation instructions, and then measure the reliability using Krippendorff's alpha.

In [None]:
# Annotate a single message by sending it, together with the annotation
# instruction, to the OpenAI chat-completion API.
def guess_tweet(tweet, model, temperature, instruction, max_retries=50, wait_seconds=10):
    """
    Annotate a tweet with a chat model and return the model's raw answer.

    Parameters:
    tweet (str): The tweet to be analyzed.
    model (str): The language model to use (e.g., 'gpt-4').
    temperature (float): The temperature setting for the model.
    instruction (str): The annotation prompt, sent as the system message.
    max_retries (int): How many times a failed API call is retried before
        giving up (default 50, i.e. at most 51 attempts in total).
    wait_seconds (float): Pause between a failed attempt and the next retry
        (default 10).

    Returns:
    str: The message content of all choices in the response, concatenated.

    Raises:
    Exception: Whatever the last API call raised, once all retries have
        been used up. The original traceback is preserved.
    """
    failures = 0

    # The API is at times unstable, so we catch exceptions and try repeatedly
    while True:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                temperature=temperature,
                messages=[
                    {"role": "system", "content": f"{instruction}"},  # The annotation prompt
                    {"role": "user", "content": f"'{tweet}'"}  # The message to annotate
                ]
            )
            break  # Successfully obtained a response

        except Exception as e:
            failures += 1
            print("Caught exception. Waiting...")
            print(e)

            # Check the budget before sleeping, so that the final failure is
            # reported immediately instead of after one more pointless wait.
            if failures > max_retries:
                print("Too many failures. Giving up.")
                raise

            time.sleep(wait_seconds)

    # Concatenate the content of all choices in the response
    return ''.join(choice.message.content for choice in response.choices)


def process_country(instruction, file, temperature, column, nr_runs, model='gpt-4'):
    """
    Annotate a pickled dataset until every row holds nr_runs predictions.

    Parameters:
    instruction (str): Instruction passed on to guess_tweet for every prediction.
    file (str): Path to the pickle file containing the dataset; it is read
        once at the start and overwritten after every successful prediction.
    temperature (float): Temperature setting for the language model.
    column (str): Name of the column whose per-row lists collect the predictions.
    nr_runs (int): Number of predictions wanted for each row.
    model (str): The model to be used for generating predictions (default is 'gpt-4').

    Returns:
    None. Results are appended to the lists in `column` and saved to `file`.

    Rows that still have fewer than nr_runs predictions are picked one at a
    time at random. The first 10 errors raised while predicting or saving
    are only reported; the next one is re-raised and ends the run.
    """
    sample = pd.read_pickle(file)
    errorcount = 0

    while True:
        # Rows whose prediction list is still shorter than requested
        runs_so_far = sample[column].map(len)
        left = sample.loc[runs_so_far < nr_runs]
        remaining = len(left)
        print(f"There are {remaining} left to process.")

        if not remaining:
            print("All done!")
            break

        # Pick one unfinished row at random and remember its index label
        line = left.sample()
        index = line.index.values[0]

        # Pause briefly between consecutive API calls
        time.sleep(1)
        try:
            text = line['text'].values[0]
            guess = guess_tweet(text, model=model, temperature=temperature, instruction=instruction)

            # The cell holds a list, so appending updates the DataFrame in place
            sample[column][index].append(guess)
            print(f"Guess is: {guess}")

            # Checkpoint after every prediction so progress survives a crash
            sample.to_pickle(file)

        except Exception as err:
            print(f"Error. Unexpected {err=}, {type(err)=}")
            if errorcount >= 10:
                # Error budget exhausted: stop and surface the exception
                print("Too many errors. Giving up.")
                raise

            # Still within budget: count the error and go around again
            errorcount += 1
            print("Error running. Will just keep trying though.")


In [None]:
# First paraphrase of the annotation instruction
variation1 = "You will be given a tweet sent by a Senator in the United States. Your job is to say whether you think the Senator belongs to the Democratic or Republican party. The tweet was sent during the two months preceding the 2020 election in the United States. Respond ONLY 'Democrat' or 'Republican'"

FILE = 'US_sample_tweets_llm.pkl'
MODEL = 'gpt-4'
TEMP = 0.2
NR_RUNS = 1
COLUMN = 'gpt4_temp02_variation1'
INSTRUCTION = variation1

# Create the empty result column only if it does not exist yet.
# process_country resumes from whatever is stored in the pickle, so re-running
# this cell after an interruption must not wipe annotations already collected.
gdf = pd.read_pickle(FILE)
if COLUMN not in gdf.columns:
    gdf[COLUMN] = [[] for _ in range(len(gdf))]
    gdf.to_pickle(FILE)

process_country(INSTRUCTION, FILE, TEMP, COLUMN, NR_RUNS, MODEL)



In [None]:
# Second paraphrase of the annotation instruction
variation2 = "Please classify the following Twitter post based on whether the author is a Republican or a Democratic Senator. The tweet was sent before the 2020 election, between September 3rd, 2020, and November 3rd, 2020. Respond ONLY 'Democrat' or 'Republican'"
INSTRUCTION = variation2

FILE = 'US_sample_tweets_llm.pkl'
MODEL = 'gpt-4'
TEMP = 0.2
NR_RUNS = 1
COLUMN = 'gpt4_temp02_variation2'

# Create the empty result column only if it does not exist yet.
# process_country resumes from whatever is stored in the pickle, so re-running
# this cell after an interruption must not wipe annotations already collected.
gdf = pd.read_pickle(FILE)
if COLUMN not in gdf.columns:
    gdf[COLUMN] = [[] for _ in range(len(gdf))]
    gdf.to_pickle(FILE)

process_country(INSTRUCTION, FILE, TEMP, COLUMN, NR_RUNS, MODEL)



## Calculate Krippendorff's alpha

In [None]:
!pip install simpledorff


Collecting simpledorff
  Downloading simpledorff-0.0.2-py3-none-any.whl (5.6 kB)
Installing collected packages: simpledorff
Successfully installed simpledorff-0.0.2


In [None]:
import simpledorff
import pandas as pd
import random
import numpy as np

In [None]:
# Resampling helper that puts a confidence interval around Krippendorff's alpha (KA)
def manual_bootstrap(df, experiment_col, annotator_col, class_col, ci=0.95, samplesize=300, iterations=1000, random_state=None):
    """
    Estimate a confidence interval for Krippendorff's alpha by resampling units.

    In every iteration, `samplesize` distinct values of `experiment_col` are
    drawn WITHOUT replacement, the rows belonging to those units are kept,
    and alpha is computed on that subset with simpledorff.

    Parameters:
    df (pandas.DataFrame): One row per (unit, annotator) annotation.
    experiment_col (str): Column identifying the annotated unit.
    annotator_col (str): Column identifying the annotator.
    class_col (str): Column holding the assigned label.
    ci (float): Coverage of the percentile interval (default 0.95).
    samplesize (int): Number of distinct units drawn per iteration (default 300).
    iterations (int): Number of resampling rounds (default 1000).
    random_state (int or None): Seed for a private random generator. With the
        default None the global numpy random state is used, as before.

    Returns:
    tuple: (mean of the resampled alphas, array with the lower and upper
        percentile bounds of the resampled alphas).

    Raises:
    ValueError: If samplesize exceeds the number of distinct units in df.
    """
    # The set of units does not change between iterations, so compute it once
    unit_ids = df[experiment_col].unique()
    if samplesize > len(unit_ids):
        raise ValueError(
            f"samplesize ({samplesize}) exceeds the number of distinct "
            f"'{experiment_col}' values ({len(unit_ids)})"
        )

    # np.random and RandomState expose the same choice() call
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    res = []
    for _ in range(iterations):
        randomids = set(rng.choice(unit_ids, samplesize, False))
        sample = df.loc[df[experiment_col].isin(randomids)]
        res.append(
            simpledorff.calculate_krippendorffs_alpha_for_df(
                sample,
                experiment_col=experiment_col,
                annotator_col=annotator_col,
                class_col=class_col,
            )
        )

    # Equal-tailed percentile interval, e.g. 2.5 and 97.5 for ci=0.95
    tail = (1 - ci) / 2
    return np.mean(res), np.percentile(res, [100 * tail, 100 * (1 - tail)])

In [None]:
# LLM 0.2: the first five entries of each row's 'gpt4_temp02' list are
# treated as five separate coders (coder_id 0-4) of the same document
data = pd.DataFrame([
    {'document_id': tweet['id'], 'coder_id': run, 'annotation': tweet['gpt4_temp02'][run]}
    for _, tweet in llm.iterrows()
    for run in range(5)
])
llm02KA = simpledorff.calculate_krippendorffs_alpha_for_df(
    data,
    experiment_col='document_id',
    annotator_col='coder_id',
    class_col='annotation',
)
llm02KAci = manual_bootstrap(
    data,
    experiment_col='document_id',
    annotator_col='coder_id',
    class_col='annotation',
)
# Distance from the bootstrap mean down to the lower percentile bound
llm02interval = llm02KAci[0] - llm02KAci[1][0]

In [None]:
# LLM 1.0: the first five entries of each row's 'gpt4_temp10' list are
# treated as five separate coders (coder_id 0-4) of the same document
data = pd.DataFrame([
    {'document_id': tweet['id'], 'coder_id': run, 'annotation': tweet['gpt4_temp10'][run]}
    for _, tweet in llm.iterrows()
    for run in range(5)
])
llm10KA = simpledorff.calculate_krippendorffs_alpha_for_df(
    data,
    experiment_col='document_id',
    annotator_col='coder_id',
    class_col='annotation',
)
llm10KAci = manual_bootstrap(
    data,
    experiment_col='document_id',
    annotator_col='coder_id',
    class_col='annotation',
)
# Distance from the bootstrap mean down to the lower percentile bound
llm10interval = llm10KAci[0] - llm10KAci[1][0]

In [None]:
# LLM variations: the first answer under the original prompt (coder 0) and
# under each of the two paraphrased prompts (coders 1 and 2) act as three
# coders of the same document. Rows are emitted coder by coder.
data = pd.DataFrame([
    {'document_id': tweet['id'], 'coder_id': coder, 'annotation': tweet[answer_col][0]}
    for coder, answer_col in enumerate(
        ['gpt4_temp02', 'gpt4_temp02_variation1', 'gpt4_temp02_variation2']
    )
    for _, tweet in llm.iterrows()
])
llm02varKA = simpledorff.calculate_krippendorffs_alpha_for_df(
    data,
    experiment_col='document_id',
    annotator_col='coder_id',
    class_col='annotation',
)
llm02varKAci = manual_bootstrap(
    data,
    experiment_col='document_id',
    annotator_col='coder_id',
    class_col='annotation',
)
# Distance from the bootstrap mean down to the lower percentile bound
llm02varinterval = llm02varKAci[0] - llm02varKAci[1][0]

In [None]:
# MTurk: alpha over the crowd-worker annotations, one coder per 'workerid'
mturkKA = simpledorff.calculate_krippendorffs_alpha_for_df(
    mturk,
    experiment_col='id',
    annotator_col='workerid',
    class_col='answer',
)
mturkKAci = manual_bootstrap(
    mturk,
    experiment_col='id',
    annotator_col='workerid',
    class_col='answer',
)
# Distance from the bootstrap mean down to the lower percentile bound
mturkinterval = mturkKAci[0] - mturkKAci[1][0]

In [None]:
# Experts: alpha over the expert annotations, one coder per 'expert'
expertKA = simpledorff.calculate_krippendorffs_alpha_for_df(
    experts,
    experiment_col='id',
    annotator_col='expert',
    class_col='answer',
)
expertsKAci = manual_bootstrap(
    experts,
    experiment_col='id',
    annotator_col='expert',
    class_col='answer',
)
# Distance from the bootstrap mean down to the lower percentile bound
expertinterval = expertsKAci[0] - expertsKAci[1][0]

In [None]:
# Plot the reliability

import matplotlib.pyplot as plt
import numpy as np

# Krippendorff's alpha and error-bar half-width for each of the five groups
means = [llm02KA, llm10KA, llm02varKA, expertKA, mturkKA]
confidence_intervals = [llm02interval, llm10interval, llm02varinterval, expertinterval, mturkinterval]

# Define the axis labels for each group
x_labels = ['LLM t=0.2', 'LLM t=1.0', 'LLM t=0.2 Variations', 'Expert', 'MTurk']

# Set the figure size and dpi
fig, ax = plt.subplots(figsize=(7, 4), dpi=300)

colors = ['blue', 'orange', 'yellow', 'green', 'red']

# One horizontal bar per group. The loop variable is named `color` so that
# it does not overwrite the `colors` list it is iterating over.
for pos, y, err, color in zip(x_labels, means, confidence_intervals, colors):
    ax.barh(pos, y, xerr=err, capsize=4, alpha=0.4, color=color)

ax.tick_params(axis='both', which='major', labelsize=10)
ax.set_xlabel("Krippendorff's Alpha", fontsize=12)

ax.spines['top'].set_visible(True)
ax.spines['right'].set_visible(True)

# Set the padding between the plot and the edge of the figure
plt.tight_layout(pad=1)

# Save the figure in raster (PNG) and vector (EPS, PDF) formats
# NOTE(review): PostScript/EPS has no transparency support, so the alpha=0.4
# bars may come out opaque in the .eps file - confirm the output looks right.
plt.savefig('./figure_krippen.png', dpi=300)
plt.savefig('./figure_krippen.eps', dpi=300)
plt.savefig('./figure_krippen.pdf', dpi=300)