# Appendix 1: Classifying the political belonging of regular people

This appendix uses ChatGPT to annotate messages from random users whose political affiliation is not known a priori. This allows us to validate that the LLM does not merely memorize the political affiliation associated with particular messages.

We begin by defining a function that uses ChatGPT to guess the political affiliation of an individual.

In [None]:
import time

import numpy as np
import openai
import pandas as pd
from scipy import stats

def guess_tweet(tweet, model, temperature, max_retries=50, retry_delay=10):
    """
    Ask a chat model to guess the political affiliation of a tweet's author.

    Parameters:
    tweet (str): The tweet to be analyzed.
    model (str): The language model to use (e.g., 'gpt-4').
    temperature (float): The temperature setting for the model.
    max_retries (int): How many failed API calls are retried before giving
        up. With the default of 50, the 51st failure is re-raised.
    retry_delay (float): Seconds to wait between a failed call and the
        next attempt.

    Returns:
    str: The message content of all returned choices, concatenated.

    Raises:
    Exception: Whatever the last API call raised, once more than
        `max_retries` calls have failed.

    NOTE: `openai.ChatCompletion.create` is the pre-1.0 interface of the
    `openai` package; this cell needs an `openai<1.0` installation.
    """
    print(f"Guessing tweet: '{tweet}'...")

    messages = [
        {"role": "system", "content": "You will be given a Twitter post from an individual in the United States, sent during the two months preceding the 2020 US presidential election, that is, between September 3rd, 2020, and November 3rd, 2020. Your task is to use your knowledge of US politics to make an educated guess on whether the poster is a Democrat or Republican. Your response MUST BE either 'Democrat' or 'Republican'. If you cannot make an educated guess on the basis of the message, just guess either 'Democrat' or 'Republican'. Do NOT motivate your answer."},
        {"role": "user", "content": f"'{tweet}'"}
    ]

    # The API is at times unstable, so we catch exceptions and try repeatedly
    failures = 0
    while True:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                temperature=temperature,
                messages=messages,
            )
            break  # Successfully obtained a response
        except Exception as e:
            print("Caught exception. Waiting...")
            print(e)
            failures += 1

            # Decide whether to give up *before* sleeping, so the final
            # failure is reported immediately instead of after a useless wait.
            if failures > max_retries:
                print("Too many failures. Giving up.")
                raise  # Bare raise keeps the original traceback intact

            time.sleep(retry_delay)  # Wait before retrying

    # Concatenate the content of all choices in the response
    return ''.join(choice.message.content for choice in response.choices)




In [None]:
# We now load the messages from regular people and annotate them one at a
# time. Progress is saved after every successful guess, and only rows whose
# COLUMN list is still empty are processed, so an interrupted run can simply
# be restarted.

COLUMN = 'gpt4_temp02'
# Progress is written back to the same pickle it was loaded from. Saving to a
# different file would make a restarted run begin from scratch, and would
# leave this file without the annotations.
DATA_PATH = "regularpeoplellm.pkl"
MAX_ERRORS = 10  # Errors tolerated over the whole run before aborting

sample = pd.read_pickle(DATA_PATH)
errorcount = 0

while True:
    # Filter rows that have fewer than 1 entry in the specified column
    left = sample.loc[sample[COLUMN].map(len) < 1]
    print(f"There are {len(left)} left to process.")

    # If no rows are left to process, exit the loop
    if left.empty:
        print("All done!")
        break

    # Randomly sample one row from the remaining rows
    line = left.sample()
    index = line.index[0]

    # Wait for a second before making the next API call
    time.sleep(1)
    try:
        # Call the guess_tweet function with the specified parameters
        guess = guess_tweet(line['text'].iloc[0], model='gpt-4', temperature=0.2)

        # Each cell of COLUMN holds a list; append the guess to it in place
        sample.at[index, COLUMN].append(guess)

        print(f"Guess is: {guess}")

        # Save the updated DataFrame back to the pickle file
        sample.to_pickle(DATA_PATH)

    except Exception as err:
        print(f"Error. Unexpected {err=}, {type(err)=}")
        if errorcount < MAX_ERRORS:
            # Count the error and keep going; the row stays unannotated and
            # will be picked up again by a later iteration.
            errorcount += 1
            print("Error running. Will just keep trying though.")
            continue
        else:
            # If too many errors occur, stop the process and raise the exception
            print("Too many errors. Giving up.")
            raise

## Analyze results

In [None]:
# Load the DataFrame stored in 'regularpeoplellm.pkl'.
# NOTE(review): the cells below compare its `gpt4_temp02` column directly
# with the expert labels, so it presumably holds one label per row by this
# point — confirm.
randomllm = pd.read_pickle('regularpeoplellm.pkl')

In [None]:
# First expert's classifications of the same messages; only the message
# identifier (`ind`) and the expert's label (`label1`) are kept.
randomexpert1 = pd.read_csv('RegularPeopleExpert1.csv').loc[:, ['ind', 'label1']]

In [None]:
# Second expert's classifications. The CSV's columns are renamed to match
# the first expert's naming (`ind`, `label2`) before being selected.
randomexpert2 = (
    pd.read_csv('RegularPeopleExpert2.csv')
    .rename(columns={'Column1': 'ind', 'label': 'label2'})
    .loc[:, ['ind', 'label2']]
)

In [None]:
# Combine both experts' labels and the LLM annotations into one table,
# joining on the shared `ind` column (default inner join, so only rows
# present in all three sources are kept).
randommerge = (
    randomexpert1
    .merge(randomexpert2, on='ind')
    .merge(randomllm, on='ind')
)


In [None]:
# Calculate the agreement rate (share of identical labels) between each pair of classifiers, with a 95% confidence interval.

In [None]:
# Agreement between Expert 1 and the LLM: c1 holds 1 where the two labels
# are equal and 0 otherwise; i1 is the 95% Student-t confidence interval
# around the mean of c1.
# NOTE(review): assumes `gpt4_temp02` holds a single label per row that is
# directly comparable to `label1` — confirm.
c1 = (randommerge.label1 == randommerge.gpt4_temp02).astype(int).tolist()
i1 = stats.t.interval(0.95, len(c1) - 1, loc=np.mean(c1), scale=stats.sem(c1))


In [None]:
# Agreement between Expert 2 and the LLM: c2 holds 1 where the two labels
# are equal and 0 otherwise; i2 is the 95% Student-t confidence interval
# around the mean of c2.
# NOTE(review): assumes `gpt4_temp02` holds a single label per row that is
# directly comparable to `label2` — confirm.
c2 = (randommerge.label2 == randommerge.gpt4_temp02).astype(int).tolist()
i2 = stats.t.interval(0.95, len(c2) - 1, loc=np.mean(c2), scale=stats.sem(c2))

In [None]:
# Agreement between the two experts (the human baseline): c0 holds 1 where
# their labels are equal and 0 otherwise; i0 is the 95% Student-t confidence
# interval around the mean of c0.
c0 = (randommerge.label1 == randommerge.label2).astype(int).tolist()
i0 = stats.t.interval(0.95, len(c0) - 1, loc=np.mean(c0), scale=stats.sem(c0))

In [None]:
# Draw the figure used in the paper: one point per pairwise comparison at
# its mean agreement rate, with a horizontal error bar equal to half the
# width of the corresponding 95% confidence interval.
import matplotlib.pyplot as plt
import numpy as np

# The three lists are kept in the same order so labels, means and error
# bars line up.
comp = ['LLM vs Expert 2', 'LLM vs Expert 1', 'Expert 1 vs Expert 2']
means = [np.mean(matches) for matches in (c2, c1, c0)]
intervals = [(upper - lower) / 2 for lower, upper in (i2, i1, i0)]

fig, ax = plt.subplots(figsize=(8, 2))
ax.errorbar(
    x=means,
    y=comp,
    xerr=intervals,
    fmt='o',
    capsize=5,
    markersize=8,
    color='black',
)
ax.set_xlabel('Correspondance')
ax.set_xlim(0., 1)

plt.tight_layout()
plt.margins(y=0.3, tight=True)

# Export the same figure in raster and vector formats, then display it.
for extension in ('png', 'eps', 'pdf'):
    plt.savefig(f'figure_randompeople.{extension}', dpi=300)
plt.show()