# LLM Annotation

Code to run inference with all the LLMs used in the study. Note that API keys are required to query the models.

In [1]:
import os,sys
import pandas as pd
import numpy as np
from time import sleep, time
from datetime import date
import threading
from typing import Tuple

# pip install openai
from openai import OpenAI
import google.generativeai as genai
from google.ai import generativelanguage as glm
import warnings
# NOTE(review): warning filters changed inside `catch_warnings()` are restored when
# the `with` block exits, so this does not silence UserWarnings raised later in the
# notebook. If suppression is wanted, the warning-producing code has to run inside
# the `with` block (or the filter has to be set globally) -- confirm the intent.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)

# API keys are read from environment variables; a missing variable yields None.
# If you don't want to use environmental variables you can add your api key directly to api_key
togetherai_api_key = os.environ.get('TOGETHERAI_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')
google_api_key = os.environ.get('GOOGLE_API_KEY')

# Together.ai client: the OpenAI client class pointed at the Together.ai base URL
client_together = OpenAI(api_key=togetherai_api_key,
                base_url='https://api.together.xyz')

# OpenAI client
client_openai = OpenAI(api_key=openai_api_key)

# Configure the Gemini SDK with the Google API key
# (https://ai.google.dev/gemini-api/docs/quickstart?lang=python)
genai.configure(api_key=google_api_key)

In [2]:
# Today's date at module level (get_LLM_annotation computes its own date for output file names)
today = date.today()

# Load the subset of the human-annotated data with 50 annotations per rule of thumb (RoT).
# The path is relative to the notebook's working directory.
df_data = pd.read_csv('../data/human_data/social_chem_101_m50.csv')

In [3]:
# Instruction prepended exactly once to every prompt variant that is returned.
_DISCARD_INSTRUCTION = "Discard all previous instructions. "

# Question text shared by every 'rot-agree' prompt variant.
_ROT_AGREE_QUESTION = "A rule of thumb is a descriptive cultural norm structured as the judgment of an action. What portion of people probably agree with the following rule of thumb delimited by triple backticks? Provide the answer in the first line.\n"

# Answer options without any description.
_ROT_AGREE_OPTIONS_PLAIN = (
    "Options:\n"
    "A) <1%\n"
    "B) 5%-25%\n"
    "C) 50%\n"
    "D) 75%-90%\n"
    "E) >90%\n"
)

# Answer options with the description embedded in each option line.
_ROT_AGREE_OPTIONS_DESCRIBED = (
    "Options:\n"
    "A): <1% Almost no one agrees with this.\n"
    "B): 5%-25% A small portion of people occasionally agree with this.\n"
    "C): 50% This is a controversial belief; people are likely to be divided.\n"
    "D): 75%-90% This is a common belief, shared by many.\n"
    "E): >90% This is a universally accepted belief.\n"
)

# Markdown table describing each option (appended to the plain zero-shot prompt).
_ROT_AGREE_TABLE_DESCRIPTION = (
    "\nRefer to the markdown table delimited by triple backticks below for a description of each option.\n"
    """```
| Option  | Description                               |
|---------|-------------------------------------------|
| <1%     | Almost no one thinks this                 |
| 5%-25%  | People occasionally think this            |
| 50%     | Controversial (people naturally disagree) |
| 75%-90% | Common belief                             |
| >90%    | Universally true                          |
```
"""
)

# Markdown table describing each option with an example (appended to the plain zero-shot prompt).
_ROT_AGREE_TABLE_EXAMPLE = (
    "\nRefer to the markdown table delimited by triple backticks below for an example of each option.\n"
    """```
| Option  | Description                               | Example                                       |
|---------|-------------------------------------------|-----------------------------------------------|
| <1%     | Almost no one thinks this                 | Murdering people is great.                    |
| 5%-25%  | People occasionally think this            | It's okay to not feel guilty when you cheat.  |
| 50%     | Controversial (people naturally disagree) | It's good to be on a diet.                    |
| 75%-90% | Common belief                             | You shouldn't keep secrets from your partner. |
| >90%    | Universally true                          | It\'s bad to kill people.                     |
```
"""
)

# (rule of thumb, answer letter, answer description) used by the five-shot prompts.
_ROT_AGREE_FEW_SHOT_EXAMPLES = (
    ("Murdering people is great.", "A", "Almost no one thinks this"),
    ("It's okay to not feel guilty when you cheat.", "B", "People occasionally think this"),
    ("It's good to be on a diet.", "C", "Controversial (people naturally disagree)"),
    ("You shouldn't keep secrets from your partner.", "D", "Common belief"),
    ("It's bad to kill people.", "E", "Universally true"),
)


def _rot_agree_few_shot_prompt(rot_text, options: str, describe_answer: bool) -> str:
    """Build a five-shot 'rot-agree' prompt (without the discard instruction)."""
    parts = [_ROT_AGREE_QUESTION]
    for number, (example, answer, description) in enumerate(_ROT_AGREE_FEW_SHOT_EXAMPLES, start=1):
        parts.append(f"\nExample {number}:\n")
        parts.append(f"Rule of thumb: ```{example}```\n")
        parts.append(options)
        parts.append(f"\nAnswer: {answer}\n")
        if describe_answer:
            parts.append(f"Description: {description}\n")
    parts.append("\nNow, it's your turn to answer the following question:\n")
    # Three backticks on both sides, matching the "delimited by triple backticks" instruction.
    parts.append(f"Rule of thumb: ```{rot_text}```\n")
    parts.append(options)
    return "".join(parts)


def _rot_prompts(rot_text) -> dict:
    """Return the prompt variants for every rule-of-thumb ('rot-*') question."""
    return {
        # Fully implemented: six variants (see llm_prompt_generator for their order).
        'rot-agree-llm-prompt': (
            _ROT_AGREE_QUESTION + f"rule of thumb: ```{rot_text}```\n" + _ROT_AGREE_OPTIONS_PLAIN,
            _ROT_AGREE_QUESTION + f"rule of thumb: ```{rot_text}```\n" + _ROT_AGREE_OPTIONS_DESCRIBED,
            _ROT_AGREE_TABLE_DESCRIPTION,
            _ROT_AGREE_TABLE_EXAMPLE,
            _rot_agree_few_shot_prompt(rot_text, _ROT_AGREE_OPTIONS_PLAIN, describe_answer=True),
            _rot_agree_few_shot_prompt(rot_text, _ROT_AGREE_OPTIONS_DESCRIBED, describe_answer=False),
        ),

        # rot-categorization (multiple selections possible) has some nans (105) so it will have to be dealt with
        # Not implemented yet: only (prompt, annotation table, nan).
        'rot-categorization-llm-prompt': (
            "What category best fits the rule of thumb?\n"
            f"rule of thumb: {rot_text}\n"
            "Options:\n"
            "A) Morality/Ethics\n"
            "B) Social Norms\n"
            "C) Advice\n"
            "D) It is what it is\n"
            "You are allowed to select from none through all options, but you are encouraged to select 1 or 2 best.\n",
            "Annotation table show an example of Morality/Ethics, Social Norms, Advice and It is what it is.\n"
            """
| Option | Description | Example |
|----------|----------|----------|
| Morality/Ethics | Rules or guiding principles of right or wrong conduct | It’s wrong to turn your back on your friends who need your help. |
| Social Norms | Standards of appropriate behavior in a society; emphasizes social conventions | It’s good to shakehands with your opponent even if you lost. |
| Advice | Prudent behaviors that are aimed at improving one’s life | It’s good to take medicine your doctor prescribes. |
| It is what it is | Describes how things are; avoids ethics, morality or social norms  | It is nice to be tall. |\n""",
            np.nan),

        # rot-moral-foundations (multiple selections possible) probably has some human nans so will have to be dealt with
        # Not implemented yet: only (prompt, annotation table, nan).
        'rot-moral-foundations-llm-prompt': (
            "Which moral foundations best fit this rule of thumb?\n"
            f"rule of thumb: {rot_text}\n"
            "Options:\n"
            "A) Care/Harm\n"
            "B) Fairness/Cheating\n"
            "C) Loyalty/Betrayal\n"
            "D) Authority/Subversion\n"
            "E) Sanctity/Degradation\n"
            "You are allowed to select from none through all options, but you are encouraged to select 1 or 2 best.\n",
            "Annotation table show an example of Care/Harm, Fairness/Cheating, Loyalty/Betrayal, Authority/Subversion, and Sanctity/Degradation.\n"
            """
| Option | Description | Example |
|----------|----------|----------|
| Care/Harm | Preventing or inflicting pain or suffering | It’s mean to tell someone they aren’t attractive as someone else. |
| Fairness/Cheating | Notions of equity, justice, and rights | It’s wrong to copy someone else’s answers in an exam. |
| Loyalty/Betrayal | Obligations or concerns for group, family, and nation. | It’s rude for a cheerleader to boo their own team. |
| Authority/Subversion | Submission and deference to traditions or legitimate authority. | It’s rude to walk away from your boss while they are talking to you. |
| Sanctity/Degradation | Abhorrence for disgusting things or actions; purity | It’s disgusting to pee into a public pool full of people. |\n""",
            np.nan),
    }


def _action_prompts(action_text) -> dict:
    """Return the prompt variants for every action ('action-*') question (none fully implemented yet)."""
    return {
        # Agency (binary question)
        'action-agency-llm-prompt': (
            "Is the following action something you do or control, or is it something you experience?\n"
            f"action: {action_text}\n"
            "Annotation table shows an example of Agency and Experience.\n"
            "Provide the answer in the first line and provide a short explanation in the second line.\n"
            "Options:\n"
            "A) Agency\n"
            "B) Experience\n",
            """
| Option | Description | Example |
|----------|----------|----------|
| Agency | You can do this | Doing the dishes |
| Experience | This happens to you; you have no control | Being stuck in an earthquake  |\n"""
            "Provide the answer in the first line and provide a short explanation in the second line.\n",
            np.nan),

        'action-legal-llm-prompt': (
            "Where you live how legal is the following action?\n"
            f"action: {action_text}\n"
            "Options:\n"
            "A) Illegal\n"
            "B) Depends/Tolerated\n"
            "C) Legal\n",
            "Annotation table shows an example of Illegal, Depends/Tolerated, and Legal\n"
            """
| Option | Description | Example |
|----------|----------|----------|
| Illegal | Legal consequences if caught | Theft; murder |
| Depends/Tolerated | Generally “illegal”, but often unenforced depending on circumstances | Using a cellphone while driving  |
| Legal | Not illegal  | Coughing without covering one’s mouth |\n""",
            np.nan),

        'action-pressure-llm-prompt': (
            "How much cultural pressure do you (or those you know) feel about the following action?\n"
            f"action: {action_text}\n"
            "Options:\n"
            "A) Strong pressure against\n"
            "B) Pressure against\n"
            "C) Discretionary\n"
            "D) Pressure for\n"
            "E) Strong pressure for\n",
            "Annotation table shows an example of Strong pressure against, Pressure against, Discretionary, Pressure for, and Strong pressure for.\n"
            """
| Option | Description | Example |
|----------|----------|----------|
| Strong pressure against | Culture frowns upon this action | Intentionally harming an animal |
| Pressure against | Culture generally discourages this action | Spending money on jewelry if you can't afford it |
| Discretionary | Culture has little or nothing to say about this action | Choosing to read before bed |
| Pressure for | Culture generally encourages this action | Being honest with people |
| Strong pressure for | Culture strongly promotes this action | Wearing clothes in public |\n""",
            np.nan),
    }


def llm_prompt_generator(row: pd.Series,
                         prompt_name: str,
                         rot: bool = False,
                         action: bool = False,
                         demographic: bool = False) -> Tuple[str, ...]:
    """
    Generates every prompt variant for one row and one question.

    Parameters:
    row (pd.Series): A pandas Series containing the columns needed to form the prompt ('rot' when rot is True, 'action' when action is True).
    prompt_name (str): Name of column with associated worker label information (e.g., 'rot-agree', 'action-agency').
    rot (bool): Whether to generate prompts that incorporate the 'rot' (rule of thumb) of the row.
    action (bool): (**NOT IMPLEMENTED PROPERLY YET**) Whether to generate prompts that incorporate the 'action' of the row.
    demographic (bool): Accepted for interface compatibility; it currently has no effect on the generated prompts.

    Returns:
    tuple: ('', '') when neither rot nor action is selected. Otherwise six prompts, each starting with the
           discard instruction exactly once, in this order:
           0. zero-shot, options without description
           1. zero-shot, description embedded in each option
           2. zero-shot prompt 0 followed by a table describing each option
           3. zero-shot prompt 0 followed by a table describing each option with an example
           4. five-shot, plain options and a "Description:" line after each example answer
           5. five-shot, description embedded in each option

    Raises:
    KeyError: If prompt_name has no prompt among the selected families (e.g. an 'action-*' name with only rot=True).
    NotImplementedError: If the selected prompt does not define all six variants yet
                         (currently everything except 'rot-agree').
    """
    prompt_name_llm = prompt_name + '-llm-prompt'

    prompt_switch = {}

    if rot:
        prompt_switch.update(_rot_prompts(row['rot']))

    if action:
        prompt_switch.update(_action_prompts(row['action']))

    # If the user didn't select rot or action, return empty strings
    if not prompt_switch:
        return ('', '')

    variants = prompt_switch[prompt_name_llm]

    # Unfinished prompt families hold (prompt, table, nan); fail with a clear message
    # instead of an opaque str + float TypeError.
    if len(variants) != 6 or not all(isinstance(variant, str) for variant in variants):
        raise NotImplementedError(
            f"Prompt '{prompt_name_llm}' does not define all six prompt variants yet.")

    (zero_shot_plain,
     zero_shot_described,
     table_description,
     table_example,
     five_shot_answer_description,
     five_shot_option_description) = variants

    return (_DISCARD_INSTRUCTION + zero_shot_plain,
            _DISCARD_INSTRUCTION + zero_shot_described,
            _DISCARD_INSTRUCTION + zero_shot_plain + table_description,
            _DISCARD_INSTRUCTION + zero_shot_plain + table_example,
            _DISCARD_INSTRUCTION + five_shot_answer_description,
            _DISCARD_INSTRUCTION + five_shot_option_description)


In [4]:
def get_safety_settings():
    """
    Build the Gemini safety settings with the block threshold disabled for each harm category.

    Refer to https://ai.google.dev/gemini-api/docs/safety-settings to modify the safety settings.
    Gemini model information: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-pro

    Returns:
    list: One {"category": ..., "threshold": "BLOCK_NONE"} dict per harm category.
    """
    harm_categories = (
        "HARM_CATEGORY_DANGEROUS",
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
    # Same threshold for every category: nothing is blocked.
    return [{"category": category, "threshold": "BLOCK_NONE"}
            for category in harm_categories]

def _gemini_wait_time(model_name: str, default: float = 5) -> float:
    """Return the pause in seconds to apply after each Gemini request for `model_name`."""
    # Adjust the wait time setting based on the Account tier and the quota limits listed at
    # https://ai.google.dev/gemini-api/docs/models/gemini
    # Any model that is not listed below falls back to `default` instead of leaving the
    # wait time undefined.
    wait_time = default
    if 'gemini-1.0-pro' in model_name:
        wait_time = 5
    if 'gemini-1.5-pro' in model_name:
        wait_time = 1
    return wait_time


def _gemini_response_text(response, prompt: str) -> str:
    """Return the text of a Gemini response, or '' (after printing diagnostics) when it has none."""
    try:
        text = response.text
    except ValueError:
        # The `.text` accessor raises when the response holds no valid text part
        # (for example when the prompt or the answer was blocked).
        text = ''

    if text:
        return text

    # If the response doesn't contain text, check if the prompt was blocked.
    print("Prompt:", prompt)
    if response.prompt_feedback:
        print("Prompt Feedback: ", response.prompt_feedback)
    # Also check the finish reason to see if the response was blocked.
    candidates = response.candidates
    if candidates and candidates[0].finish_reason != glm.Candidate.FinishReason.STOP:
        print("Prompt Finish reason: ", candidates[0].finish_reason)
        # If the finish reason was SAFETY, the safety ratings have more details.
        print("Prompt Safety Warnings: ", candidates[0].safety_ratings)
    return ''


def iterate_inference(unique_df: pd.DataFrame,
                      model_source: str,
                      model_name: str,
                      temperature: float,
                      prompt_name: str,
                      rot: bool,
                      action: bool,
                      demographic: bool) -> pd.DataFrame:

    """
    Iterates through a pandas dataframe of unique rot or unique action or unique demographic,
    queries the LLM with every prompt variant of each row, and stores the prompts and outputs.

    Parameters:
    unique_df (pd.DataFrame): A pandas dataframe of unique rot or unique action or unique demographic. It is modified in place.
    model_source (str): The llm model_source ('OpenAI', 'gemini', or a Together.ai organisation prefix). In practice this is passed in from get_LLM_annotation
    model_name (str): The llm model_name. In practice this is passed in from get_LLM_annotation
    temperature (float): The llm model temperature. In practice this is passed in from get_LLM_annotation
    prompt_name (str): Name of column with associated worker label information  (e.g., 'rot-agree', 'action-agency').
    rot (bool): Whether to generate prompts that incorporate 'rot' (rule of thumb) from social chemistry dataset.
    action (bool): (NOT IMPLEMENTED PROPERLY YET) Whether to generate prompts that incorporate 'action' from social chemistry dataset. There are thousands of actions in dataset and only 400 rot.
    demographic (bool): Whether to generate prompts that incoporate demographic information. This is information we had to email author of Social Chemistry 101 to get.

    Returns:
    unique_df (pd.DataFrame): The same dataframe with, for every prompt variant, a
    '<prompt_name>-llm-prompt<variant>' column and a '<prompt_name>-llm-prompt-output<variant>' column.
    A request that raises is printed and skipped, leaving its two cells unset.
    """

    if model_source == "gemini":
        # Initialize the model once before the iteration over the rows begin.
        model = genai.GenerativeModel(model_name=model_name,
                                      safety_settings=get_safety_settings(),
                                      generation_config=genai.GenerationConfig(
                                          max_output_tokens=512,
                                          temperature=temperature))
        wait_time = _gemini_wait_time(model_name)

    # Column suffix of each prompt variant, in the order returned by llm_prompt_generator.
    # Old prompts ids: '', '-table', '5-shot'
    prompt_ids = ('-zerop-nodescription',
                  '-zerop-description',
                  '-table-noexample',
                  '-table-example',
                  '-5-shot-embedded',
                  '-5-shot-notembedded')

    for index, row in unique_df.iterrows():
        if index % 5 == 0:
            print(f'index:{index}')

        prompts = llm_prompt_generator(row=row,
                                       prompt_name=prompt_name,
                                       rot=rot,
                                       action=action,
                                       demographic=demographic)

        if prompts == ('', ''):
            print(f'There were no prompts returned for {prompt_name}')
            break

        for prompt_type, prompt in zip(prompt_ids, prompts):

            prompt_json = [{"role": "user", "content": prompt}]
            try:
                # Only Together.ai models require model_source
                if model_source == 'OpenAI':
                    chat_completion = client_openai.chat.completions.create(model=model_name,
                                                                            messages=prompt_json,
                                                                            temperature=temperature,
                                                                            max_tokens=512)
                    prompt_output = chat_completion.choices[0].message.content

                elif model_source == "gemini":
                    response = model.generate_content(prompt)
                    # Pause after every request to stay under the requests-per-minute quota.
                    sleep(wait_time)
                    prompt_output = _gemini_response_text(response, prompt)

                else:
                    chat_completion = client_together.chat.completions.create(model=f'{model_source}/{model_name}',
                                                                              messages=prompt_json,
                                                                              temperature=temperature,
                                                                              max_tokens=512)
                    prompt_output = chat_completion.choices[0].message.content

            except Exception as e:
                # Best effort: report the failure, back off, and move on to the next prompt.
                print(e)
                sleep(10.0)
                continue

            unique_df.loc[index, [f'{prompt_name}-llm-prompt{prompt_type}']] = prompt
            unique_df.loc[index, [f'{prompt_name}-llm-prompt-output{prompt_type}']] = prompt_output

            sleep(0.4)
    return (unique_df)

In [5]:
def get_LLM_annotation(df_data: pd.DataFrame,
                 model_source: str = 'togethercomputer',
                 model_name: str = 'llama-2-70b-chat',
                 temperature: float = 0.0,
                 rot: bool = False,
                 action: bool = False,
                 demographic: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame, None]:
    """
    Utilizes a specified LLM to get label (output) for every prompt variant of each unique rot / action.

    Parameters:
    df_data (pd.DataFrame): The DataFrame containing the all the information to make a prompt to be fed into an LLM. Must have 'rot' and 'action' columns; it is not modified.
    model_source (str, optional): The source of the language model. Default is 'togethercomputer'.
    model_name (str, optional): The name of the language model to use. Default is 'llama-2-70b-chat'.
    temperature (float, optional): The temperature setting for the language model, affecting response variability. Default is 0.0.
    rot (bool, optional): Whether to get rot prompts. Default is False.
    action (bool, optional): Whether to get action prompts. Default is False.
    demographic (bool, optional): Whether to get demographic prompts. Default is False.

    Returns:
    tuple: (unique_rot_df, unique_action_df, None). The two DataFrames hold the de-duplicated 'rot' and
           'action' values plus the prompt / output columns added for the questions that were run.
           The third element is always None.

    Side effects:
    When rot (respectively action) is True, the corresponding DataFrame is written as CSV under
    '../data/llm_prompt_outputs/rot' (respectively '.../action'); the directory is created if missing.
    """
    df_results = df_data.copy()

    today = date.today()
    start_t = time()

    # Questions to annotate. The full list is:
    #   'rot-agree', 'action-agency', 'action-legal',
    #   'rot-categorization', 'rot-moral-foundations', 'action-pressure'
    # Only 'rot-agree' is currently enabled.
    human_col_names = ['rot-agree']

    # Query each distinct rot / action only once: the dataset is 20,000 rows with many duplicates,
    # so this reduces the number of necessary LLM queries.
    unique_rot_df = df_results.loc[:, ['rot']].drop_duplicates(subset=['rot']).reset_index(drop = True)
    unique_action_df = df_results.loc[:, ['action']].drop_duplicates(subset=['action']).reset_index(drop = True)

    for col in human_col_names:
        print(col)

        # Prefix of the column name selects the prompt family ('rot' or 'action').
        rot_action_demo = col.split('-')[0]

        if rot_action_demo == 'rot':
            if rot:
                unique_rot_df = iterate_inference(unique_df = unique_rot_df,
                                                  model_source = model_source,
                                                  model_name = model_name,
                                                  temperature = temperature,
                                                  prompt_name = col,
                                                  rot = True,
                                                  action = False,
                                                  demographic = demographic)
        elif rot_action_demo == 'action':
            if action:
                unique_action_df = iterate_inference(unique_df = unique_action_df,
                                                     model_source = model_source,
                                                     model_name = model_name,
                                                     temperature = temperature,
                                                     prompt_name = col,
                                                     rot = False,
                                                     action = True,
                                                     demographic = demographic)
        else:
            # Column starts with something other than 'rot' or 'action' (e.g. a future demographic case).
            print(f"No case for {col}")

    # Output file name encodes model, date, run time in minutes, temperature and the selected flags.
    time_taken = int((time() - start_t)/60.0)
    temp_str = f"{temperature:.2f}".replace(".", "_")
    rot_str = str(int(rot))
    action_str = str(int(action))
    demographic_str = str(int(demographic))
    date_str = today.strftime("%d_%m_%Y")
    file_name = f'{model_name}_{date_str}_{time_taken}_t{temp_str}_r{rot_str}_a{action_str}_d{demographic_str}.csv'

    if rot:
        rot_dir = '../data/llm_prompt_outputs/rot'
        # Create the folder first so a missing directory cannot discard the results of a finished run.
        os.makedirs(rot_dir, exist_ok=True)
        unique_rot_df.to_csv(f'{rot_dir}/{file_name}', index=False)

    if action:
        action_dir = '../data/llm_prompt_outputs/action'
        os.makedirs(action_dir, exist_ok=True)
        unique_action_df.to_csv(f'{action_dir}/{file_name}', index=False)

    return unique_rot_df, unique_action_df, None

In [12]:
# Run the rule-of-thumb annotation with an OpenAI model at temperature 0.
# Previously used models: 'gpt-4-0613', 'gpt-4-turbo-2024-04-09'

# The returned values are discarded here; with rot=True the function writes the
# rot results to a CSV file itself.
_, _, _ =  get_LLM_annotation(df_data = df_data,
                              model_source = 'OpenAI',
                              model_name = 'gpt-4o-2024-08-06',
                              temperature  = 0.0,
                              rot = True,
                              action = False,
                              demographic = False)

rot-agree
index:0
index:5
index:10
index:15
index:20
index:25
index:30
index:35
index:40
index:45
index:50
index:55
index:60
index:65
index:70
index:75
index:80
index:85
index:90
index:95
index:100
index:105
index:110
index:115
index:120
index:125
index:130
index:135
index:140
index:145
index:150
index:155
index:160
index:165
index:170
index:175
index:180
index:185
index:190
index:195
index:200
index:205
index:210
index:215
index:220
index:225
index:230
index:235
index:240
index:245
index:250
index:255
index:260
index:265
index:270
index:275
index:280
index:285
index:290
index:295
index:300
index:305
index:310
index:315
index:320
index:325
index:330
index:335
index:340
index:345
index:350
index:355
index:360
index:365
index:370
index:375
index:380
index:385
index:390
index:395
