<a href="https://colab.research.google.com/github/cjbarrie/promptstability/blob/main/notebooks/manisfestos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries and Data

In [2]:

! pip install openai
! pip install simpledorff

Collecting openai
  Downloading openai-1.13.3-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m174.1/227.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.4/227.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-p

In [3]:
import pandas as pd
import openai
import numpy as np
import time
import simpledorff
from openai import OpenAI


In [4]:
# Data: upload promptstability/data/UK_Manifestos.csv to the working directory first.
MANIFESTOS_CSV = 'UK_Manifestos.csv'
df = pd.read_csv(MANIFESTOS_CSV)


In [5]:
# Show the loaded manifestos table
df


Unnamed: 0,manifesto_year,content
0,Con1918.txt,1918 Conservative Party General Election Manif...
1,Con1922.txt,1922 Conservative Party General Election Manif...
2,Con1923.txt,1923 Conservative Party General Election Manif...
3,Con1924.txt,1924 Conservative Party General Election Manif...
4,Con1929.txt,1929 Conservative Party General Election Manif...
...,...,...
64,Lib1983.txt,1983 Liberal-SDP Alliance Election Manifesto W...
65,Lib1987.txt,1987 SDP - Liberal Alliance General Election M...
66,Lib1992.txt,1992 Liberal Democrat General Election Manifes...
67,Lib1997.txt,1997 Liberal Democrat General Election Manifes...


In [6]:
# Two-row subset (positions 41 and 44) used for cheap test runs of the functions below.
SMALL_SAMPLE_POSITIONS = [41, 44]
df_small = df.iloc[SMALL_SAMPLE_POSITIONS]


In [7]:
# Show the two-row subset
df_small


Unnamed: 0,manifesto_year,content
41,Lab1983.txt,Foreward Here you can read Labour's plan to d...
44,Lab1997.txt,new Labour because Britain deserves better Bri...


## Set-up: classification function and prompt

In [8]:
import os
from getpass import getpass

# Never hard-code the API key in the notebook: anything committed here is public.
# A key that has ever been embedded in this cell must be treated as leaked and revoked.
# Use the key already present in the environment; otherwise prompt for it without echoing.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [9]:
# Copy the key from the environment onto the openai module (raises KeyError if the variable is unset)
openai.api_key = os.environ['OPENAI_API_KEY']

# Shared defaults for classify_text
CLIENT = OpenAI()  # client instance used as classify_text's default `client`
MODEL = 'gpt-3.5-turbo'  # model name used as classify_text's default `model`
WAIT_TIME = 0.8  # NOTE(review): not referenced in the cells shown here; presumably a pause between API calls; confirm or remove

def classify_text(text, prompt, client = CLIENT, model = MODEL, temperature = 0.1):
  '''
  Classify the given text in the way the prompt instructs, and parse the reply as a number.

  Parameters:
  - text (str): text you want classified (sent as the user message)
  - prompt (str): the classification prompt/instruction (sent as the system message)
  - client (class): instance of OpenAI class, used to make the API call
  - model: which model carries out the classification task
  - temperature (float): how deterministic (low number) vs. random (higher number) the results should be

  Returns:
  - float: the model's reply converted with float()
  - None: when the reply is missing or cannot be converted to a float

  Errors raised by the API call itself are not caught here and propagate to the caller.
  '''
  response = client.chat.completions.create(
      model = model,
      temperature = temperature,
      messages = [
          {"role": "system", "content": f"'{prompt}'"}, # The system instruction tells the bot how it is supposed to behave
          {"role": "user", "content": f"'{text}'"} # This provides the text to be analyzed
      ]
  )

  raw_answer = response.choices[0].message.content
  try:
    result = float(raw_answer)
  except (TypeError, ValueError):
    # ValueError: the reply is not a valid float literal.
    # TypeError: the reply carries no text at all (content is None).
    result = None

  return result


In [10]:
# Prompt
#prompt = "The two texts provided are UK manifestos. Your task is to compare these two documents and tell me which one is more left-wing. Please only return the title of the document which you consider more left wing and nothing else."
# Adjacent string literals are joined with no separator, so each fragment that
# continues a sentence must end with an explicit space.
prompt = (
    "The text provided is a UK manifesto. Please evaluate it and give it a score between "
    "0 and 10 depending on how left-wing or right-wing it is. 0 corresponds to most left-wing. 10 "
    "corresponds to most right-wing. 5 corresponds to center. "
    "Your answer should only consist of the float number and nothing else. Please do not include anything else. Thank you!"
)
# Possible addition to the prompt: If you don't know the answer please respond with the string 'DK'.
#prompt = "The text provided is a UK manifesto. Please evaluate it and give it a score between 1 and 10 depending on how left-wing or right-wing it is. 1 corresponds to most left-wing. 10 corresponds to most right-wing. Your answer should only consist of this number and nothing else. Thank you!"


In [11]:
# Truncate text
MAX_TOKENS = 4096  # Default cut-off used by truncate_text


def truncate_text(text, max_tokens=MAX_TOKENS):
    """
    Return the leading part of `text`, cut with a plain slice.

    Parameters:
    - text: the value to shorten; for a str the slice counts characters, not model tokens
    - max_tokens (int): end position of the slice (defaults to MAX_TOKENS)

    Returns:
    - the first `max_tokens` items of `text`; shorter inputs come back unchanged
    """
    clipped = text[0:max_tokens]
    return clipped


## Initial classification  (ignore)

In [None]:
# Classification loop: score each manifesto in the small sample once and print the score
for _, manifesto in df_small.iterrows():
    # Shorten the manifesto text before sending it to the model
    clipped_content = truncate_text(manifesto['content'])
    print(classify_text(clipped_content, prompt))


2.5
3.5


In [None]:
# Create the left-right column 'l_r_0' holding one classification score per manifesto
def _score_manifesto(row):
    """Classify the truncated content of one manifesto row with the current prompt."""
    return classify_text(truncate_text(row['content']), prompt)

df['l_r_0'] = df.apply(_score_manifesto, axis=1)
df


Unnamed: 0,manifesto_year,content,l_r_0
0,Con1918.txt,1918 Conservative Party General Election Manif...,8.0
1,Con1922.txt,1922 Conservative Party General Election Manif...,8.0
2,Con1923.txt,1923 Conservative Party General Election Manif...,9.0
3,Con1924.txt,1924 Conservative Party General Election Manif...,9.0
4,Con1929.txt,1929 Conservative Party General Election Manif...,8.0
...,...,...,...
64,Lib1983.txt,1983 Liberal-SDP Alliance Election Manifesto W...,3.5
65,Lib1987.txt,1987 SDP - Liberal Alliance General Election M...,3.5
66,Lib1992.txt,1992 Liberal Democrat General Election Manifes...,3.5
67,Lib1997.txt,1997 Liberal Democrat General Election Manifes...,3.5


In [None]:
# Show the first 50 rows of df
df.head(50)


Unnamed: 0,manifesto_year,content,l_r_0
0,Con1918.txt,1918 Conservative Party General Election Manif...,8.0
1,Con1922.txt,1922 Conservative Party General Election Manif...,8.0
2,Con1923.txt,1923 Conservative Party General Election Manif...,9.0
3,Con1924.txt,1924 Conservative Party General Election Manif...,9.0
4,Con1929.txt,1929 Conservative Party General Election Manif...,8.0
5,Con1931.txt,1931 Conservative Party General Election Manif...,8.0
6,Con1935.txt,1935 Conservative Party General Election Manif...,8.0
7,Con1945.txt,1945 Conservative Party General Election Mani...,7.5
8,Con1950.txt,1950 Conservative Party General Election Manif...,9.0
9,Con1951.txt,1951 Conservative Party General Election Manif...,9.0


## Repeat Prompt Function + KA: Inter-coder reliability

In [12]:
# Test on the small DataFrame: show the rows it contains
df_small


Unnamed: 0,manifesto_year,content
41,Lab1983.txt,Foreward Here you can read Labour's plan to d...
44,Lab1997.txt,new Labour because Britain deserves better Bri...


In [13]:
def repeat_prompt_ka(df, text_id_col, text_content_col, prompt, num_repetitions=10, max_tokens=MAX_TOKENS):
    """
    Repeat a classification task, holding the prompt constant, and add one column per repetition.
    Then compute Krippendorff's alpha (KA) separately over the repeated classifications of each text.

    Parameters:
    - df (pd.DataFrame): the dataframe containing the text (left unmodified; a copy is returned)
    - text_id_col (str): df column that contains the text id
    - text_content_col (str): df column that contains the text content
    - prompt (str): the classification prompt/instruction
    - num_repetitions (int): number of repetitions of the same prompt
    - max_tokens (int): cut-off passed to truncate_text before classification

    Returns:
    - df_copy (pd.DataFrame): copy of df with columns class_1..class_<num_repetitions> and a
      ka_score column. ka_score is pd.NA for a text whose alpha could not be computed.

    NOTE(review): alpha is computed per text, so every group contains a single unit. Confirm
    that this is the intended reliability measure; alpha is normally computed across units.
    """
    df_copy = df.copy()  # Work on a copy so the caller's DataFrame is untouched

    # Truncation does not depend on the repetition, so do it once up front
    truncated_texts = df_copy[text_content_col].apply(lambda text: truncate_text(text, max_tokens))

    # Repeat prompt part: one classification column per repetition
    class_cols = []
    for i in range(1, num_repetitions + 1):
        column_name = f'class_{i}'
        df_copy[column_name] = truncated_texts.apply(lambda text: classify_text(text, prompt))
        class_cols.append(column_name)

    # Melt to long format for KA ('text_id', 'annotator', 'class' columns).
    # Only the classification columns created above are melted, so the result does not
    # depend on the position or number of the other columns in df.
    df_copy_long = df_copy.melt(id_vars=[text_id_col],
                                value_vars=class_cols,
                                var_name='annotator',
                                value_name='class')

    # Function to calculate KA; failures are reported and turned into pd.NA (best effort)
    def calculate_alpha(long_df, experiment_col=text_id_col, annotator_col='annotator', class_col='class'):
        try:
            return simpledorff.calculate_krippendorffs_alpha_for_df(
                long_df,
                metric_fn=simpledorff.metrics.interval_metric,
                experiment_col=experiment_col,
                annotator_col=annotator_col,
                class_col=class_col
            )
        except ZeroDivisionError:
            print("Error calculating alpha: Division by zero")
            return pd.NA
        except Exception as e:
            print(f"Error calculating alpha: {e}")
            return pd.NA

    # One KA value per text id, computed on that text's group of repeated classifications
    ka_by_text = {
        text_id: calculate_alpha(group_df)
        for text_id, group_df in df_copy_long.groupby(text_id_col)
    }

    # Attach each text's KA score with a single keyed lookup instead of a per-row list scan
    df_copy['ka_score'] = df_copy[text_id_col].map(ka_by_text)

    return df_copy


In [14]:
# Example usage: ten repeated classifications (the default) of the two-row sample
df_result = repeat_prompt_ka(
    df=df_small,
    text_id_col='manifesto_year',
    text_content_col='content',
    prompt=prompt,
)
print(df_result)

   manifesto_year                                            content  class_1  \
41    Lab1983.txt  Foreward  Here you can read Labour's plan to d...      2.5   
44    Lab1997.txt  new Labour because Britain deserves better Bri...      3.5   

    class_2  class_3  class_4  class_5  class_6  class_7  class_8  class_9  \
41      2.5      2.0      2.5      2.5      2.5      2.5      2.5      2.5   
44      3.0      3.5      3.5      3.0      3.0      3.5      3.0      3.5   

    class_10  ka_score  
41       2.5       0.0  
44       3.0       0.0  


In [15]:
# Show the result table returned by repeat_prompt_ka
df_result

Unnamed: 0,manifesto_year,content,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8,class_9,class_10,ka_score
41,Lab1983.txt,Foreward Here you can read Labour's plan to d...,2.5,2.5,2.0,2.5,2.5,2.5,2.5,2.5,2.5,2.5,0.0
44,Lab1997.txt,new Labour because Britain deserves better Bri...,3.5,3.0,3.5,3.5,3.0,3.0,3.5,3.0,3.5,3.0,0.0


# Similar prompt (embeddings)

# One function for all (repeat prompt, create similar prompts compare intra and inter)

# Tests