In [1]:
import pandas as pd
import openai
import json
from collections import Counter
import os
from dotenv import load_dotenv
import re
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
    retry_if_exception_type
)  # for exponential backoff


In [2]:
#Uploading the experiment data used
packages_df = pd.read_csv('upworthy_experiment_packages.csv', low_memory=False)
packages_df = packages_df.drop(columns=['Unnamed: 0', 'Unnamed: 1'])

In [51]:
# import persona data
persona_df = pd.read_csv('../agents/persona_data/2023-08-22 00:17:08_sample_population_312.csv')
persona_df = persona_df.drop(columns=['Unnamed: 0'])


In [4]:
packages_df.head()

Unnamed: 0,created_at,updated_at,clickability_test_id,excerpt,headline,lede,slug,eyecatcher_id,impressions,clicks,significance,first_place,winner,share_text,square,test_week,created
0,2014-11-20 11:33:26.475,2016-04-02 16:25:54.046,546dd17e26714c82cc00001c,Things that matter. Pass 'em on.,"Let’s See … Hire Cops, Pay Teachers, Buy Books...",<p>Iff you start with the basic fact that inno...,let-s-see-hire-cops-pay-teachers-buy-books-for...,546dce659ad54ec65b000041,3118,8,0.1,False,False,,,201446,2014-11-20 11:33:26.475
1,2014-11-20 15:00:01.032,2016-04-02 16:25:54.128,546e01d626714c6c4400004e,Things that matter. Pass 'em on.,People Sent This Lesbian Questions And Her Rai...,<p>I'll be honest. I've wondered about 7.</p>,people-sent-this-lesbian-questions-and-her-rai...,546d1b4bfd3617f091000041,4587,130,55.8,False,False,,,201446,2014-11-20 15:00:01.032
2,2014-11-20 11:33:51.973,2016-04-02 16:25:54.069,546dd17e26714c82cc00001c,Things that matter. Pass 'em on.,$3 Million Is What It Takes For A State To Leg...,<p>Iff you start with the basic fact that inno...,3-million-is-what-it-takes-for-a-state-to-lega...,546dce659ad54ec65b000041,3017,19,26.9,False,False,,,201446,2014-11-20 11:33:51.973
3,2014-11-20 11:34:12.107,2016-04-02 16:25:54.049,546dd17e26714c82cc00001c,Things that matter. Pass 'em on.,The Fact That Sometimes Innocent People Are Ex...,<p>Iff you start with the basic fact that inno...,the-fact-that-sometimes-innocent-people-are-ex...,546dce659ad54ec65b000041,2974,26,100.0,True,False,,,201446,2014-11-20 11:34:12.107
4,2014-11-20 11:34:33.935,2016-04-02 16:25:54.072,546dd17e26714c82cc00001c,Things that matter. Pass 'em on.,Reason #351 To End The Death Penalty: It Costs...,<p>Iff you start with the basic fact that inno...,reason-351-to-end-the-death-penalty-it-costs-3...,546dce659ad54ec65b000041,3050,10,0.2,False,False,,,201446,2014-11-20 11:34:33.935


In [5]:
persona_df.head()

Unnamed: 0,Race,Gender,Age,Income,Degree,Community Type,Marital Status,Name,Backstory,Preferences
0,African American,Male,30-49,$75K-$100K,Bachelor's or higher,Urban,Living with partner,Terrence Roberts,Terrence Roberts grew up in a close-knit Afric...,1. Personal Finance: Terrence might be interes...
1,White,Female,30-49,$50K-$75K,High School,Urban,Married,Heather Martinez,"Heather Martinez, a white woman in her mid-thi...",1. DIY Home Improvement: Heather might be inte...
2,African American,Male,50-64,Under $20K,High School,Rural,Married,Andre Williams,"Andre Williams, a 55-year-old African American...","1. ""Tips for small-scale farming and sustainab..."
3,White,Male,18-29,Over $100K,Bachelor's or higher,Rural,Married,Andrew Nelson,Andrew Nelson is a driven and ambitious young ...,"1. ""Top investment strategies for young profes..."
4,African American,Female,18-29,$75K-$100K,Bachelor's or higher,Suburban,Never married,Jasmine Davis,"Jasmine Davis, a vibrant and driven young woma...",1. Career advancement and professional develop...


## Experiment
- Sample n number of clicckable_ids where each id has an m number of headlines associated with it
- Filter for ids which have more than two headlines
- Generate x personas for each headline
- Generate responses for those - where 0 is the persona wouldnot click and 1 is the persona would click
- Filter/remove any results for which CHatGPT_request raised an error

In [73]:

load_dotenv()

# Set OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")


def ChatGPT_request(prompt, temp, model="gpt-3.5"):
  """
  Given a prompt and a dictionary of GPT parameters, make a request to OpenAI
  server and returns the response.
  ARGS:
    prompt: a str prompt
  RETURNS:
    a str of GPT-3's response.
  """

  if model == "gpt-3.5":
    model = "gpt-3.5-turbo"
  elif model == "gpt-4":
    model = "gpt-4"
  else:
    model = "gpt-3.5-turbo"
  # temp_sleep()
  try:
    completion = openai.ChatCompletion.create(
    model=model,
    temperature=temp,
    messages=[
        {
            "role": "user",
            "content": prompt
        }
      ]
    )
    return completion["choices"][0]["message"]["content"]

  except:
    print ("ChatGPT ERROR")
    return "ChatGPT ERROR"


def persona_ChatGPT_request(prompt, persona_system_message):
  """
  Given a prompt and a dictionary of GPT parameters, make a request to OpenAI
  server and returns the response.
  ARGS:
    prompt: a str prompt
    persona_system_message: a str of persona system message
  RETURNS:
    a str of GPT-3's response.
  """
  # temp_sleep()
  try:
    completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "system",
            "content": persona_system_message
        },
        {
            "role": "user",
            "content": prompt
        }
      ]
    )
    return completion["choices"][0]["message"]["content"]

  except:
    print ("ChatGPT ERROR")
    return "ChatGPT ERROR"


def ChatGPT_safe_generate_response(prompt,
                                   example_output,
                                   special_instruction,
                                   repeat=3,
                                   fail_safe_response="error",
                                   func_validate=None,
                                   func_clean_up=None,
                                   verbose=False):
  # prompt = 'GPT-3 Prompt:\n"""\n' + prompt + '\n"""\n'
  prompt = '"""\n' + prompt + '\n"""\n'
  prompt += f"Output the response to the prompt above in json. {special_instruction}\n"
  prompt += "Example output json:\n"
  prompt += '{"output": "' + str(example_output) + '"}'

  if verbose:
    print ("CHAT GPT PROMPT")
    print (prompt)

  for i in range(repeat):

    try:
      curr_gpt_response = ChatGPT_request(prompt).strip()
      end_index = curr_gpt_response.rfind('}') + 1
      curr_gpt_response = curr_gpt_response[:end_index]
      curr_gpt_response = json.loads(curr_gpt_response)["output"]

      if func_validate(curr_gpt_response, prompt=prompt):
        return func_clean_up(curr_gpt_response, prompt=prompt)

      if verbose:
        print ("---- repeat count: \n", i, curr_gpt_response)
        print (curr_gpt_response)
        print ("~~~~")

    except:
      pass

  return False

def format_paragraph(paragraph):
    sentences = re.split(r'(?<=[.!?])\s+', paragraph)
    formatted_paragraph = '\n'.join(sentences)
    # wrap with triple quotes
    return formatted_paragraph

In [74]:
persona_df.head()

Unnamed: 0,Race,Gender,Age,Income,Degree,Community Type,Marital Status,Name,Backstory,Preferences,Threshold
0,African American,Male,30-49,$75K-$100K,Bachelor's or higher,Urban,Living with partner,Terrence Roberts,Terrence Roberts grew up in a close-knit Afric...,1. Personal Finance: Terrence might be interes...,71.079898
1,White,Female,30-49,$50K-$75K,High School,Urban,Married,Heather Martinez,"Heather Martinez, a white woman in her mid-thi...",1. DIY Home Improvement: Heather might be inte...,69.160341
2,African American,Male,50-64,Under $20K,High School,Rural,Married,Andre Williams,"Andre Williams, a 55-year-old African American...","1. ""Tips for small-scale farming and sustainab...",70.264281
3,White,Male,18-29,Over $100K,Bachelor's or higher,Rural,Married,Andrew Nelson,Andrew Nelson is a driven and ambitious young ...,"1. ""Top investment strategies for young profes...",66.277746
4,African American,Female,18-29,$75K-$100K,Bachelor's or higher,Suburban,Never married,Jasmine Davis,"Jasmine Davis, a vibrant and driven young woma...",1. Career advancement and professional develop...,80.0575


In [75]:
def generate_response_from_headline_synthetic_threshold(sample_individual, headline, preferences = False):

    """
    :param sample_individual: details of the persona
    :param headline: news headline to which persona is exposed
    :param preferences: optional, if True, the persona's preferences will be included in the prompt
    :return: 0 or 1, 0 if persona would not click on the headline, 1 if persona would click on the headline
    """

    if preferences:
        prompt = (
            f"Based on the following person who reads news online and their backstory and preferences,"
            f"predict if they would  click on the article with a the headline below, return your answer as 1 if yes and 0 if no\n"
            f"Backstory: {sample_individual['Backstory']}\n"
            f"Preferences: {sample_individual['Preferences']}\n"
            f"Headline: {headline}\n"
            f" response <integer>"
        )
    else:
        prompt = (
            f"Based on the following person who reads news online and their backstory,"
            f"predict if they would  click on the article with a the headline below, return your answer "
            f"on a scale between 0 - 100, where 0 is definitely not click and 100 is definitely click\n"
            f"Backstory: {sample_individual['Backstory']}\n"
            f"Headline: {headline}\n"
            f" response <integer>"
        )

    return prompt

sample_individual = persona_df.iloc[0]
sample_headline = "TRUMP: Im not going to fire Mueller"
backstory_prompt = generate_response_from_headline_synthetic_threshold(sample_individual, headline=sample_headline)
backstory_response = ChatGPT_request(backstory_prompt, temp=0, model="gpt-3.5")

In [76]:
backstory_response

'70'

In [10]:
## Sample n responses to see if response changes or not

# n_responses = 100
# sample_responses = []
# for i in range(n_responses):
#     print (i)
#     sample_responses.append(ChatGPT_request(backstory_prompt, temp=0, model="gpt-3.5"))


In [11]:
# sample_responses = [int(x) for x in sample_responses]
# # Count responses
# Counter(sample_responses)


In [12]:
# Function to generate a persona and get a binary response
@retry(
    retry=retry_if_exception_type((openai.error.APIError, openai.error.APIConnectionError, openai.error.RateLimitError, openai.error.ServiceUnavailableError, openai.error.Timeout)), 
    wait=wait_random_exponential(multiplier=1, max=60), 
    stop=stop_after_attempt(10)
)
def generate_response_from_headline_general(headline):
    """
    Generating response for the average persona
    :param headline:
    :return:
    """

    click_response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
        {"role": "system", "content": "For a typical news reader in USA predict if they would \
        click on the article, return your answer as 1 if yes and 0 if no "},
        {"role": "user", "content": 
         "Headline:{} \n \
         response <integer>".format(headline)},
    ])
    
    
    return str(click_response['choices'][0]['message']['content'])

AK to fix this - forgot to add random seed for initial sampling of headlines read sampled ids from results to regenerate sampled_df for latest run

In [13]:
sample_ids_latest = pd.read_csv('pivot_table_results.csv')['clickability_test_id'].unique()

In [116]:
# ID Filter for packages which have at least 2 unique headlines
valid_ids = packages_df.groupby('clickability_test_id').filter(lambda x: x['headline'].nunique() >= 2)['clickability_test_id'].unique()

# Sample 50 unique clickability_test_id -> use for a new run
sample_packages = 2
sampled_ids = pd.Series(valid_ids).sample(sample_packages, random_state=123)

# Recalling from current results
# sampled_ids = sample_ids_latest

# Fetch all rows associated with these sampled IDs
sampled_df = packages_df[packages_df['clickability_test_id'].isin(sampled_ids)]


In [83]:
sampled_df.shape

(10, 17)

## Generate responses for average US News Reader

In [84]:
# results = []
# i=0
# num_personas = 3
# for _, row in sampled_df.iterrows():
#     for _ in range(num_personas):
#         response = generate_response_from_headline_general(row['headline'])
#         results.append({
#             'clickability_test_id': row['clickability_test_id'],
#             'headline': row['headline'],
#             'response': response
#         })
#         i+=1
#         print(i)
#
# results_df_avg = pd.DataFrame(results)



## Generate responses for synthetic personas

In [85]:
# For each persona randomly sample a threshold for user preference
# randomly sample
import numpy as np
np.random.RandomState(123)
persona_df['Threshold'] = np.random.normal(70, 7, size=persona_df.shape[0])

In [86]:
persona_df['Threshold'].describe()

count    312.000000
mean      69.971023
std        7.391832
min       41.490436
25%       64.959053
50%       69.737544
75%       74.916942
max       94.591035
Name: Threshold, dtype: float64

### Generate responses for synthetic personas

In [87]:
# Either run to generate new synthetic data or load from csv for a previous run

# results_df_synth = pd.read_csv('synthetic_responses_latest.csv')


In [118]:
# define model
model_used = "gpt-3.5"

In [88]:
# Run to generate new synthetic data
results = []
i=0
num_personas = 2

for _, row in sampled_df.iterrows():
    # Making a copy which ensures that we don't sample the same persona twice for each testable id
    persona_df_copy = persona_df.copy()
    sample_personas = persona_df_copy.sample(num_personas, replace=False)
    for _,sample_persona in sample_personas.iterrows():
        headline = row['headline']
        prompt = generate_response_from_headline_synthetic_threshold(sample_persona, headline, preferences = False)
        response = ChatGPT_request(prompt, temp=0, model=model_used)
        results.append({
            'clickability_test_id': row['clickability_test_id'],
            'headline': row['headline'],
            'response': response,
            'persona': sample_persona['Name']
        })
        i+=1
        print(i)

results_df_synth = pd.DataFrame(results)



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [89]:
results_df_synth = results_df_synth.rename(columns={'response':'response_synthetic'})
# filter for synthetic responses
results_df_synth = results_df_synth[results_df_synth['response_synthetic']!='ChatGPT ERROR']

In [90]:
results_df_synth

Unnamed: 0,clickability_test_id,headline,response_synthetic,persona
0,55242e04646338002c7d0000,"Funny, endearing stories about people's first ...",50,Isabel Vargas
1,55242e04646338002c7d0000,"Funny, endearing stories about people's first ...",50,Ricardo Silva
2,55242e04646338002c7d0000,"'Oh, $^&amp;t! This is where I'm gonna die!' E...",50,Caleb Thompson
3,55242e04646338002c7d0000,"'Oh, $^&amp;t! This is where I'm gonna die!' E...",50,Gabriela Andrade
4,55242e04646338002c7d0000,These hilarious stories are actually perfect t...,50,Henry Roberts
5,55242e04646338002c7d0000,These hilarious stories are actually perfect t...,70,Gloria Morales
6,55242e04646338002c7d0000,"Ah, the common theme for anyone with a vagina....",10,Richard Thomas
7,55242e04646338002c7d0000,"Ah, the common theme for anyone with a vagina....",20,Alicia Velazquez
8,55242e04646338002c7d0000,"Wadded paper products, clueless parents -- wha...",10,Thomas Hill
9,55242e04646338002c7d0000,"Wadded paper products, clueless parents -- wha...",20,Curtis Parker


In [95]:
results_df_synth['response_synthetic'] = results_df_synth['response_synthetic'].astype('int')
# results_df_synth.to_csv('synthetic_responses_latest.csv')

results_df_synth_v2 = results_df_synth.merge(persona_df[['Name','Threshold']], left_on='persona', right_on='Name')

In [99]:
results_df_synth_v2['response_synthetic_com'] = results_df_synth_v2['response_synthetic']>results_df_synth_v2['Threshold']
results_df_synth_v2['response_synthetic_com'] = results_df_synth_v2['response_synthetic_com'].apply(lambda x: 1 if x else 0)

In [101]:
results_df_synth_v2 = results_df_synth_v2.drop(columns=['Name', 'response_synthetic' ]).rename(columns={'response_synthetic_com':'response_synthetic'})

In [102]:
results_df_synth_v2

Unnamed: 0,clickability_test_id,headline,persona,Threshold,response_synthetic
0,55242e04646338002c7d0000,"Funny, endearing stories about people's first ...",Isabel Vargas,68.051092,0
1,55242e04646338002c7d0000,"Funny, endearing stories about people's first ...",Ricardo Silva,64.38331,0
2,55242e04646338002c7d0000,"'Oh, $^&amp;t! This is where I'm gonna die!' E...",Caleb Thompson,62.883083,0
3,55242e04646338002c7d0000,"'Oh, $^&amp;t! This is where I'm gonna die!' E...",Gabriela Andrade,64.784581,0
4,55242e04646338002c7d0000,These hilarious stories are actually perfect t...,Henry Roberts,67.405741,0
5,55242e04646338002c7d0000,These hilarious stories are actually perfect t...,Gloria Morales,62.908424,1
6,55242e04646338002c7d0000,"Ah, the common theme for anyone with a vagina....",Richard Thomas,69.8605,0
7,55242e04646338002c7d0000,"Ah, the common theme for anyone with a vagina....",Alicia Velazquez,71.833328,0
8,55242e04646338002c7d0000,"Wadded paper products, clueless parents -- wha...",Thomas Hill,75.2503,0
9,55242e04646338002c7d0000,"Wadded paper products, clueless parents -- wha...",Curtis Parker,74.577418,0


In [107]:
# Generate response rates for synthetic personas
ctr_df_synthetic = results_df_synth_v2.groupby(['clickability_test_id', 'headline']
                                     ).sum()/results_df_synth.groupby(['clickability_test_id', 'headline']).count()
ctr_df_synthetic = ctr_df_synthetic.reset_index()
ctr_df_synthetic['response_synthetic'] = ctr_df_synthetic['response_synthetic']*100

  ctr_df_synthetic = results_df_synth_v2.groupby(['clickability_test_id', 'headline']


In [108]:
ctr_df_synthetic

Unnamed: 0,clickability_test_id,headline,Threshold,persona,response_synthetic
0,549898663964620015010000,I'll Bet You've Put Yourself In A Really Bad P...,,,50.0
1,549898663964620015010000,Oh Look. Another Way Big Corporations Are Tota...,,,50.0
2,549898663964620015010000,Surprise! Big Corporations Are Breaking Laws A...,,,50.0
3,549898663964620015010000,These 3 People Were Totally Screwed Over By So...,,,0.0
4,55242e04646338002c7d0000,"'Oh, $^&amp;t! This is where I'm gonna die!' E...",,,0.0
5,55242e04646338002c7d0000,"Ah, the common theme for anyone with a vagina....",,,0.0
6,55242e04646338002c7d0000,"Funny, endearing stories about people's first ...",,,0.0
7,55242e04646338002c7d0000,Painfully hilarious and embarrassing stories o...,,,0.0
8,55242e04646338002c7d0000,These hilarious stories are actually perfect t...,,,50.0
9,55242e04646338002c7d0000,"Wadded paper products, clueless parents -- wha...",,,0.0


In [109]:
# Generate response rates for original data
filter_cols = ['clickability_test_id', 'headline','impressions', 'clicks']
sampled_df_2 = sampled_df[filter_cols]
sampled_df_2['clicks']= sampled_df_2['clicks'].astype('int')
sampled_df_2['impressions']= sampled_df_2['impressions'].astype('int')
sampled_df_2['response_original'] = 100*(sampled_df_2['clicks']/sampled_df_2['impressions'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df_2['clicks']= sampled_df_2['clicks'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df_2['impressions']= sampled_df_2['impressions'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df_2['response_original'] = 100*(sampled_df_2['clicks']/sampled_d

In [110]:
# Merge results for synthetic and original data
merged_results = ctr_df_synthetic.merge(sampled_df_2, on=['clickability_test_id', 'headline'],
                                       how='inner')

In [111]:
merged_results.sort_values(['clickability_test_id', 'headline'])

Unnamed: 0,clickability_test_id,headline,Threshold,persona,response_synthetic,impressions,clicks,response_original
0,549898663964620015010000,I'll Bet You've Put Yourself In A Really Bad P...,,,50.0,3086,41,1.328581
1,549898663964620015010000,Oh Look. Another Way Big Corporations Are Tota...,,,50.0,3002,28,0.932712
2,549898663964620015010000,Surprise! Big Corporations Are Breaking Laws A...,,,50.0,3007,18,0.598603
3,549898663964620015010000,These 3 People Were Totally Screwed Over By So...,,,0.0,3086,26,0.842515
4,55242e04646338002c7d0000,"'Oh, $^&amp;t! This is where I'm gonna die!' E...",,,0.0,2043,29,1.419481
5,55242e04646338002c7d0000,"Ah, the common theme for anyone with a vagina....",,,0.0,1971,31,1.572806
6,55242e04646338002c7d0000,"Funny, endearing stories about people's first ...",,,0.0,2039,25,1.226091
7,55242e04646338002c7d0000,Painfully hilarious and embarrassing stories o...,,,0.0,2038,48,2.35525
8,55242e04646338002c7d0000,These hilarious stories are actually perfect t...,,,50.0,2037,49,2.405498
9,55242e04646338002c7d0000,"Wadded paper products, clueless parents -- wha...",,,0.0,2038,28,1.373896


In [113]:
# Not sure if pivot table is needed anymore..

# load from latest results if needed
# pivot_table = pd.read_csv('pivot_table_results.csv')


pivot_table = pd.pivot_table(merged_results, index = ['clickability_test_id', 'headline']).reset_index()

In [114]:
pivot_table

Unnamed: 0,clickability_test_id,headline,clicks,impressions,response_original,response_synthetic
0,549898663964620015010000,I'll Bet You've Put Yourself In A Really Bad P...,41,3086,1.328581,50.0
1,549898663964620015010000,Oh Look. Another Way Big Corporations Are Tota...,28,3002,0.932712,50.0
2,549898663964620015010000,Surprise! Big Corporations Are Breaking Laws A...,18,3007,0.598603,50.0
3,549898663964620015010000,These 3 People Were Totally Screwed Over By So...,26,3086,0.842515,0.0
4,55242e04646338002c7d0000,"'Oh, $^&amp;t! This is where I'm gonna die!' E...",29,2043,1.419481,0.0
5,55242e04646338002c7d0000,"Ah, the common theme for anyone with a vagina....",31,1971,1.572806,0.0
6,55242e04646338002c7d0000,"Funny, endearing stories about people's first ...",25,2039,1.226091,0.0
7,55242e04646338002c7d0000,Painfully hilarious and embarrassing stories o...,48,2038,2.35525,0.0
8,55242e04646338002c7d0000,These hilarious stories are actually perfect t...,49,2037,2.405498,50.0
9,55242e04646338002c7d0000,"Wadded paper products, clueless parents -- wha...",28,2038,1.373896,0.0


In [115]:
# Save results

from datetime import datetime

# Save data
current_time = datetime.now()
# Format the date and time
formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
folder_path = f"results/{formatted_time}"
os.mkdir(folder_path)

# Save results
pivot_table.to_csv(f"results/{formatted_time}/merged_results.csv")
persona_df.to_csv(f"results/{formatted_time}/persona_df.csv")
results_df_synth.to_csv(f"results/{formatted_time}/results_df_synth.csv")
sampled_df.to_csv(f"results/{formatted_time}/sampled_df.csv")


In [117]:
sample_packages

2

In [119]:
# Save project description as .txt file
experiment_type = 'Persona Preference Threshold'
project_description = f"Experiment Type: {experiment_type}\n" \
                      f"Sample Packages: {sample_packages}\n" \
                        f"Number of Personas: {num_personas}\n" \
                        f"Model Used: {model_used}\n" \

with open(f"results/{formatted_time}/project_description.txt", "w") as text_file:
    text_file.write(project_description)

In [66]:
pivot_table.to_csv('pivot_table_results.csv', index=False)

In [32]:
pivot_table = pd.read_csv('pivot_table_results.csv')

## Analyse results

In [33]:
# Rank headlines for each clickablity_test_id

In [35]:
# Calculate response rate for both response_original and response_synthetic
pivot_table['response_rate_original'] = (pivot_table['clicks'] / pivot_table['impressions']) * 100
pivot_table['response_rate_synthetic'] = pivot_table['response_synthetic']

# Rank the headlines based on response rate for both sets of responses within each clickability_test_id
pivot_table['rank_original'] = pivot_table.groupby('clickability_test_id')['response_rate_original'].rank(ascending=False, method='min')
pivot_table['rank_synthetic'] = pivot_table.groupby('clickability_test_id')['response_rate_synthetic'].rank(ascending=False, method='min')

# Display the updated data with ranks
pivot_table[['clickability_test_id', 'headline', 'rank_original', 'rank_synthetic']].head()
pivot_table['top_match'] = (pivot_table['rank_original'] == 1) & (pivot_table['rank_synthetic'] == 1)


In [46]:
from sklearn.metrics import roc_auc_score

# Create binary labels: 1 for top-ranked headlines based on original response, 0 otherwise
y_true = (pivot_table['rank_original'] == 1).astype(int)

# Use ranks based on synthetic response as predicted probabilities. Invert ranks so that higher rank has higher value.
y_score = 1 / pivot_table['rank_synthetic']

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_true, y_score)
roc_auc

0.568622754491018