# Import
---

In [1]:
# suppress warnings
import warnings
warnings.filterwarnings('ignore')

# reload all modules every time before executing the Python code
%load_ext autoreload 
%autoreload 2
%matplotlib inline
import os
import sys

print(f'default sys.path: {sys.path}')
# Probably not needed for pycharm but needed for vscode -----------------------------------
PROJ_ROOT = os.path.abspath(os.path.join(os.pardir))
sys.path.append(PROJ_ROOT)
print(f'Project root: {PROJ_ROOT}')
# Probably not needed for pycharm but needed for vscode -----------------------------------

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from datasets import load_dataset
from collections import defaultdict
from dotenv import load_dotenv
load_dotenv()

from langdetect import detect, detect_langs, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import json
from typing import List, Dict, Union
from collections import Counter

default sys.path: ['/Users/uqpberna/miniconda3/envs/personas/lib/python312.zip', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12/lib-dynload', '', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12/site-packages', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12/site-packages/setuptools/_vendor']
Project root: /Users/uqpberna/Documents/Code/Mapping_and_Influencing_LLMs_Political_Leaning/extension/src


---
---
# 1 - <u>Clean the persona descriptions</u>
---
---

#### Load data from the Face
----

In [160]:
# Load the PersonaHub dataset
personas = load_dataset("proj-persona/PersonaHub", "persona")
# Fix: Access the data correctly - the dataset structure is different
personas_list = personas['train']['persona']  # This gets the list of personas directly
print(personas_list[0])
print(f"Number of Personas: {len(personas_list)}")

# Load the political compass statements
statements = pd.read_json("../../data/raw/political_compass_statements.json")['statements']
print(f"Statements list contains {len(statements)} statements")

print()
print(personas_list[0])
print(statements[0])

Using the latest cached version of the dataset since proj-persona/PersonaHub couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'persona' at /Users/uqpberna/.cache/huggingface/datasets/proj-persona___persona_hub/persona/0.0.0/ec274c89b6be2d292d69f4705e3ad92c8a26e935 (last modified on Thu Feb 27 16:21:18 2025).


A Political Analyst specialized in El Salvador's political landscape.
Number of Personas: 200000
Statements list contains 62 statements

A Political Analyst specialized in El Salvador's political landscape.
If economic globalisation is inevitable, it should primarily serve humanity rather than the interests of trans-national corporations.


In [161]:
data = []
for persona_id, persona in tqdm(enumerate(personas_list), desc=f"Processing persona", total=len(personas_list)):
    for statement_id, statement in enumerate(statements):
        
        record = {
            'statement_id': statement_id,
            'statement': statement,
            'persona_id': persona_id,
            'persona': persona,
        }
        
        data.append(record)

df = pd.DataFrame(data)

Processing persona:   0%|          | 0/200000 [00:00<?, ?it/s]

In [156]:
df.head()

Unnamed: 0,statement_id,statement,persona_id,persona
0,0,"If economic globalisation is inevitable, it sh...",0,A Political Analyst specialized in El Salvador...
1,1,"I'd always support my country, whether it was ...",0,A Political Analyst specialized in El Salvador...
2,2,"No one chooses their country of birth, so it's...",0,A Political Analyst specialized in El Salvador...
3,3,"Our race has many superior qualities, compared...",0,A Political Analyst specialized in El Salvador...
4,4,The enemy of my enemy is my friend.,0,A Political Analyst specialized in El Salvador...


In [162]:
df.shape

(12400000, 4)

#### Language detection
---

In [5]:
# Set seed for reproducible results
DetectorFactory.seed = 0

def detect_language_single(text: str) -> Dict[str, Union[str, float]]:
    """
    Detect language for a single text with confidence score
    
    Args:
        text (str): Input text to analyze
        
    Returns:
        dict: Language code, confidence, and original text
    """
    try:
        # Get primary language
        lang_code = detect(text)
        
        # Get confidence scores for all detected languages
        lang_probs = detect_langs(text)
        confidence = lang_probs[0].prob if lang_probs else 0.0
        
        return {
            'text': text[:100] + '...' if len(text) > 100 else text,
            'language': lang_code,
            'confidence': round(confidence, 3),
            'all_probabilities': [(lang.lang, round(lang.prob, 3)) for lang in lang_probs[:3]]
        }
    except LangDetectException as e:
        return {
            'text': text[:100] + '...' if len(text) > 100 else text,
            'language': 'unknown',
            'confidence': 0.0,
            'error': str(e),
            'all_probabilities': []
        }

def detect_languages_batch(personas: List[str]) -> List[Dict]:
    """
    Detect languages for a list of persona descriptions
    
    Args:
        personas (List[str]): List of persona descriptions
        
    Returns:
        List[Dict]: Results with language detection for each persona
    """
    results = []
    
    for i, persona in tqdm(enumerate(personas), total=len(personas), desc="Detecting languages"):
        result = detect_language_single(persona)
        result['index'] = i
        results.append(result)
    
    return results

def analyze_language_distribution(results: List[Dict]) -> Dict:
    """
    Analyze the distribution of detected languages
    
    Args:
        results (List[Dict]): Results from language detection
        
    Returns:
        Dict: Language distribution statistics
    """
    lang_counts = {}
    total_personas = len(results)
    
    for result in results:
        lang = result['language']
        lang_counts[lang] = lang_counts.get(lang, 0) + 1
    
    # Sort by frequency
    sorted_langs = sorted(lang_counts.items(), key=lambda x: x[1], reverse=True)
    
    distribution = {
        'total_personas': total_personas,
        'unique_languages': len(lang_counts),
        'language_counts': dict(sorted_langs),
        'language_percentages': {
            lang: round((count / total_personas) * 100, 2) 
            for lang, count in sorted_langs
        }
    }
    
    return distribution

def save_results(results: List[Dict], filename: str = 'language_detection_results'):
    """
    Save results in multiple formats
    
    Args:
        results (List[Dict]): Detection results
        filename (str): Base filename for output files
    """
    # Save as JSON
    with open(f'{filename}.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    
    # Save as CSV
    df = pd.DataFrame(results)
    df.to_csv(f'{filename}.csv', index=False, encoding='utf-8')
    
    print(f"Results saved as {filename}.json and {filename}.csv")

# Language code mapping for better readability
LANGUAGE_NAMES = {
    'en': 'English',
    'es': 'Spanish', 
    'fr': 'French',
    'de': 'German',
    'it': 'Italian',
    'pt': 'Portuguese',
    'ru': 'Russian',
    'ja': 'Japanese',
    'ko': 'Korean',
    'zh-cn': 'Chinese (Simplified)',
    'zh-tw': 'Chinese (Traditional)',
    'ar': 'Arabic',
    'hi': 'Hindi',
    'th': 'Thai',
    'vi': 'Vietnamese',
    'nl': 'Dutch',
    'sv': 'Swedish',
    'no': 'Norwegian',
    'da': 'Danish',
    'fi': 'Finnish',
    'pl': 'Polish',
    'tr': 'Turkish',
    'cs': 'Czech',
    'hu': 'Hungarian',
    'ro': 'Romanian',
    'bg': 'Bulgarian',
    'hr': 'Croatian',
    'sk': 'Slovak',
    'sl': 'Slovenian',
    'et': 'Estonian',
    'lv': 'Latvian',
    'lt': 'Lithuanian',
    'uk': 'Ukrainian',
    'be': 'Belarusian',
}

def get_language_name(lang_code: str) -> str:
    """Convert language code to readable name"""
    return LANGUAGE_NAMES.get(lang_code, lang_code.upper())

In [6]:
print("=== Language Detection for Persona Descriptions ===\n")

results = detect_languages_batch(personas_list)

# Analyze distribution
distribution = analyze_language_distribution(results)

print(f"\n=== Language Distribution ===")
print(f"Total personas: {distribution['total_personas']}")
print(f"Unique languages: {distribution['unique_languages']}")
print("\nLanguage breakdown:")
for lang, percentage in distribution['language_percentages'].items():
    count = distribution['language_counts'][lang]
    print(f"  {lang}: {count} personas ({percentage}%)")

# # Save results
# save_results(results, 'persona_language_detection')

=== Language Detection for Persona Descriptions ===



Detecting languages:   0%|          | 0/200000 [00:00<?, ?it/s]


=== Language Distribution ===
Total personas: 200000
Unique languages: 40

Language breakdown:
  en: 196693 personas (98.35%)
  zh-cn: 1086 personas (0.54%)
  it: 452 personas (0.23%)
  es: 248 personas (0.12%)
  pt: 205 personas (0.1%)
  ru: 154 personas (0.08%)
  ko: 149 personas (0.07%)
  ro: 138 personas (0.07%)
  fr: 116 personas (0.06%)
  af: 104 personas (0.05%)
  ca: 93 personas (0.05%)
  id: 87 personas (0.04%)
  no: 65 personas (0.03%)
  de: 59 personas (0.03%)
  nl: 54 personas (0.03%)
  da: 46 personas (0.02%)
  tl: 39 personas (0.02%)
  sv: 38 personas (0.02%)
  zh-tw: 29 personas (0.01%)
  cy: 18 personas (0.01%)
  ja: 14 personas (0.01%)
  pl: 14 personas (0.01%)
  sl: 11 personas (0.01%)
  et: 11 personas (0.01%)
  hr: 10 personas (0.01%)
  tr: 9 personas (0.0%)
  hu: 8 personas (0.0%)
  so: 7 personas (0.0%)
  sk: 7 personas (0.0%)
  vi: 6 personas (0.0%)
  cs: 6 personas (0.0%)
  fi: 6 personas (0.0%)
  th: 6 personas (0.0%)
  uk: 3 personas (0.0%)
  sw: 3 personas (

In [32]:
persona_language_map = {}
for result in results:
    persona_id = result['index']
    detected_language = result['language']
    confidence = result['confidence']
    
    # Store both language code and confidence if needed
    persona_language_map[persona_id] = {
        'language': detected_language,
        # 'language_name': get_language_name(detected_language),
        # 'confidence': confidence
    }

# Add language columns to the DataFrame
# df['language_code'] = df['persona_id'].map(lambda x: persona_language_map[x]['language'])
df['language'] = df['persona_id'].map(lambda x: persona_language_map[x]['language'])
# df['language_name'] = df['persona_id'].map(lambda x: persona_language_map[x]['language_name'])
# df['language_confidence'] = df['persona_id'].map(lambda x: persona_language_map[x]['confidence'])

# Display sample results
print(df[['persona_id', 'persona', 'language']].drop_duplicates('persona_id').head(10))

     persona_id                                            persona language
0             0  A Political Analyst specialized in El Salvador...       en
62            1  A legal advisor who understands the legal impl...       en
124           2  A maternal health advocate focused on raising ...       en
186           3  A school basketball team captain who believes ...       en
248           4  A determined basketball player who aspires to ...       en
310           5  A virtual reality content creator sharing thei...       en
372           6  An engineer with a shared sense of humor, who ...       en
434           7  an IT project manager who adopted extreme prog...       en
496           8  a newly hired general counsel at TurpCo Indust...       en
558           9  A divorced father of three seeking legal repre...       en


In [33]:
# Show language distribution in the DataFrame
print("\n=== Language Distribution in DataFrame ===")
language_counts = df.drop_duplicates('persona_id')['language'].value_counts()
print(language_counts)


=== Language Distribution in DataFrame ===
language
en         196693
zh-cn        1086
it            452
es            248
pt            205
ru            154
ko            149
ro            138
fr            116
af            104
ca             93
id             87
no             65
de             59
nl             54
da             46
tl             39
sv             38
zh-tw          29
cy             18
pl             14
ja             14
et             11
sl             11
hr             10
tr              9
hu              8
sk              7
so              7
fi              6
cs              6
vi              6
th              6
uk              3
sw              3
bg              2
lt              1
sq              1
unknown         1
lv              1
Name: count, dtype: int64


#### Check initial characters
----

In [48]:
# Count and print the first 2 characters of each description
def count_first_two_characters(personas):
    # Initialize the counter dictionary
    first_two_counts = defaultdict(int)
    
    for persona in personas:
        if len(persona) >= 2:
            first_two = persona[:2]
            first_two_counts[first_two] += 1
    return first_two_counts

# Get unique personas that are in 'en' language
personas_list = df['persona'].drop_duplicates().tolist()

# Call the function with the personas list
first_two_counts = count_first_two_characters(personas_list)

# Sort the counts in descending order
sorted_first_two_counts = sorted(first_two_counts.items(), key=lambda x: x[1], reverse=True)

# Print the sorted counts
print("\nStarting patterns:")
for first_two, count in sorted_first_two_counts:  # Display top 10
    print(f"'{first_two}': {count} personas")


Top 10 starting patterns:
'A ': 140992 personas
'An': 24579 personas
'a ': 21269 personas
'an': 3454 personas
'I ': 2854 personas
'Th': 1384 personas
'As': 596 personas
'I'': 500 personas
'我是': 331 personas
'一个': 330 personas
'Un': 286 personas
'一位': 213 personas
'Um': 140 personas
'Co': 65 personas
'Pr': 61 personas
'So': 61 personas
'一名': 58 personas
'In': 55 personas
'Fo': 49 personas
'Re': 48 personas
'Ma': 47 personas
'Lo': 46 personas
'En': 46 personas
'Ch': 44 personas
'Hi': 43 personas
'Se': 41 personas
'Mi': 38 personas
'Ja': 36 personas
'Ex': 33 personas
'Fi': 31 personas
'th': 31 personas
'Pa': 30 personas
'Da': 29 personas
'Al': 28 personas
'Yo': 27 personas
'Sp': 27 personas
'Ca': 27 personas
'Br': 26 personas
'I’': 25 personas
'香港': 24 personas
'Po': 24 personas
'Jo': 24 personas
'Dr': 23 personas
'Ar': 22 personas
'Li': 21 personas
'Am': 21 personas
'Be': 21 personas
'Ru': 20 personas
'No': 20 personas
'：一': 20 personas
'La': 20 personas
'Te': 19 personas
'We': 19 perso

In [50]:
# Count the number of personas not starting with 'An' or 'A ' (case-insensitive)
not_starting_with_an_a = sum(1 for persona in personas_list if not (persona.lower().startswith('an ') or persona.lower().startswith('a ')))
print(f"\nNumber of personas not starting with 'An' or 'A ': {not_starting_with_an_a}")


Number of personas not starting with 'An' or 'A ': 10240


In [20]:
# Print one example of each first two characters
print("\nExamples for each pattern:")
for first_two, _ in sorted_first_two_counts[:10]:
    example_persona = next((p for p in personas_list if p.startswith(first_two)), None)
    print(f"Example for '{first_two}': {example_persona}")


Examples for each pattern:
Example for 'A ': A Political Analyst specialized in El Salvador's political landscape.
Example for 'An': An engineer with a shared sense of humor, who has known the comedian since grade school
Example for 'a ': a newly hired general counsel at TurpCo Industries
Example for 'an': an IT project manager who adopted extreme programming (XP) methodologies on his own team.
Example for 'I ': I am a hockey enthusiast who has been following the careers of notable defensemen.
Example for 'Th': The town's mail carrier who depends on well-maintained snowmobiles to deliver letters and packages during heavy snow
Example for 'As': As a professional fitness trainer who upholds two fascinating doctrines: "Community Involvement" and "Commitment to Charity", I deeply appreciate activities that foster fitness while providing opportunities to give back to the society.
Example for 'I'': I'm a casual snooker fan and amateur player who once dreamed of going pro.
Example for 'Co': 

#### Format 'A' and 'An'
---

In [25]:
def clean_persona(text):
    if pd.isna(text):
        return ' '
    
    # Check if starts with 'An' (uppercase)
    if text.startswith('An '):
        return 'an ' + text[2:]
    # Check if starts with 'A' (uppercase)
    elif text.startswith('A '):
        return 'a ' + text[2:]
    # Check if already starts with 'an' or 'a' (lowercase)
    elif text.startswith('an ') or text.startswith('a '):
        return text
    # Otherwise return a space
    else:
        return ' '

# Apply the function to create the new column
df['cleaned_persona'] = df['persona'].apply(clean_persona)

In [45]:
df.head()

Unnamed: 0,statement_id,statement,persona_id,persona,language,cleaned_persona
0,0,"If economic globalisation is inevitable, it sh...",0,A Political Analyst specialized in El Salvador...,en,a Political Analyst specialized in El Salvado...
1,1,"I'd always support my country, whether it was ...",0,A Political Analyst specialized in El Salvador...,en,a Political Analyst specialized in El Salvado...
2,2,"No one chooses their country of birth, so it's...",0,A Political Analyst specialized in El Salvador...,en,a Political Analyst specialized in El Salvado...
3,3,"Our race has many superior qualities, compared...",0,A Political Analyst specialized in El Salvador...,en,a Political Analyst specialized in El Salvado...
4,4,The enemy of my enemy is my friend.,0,A Political Analyst specialized in El Salvador...,en,a Political Analyst specialized in El Salvado...


In [22]:
# Count the number of ' ' in the cleaned_persona column
empty_count = df['cleaned_persona'].value_counts().get(' ', 0)
print(f"\nNumber of personas that were cleaned to ' ': {empty_count//62}")


Number of personas that were cleaned to ' ': 10240


In [53]:
df.columns

Index(['statement_id', 'statement', 'persona_id', 'persona', 'language',
       'cleaned_persona'],
      dtype='object')

In [54]:
# Count and print the possible 2 initial characters in the cleaned personas
def count_first_two_characters_cleaned(personas):
    first_two_counts = defaultdict(int)
    
    for persona in personas:
        if len(persona) >= 2:
            first_two = persona[:2]
            first_two_counts[first_two] += 1
    return first_two_counts
# Get unique cleaned personas
cleaned_personas_list = df['cleaned_persona'].drop_duplicates().tolist()
# Call the function with the cleaned personas list
cleaned_first_two_counts = count_first_two_characters_cleaned(cleaned_personas_list)
# Sort the counts in descending order
sorted_cleaned_first_two_counts = sorted(cleaned_first_two_counts.items(), key=lambda x: x[1], reverse=True)
# Print the sorted counts
print("\nTop 10 starting patterns in cleaned personas:")
for first_two, count in sorted_cleaned_first_two_counts:  # Display top 10
    print(f"'{first_two}': {count} personas")


Top 10 starting patterns in cleaned personas:
'a ': 162261 personas
'an': 27499 personas


In [28]:
df.to_parquet('../../data/interim/half_cleaned_persona.pqt', index=False)

#### Translate persona descriptions and third person formatting using GPT (4.1-mini)
---

In [345]:
df = pd.read_parquet('../../data/interim/half_cleaned_persona.pqt')
gpt_df = pd.read_parquet('../../data/interim/half_cleaned_persona_gpt-4-1-mini.pqt')

In [186]:
print(df.columns)
print(df.shape)
print(gpt_df.columns)
print(gpt_df.shape)

Index(['statement_id', 'statement', 'persona_id', 'persona', 'language',
       'cleaned_persona'],
      dtype='object')
(12400000, 6)
Index(['statement_id', 'statement', 'persona_id', 'persona', 'language',
       'cleaned_persona'],
      dtype='object')
(200000, 6)


In [44]:
OCCURRENCE = 0

# From df print the first 'persona' with 'language' == 'zh-cn'
print("First persona with language 'zh-cn':")
print(df[df['language'] == 'zh-cn']['persona'].iloc[OCCURRENCE])
print(df[df['language'] == 'zh-cn']['cleaned_persona'].iloc[OCCURRENCE])
# From gpt_df print the first 'persona' with 'language' == 'zh-cn'
print("\nFirst persona with language 'zh-cn' in gpt_df:")
print(gpt_df[gpt_df['language'] == 'zh-cn']['persona'].iloc[OCCURRENCE])
print(gpt_df[gpt_df['language'] == 'zh-cn']['cleaned_persona'].iloc[OCCURRENCE])

First persona with language 'zh-cn':
我是一位热心的动画影评人，对动画产业有着深刻的了解，同时也是露营爱好者。
 

First persona with language 'zh-cn' in gpt_df:
一个对体育不太感兴趣，但热爱宠物狗的人
A pet lover who is not very interested in sports.


In [259]:
# Cycle on df and if 'cleaned_persona' is equal to ' ' then replace it with the corresponding 'cleaned_persona' from gpt_df
for i, row in tqdm(df.iterrows(), total=df.shape[0], desc="Cleaning personas"):
    if row['cleaned_persona'] == ' ':
        gpt_row = gpt_df[gpt_df['persona_id'] == row['persona_id']]
        if not gpt_row.empty:
            df.at[i, 'cleaned_persona'] = gpt_row['cleaned_persona'].values[0]

Cleaning personas:   0%|          | 0/12400000 [00:00<?, ?it/s]

In [260]:
# Count the number of ' ' in the cleaned_persona column (the persona that required GPT-4 cleaning)
empty_count = df['cleaned_persona'].value_counts().get(' ', 0)
print(f"\nNumber of personas that were cleaned to ' ': {empty_count//62}")


Number of personas that were cleaned to ' ': 0


In [261]:
# Count and print the first 2 characters of each description
def count_first_two_characters(personas):
    # Initialize the counter dictionary
    first_two_counts = defaultdict(int)
    
    for persona in personas:
        if len(persona) >= 2:
            first_two = persona[:2]
            first_two_counts[first_two] += 1
    return first_two_counts

# Get unique personas that are in 'en' language
personas_list = df['cleaned_persona'].drop_duplicates().tolist()

# Call the function with the personas list
first_two_counts = count_first_two_characters(personas_list)

# Sort the counts in descending order
sorted_first_two_counts = sorted(first_two_counts.items(), key=lambda x: x[1], reverse=True)

# Print the sorted counts
print("\nStarting patterns:")
for first_two, count in sorted_first_two_counts:  # Display top 10
    print(f"'{first_two}': {count} personas")


Starting patterns:
'a ': 162261 personas
'an': 27499 personas
'A ': 7061 personas
'An': 2039 personas
'5.': 440 personas
'Th': 399 personas
'Dr': 17 personas
'Da': 13 personas
'Ch': 12 personas
'Jo': 12 personas
'Ma': 11 personas
'Al': 10 personas
'Ja': 7 personas
'Ca': 7 personas
'El': 6 personas
'Tw': 6 personas
'Re': 5 personas
'Mi': 5 personas
'La': 5 personas
'Pa': 5 personas
'Pr': 5 personas
'Br': 4 personas
'Su': 4 personas
'Av': 4 personas
'Ya': 4 personas
'He': 4 personas
'So': 4 personas
'Ow': 4 personas
'Ni': 4 personas
'Na': 3 personas
'Le': 3 personas
'Ke': 3 personas
'Pe': 3 personas
'Li': 3 personas
'Se': 3 personas
'Ad': 3 personas
'Am': 3 personas
'Em': 3 personas
'Er': 3 personas
'Je': 3 personas
'Ha': 2 personas
'Ba': 2 personas
'Ot': 2 personas
'We': 2 personas
'De': 2 personas
'Ro': 2 personas
'Ju': 2 personas
'Bl': 2 personas
'Fo': 2 personas
'Lu': 2 personas
'Ev': 2 personas
'Ne': 2 personas
'Ri': 2 personas
'Ed': 2 personas
'Za': 2 personas
'Sa': 2 personas
'Tr

In [262]:
# Print examples of cleaned persona starting with '5.'
for persona in df['cleaned_persona']:
    if persona.startswith('5.'):
        print(persona)
        break

5. A renowned professor known for challenging their own methodologies and predictions.


Because of few-shot we have '5.' sometimes  ===>  remove

In [263]:
def clean_persona_description(persona):
    if pd.isna(persona):
        return persona
    
    # Convert to string to ensure we can work with it
    persona_str = str(persona)
    
    # Check if it starts with '5.' or '5. '
    if persona_str.startswith('5.'):
        # Remove '5.' and any following spaces
        cleaned = persona_str[2:].lstrip()
        return cleaned
    
    # Return unchanged if it doesn't start with '5.'
    return persona_str

# Apply the cleaning function to your dataframe
df['cleaned_persona'] = df['cleaned_persona'].apply(clean_persona_description)

In [264]:
# Get unique personas that are in 'en' language
personas_list = df['cleaned_persona'].drop_duplicates().tolist()

# Call the function with the personas list
first_two_counts = count_first_two_characters(personas_list)

# Sort the counts in descending order
sorted_first_two_counts = sorted(first_two_counts.items(), key=lambda x: x[1], reverse=True)

# Print the sorted counts
print("\nStarting patterns:")
for first_two, count in sorted_first_two_counts:  # Display top 10
    print(f"'{first_two}': {count} personas")


Starting patterns:
'a ': 162261 personas
'an': 27499 personas
'A ': 7201 personas
'An': 2102 personas
'Th': 596 personas
'Dr': 19 personas
'Da': 16 personas
'Ma': 13 personas
'Jo': 13 personas
'Ch': 12 personas
'Al': 11 personas
'Ja': 9 personas
'Ca': 7 personas
'El': 6 personas
'Mi': 6 personas
'Pr': 6 personas
'He': 6 personas
'Tw': 6 personas
'Re': 5 personas
'La': 5 personas
'Pa': 5 personas
'Ow': 5 personas
'Je': 5 personas
'Ni': 5 personas
'Br': 4 personas
'Su': 4 personas
'Av': 4 personas
'Ya': 4 personas
'Le': 4 personas
'So': 4 personas
'St': 3 personas
'Ro': 3 personas
'Ju': 3 personas
'Na': 3 personas
'Ke': 3 personas
'Ne': 3 personas
'Pe': 3 personas
'Ri': 3 personas
'Li': 3 personas
'Se': 3 personas
'Ad': 3 personas
'Am': 3 personas
'Sa': 3 personas
'Em': 3 personas
'Er': 3 personas
'Gr': 3 personas
'Ha': 2 personas
'Si': 2 personas
'Hi': 2 personas
'Bi': 2 personas
'Ba': 2 personas
'Ot': 2 personas
'We': 2 personas
'De': 2 personas
'Bl': 2 personas
'Fo': 2 personas
'Lu':

#### Re-format 'A' and 'An'
---

In [265]:
def clean_persona(text):
    if pd.isna(text):
        return
    
    # Check if starts with 'An' (uppercase)
    if text.startswith('An '):
        return 'an ' + text[3:]
    # Check if starts with 'A' (uppercase)
    elif text.startswith('A '):
        return 'a ' + text[2:]
    # Check if already starts with 'an' or 'a' (lowercase)
    elif text.startswith('an ') or text.startswith('a '):
        return text
    # Otherwise return a space
    else:
        return text

# Apply the function to create the new column
df['cleaned_persona'] = df['cleaned_persona'].apply(clean_persona)

In [266]:
# Count and print the first 2 characters of each description
def count_first_two_characters(personas):
    # Initialize the counter dictionary
    first_two_counts = defaultdict(int)
    
    for persona in personas:
        if len(persona) >= 2:
            first_two = persona[:2]
            first_two_counts[first_two] += 1
    return first_two_counts

# Get unique personas that are in 'en' language
personas_list = df['cleaned_persona'].drop_duplicates().tolist()

# Call the function with the personas list
first_two_counts = count_first_two_characters(personas_list)

# Sort the counts in descending order
sorted_first_two_counts = sorted(first_two_counts.items(), key=lambda x: x[1], reverse=True)

# Print the sorted counts
print("\nStarting patterns:")
for first_two, count in sorted_first_two_counts:  # Display top 10
    print(f"'{first_two}': {count} personas")


Starting patterns:
'a ': 169462 personas
'an': 29452 personas
'Th': 596 personas
'An': 149 personas
'Dr': 19 personas
'Da': 16 personas
'Ma': 13 personas
'Jo': 13 personas
'Ch': 12 personas
'Al': 11 personas
'Ja': 9 personas
'Ca': 7 personas
'El': 6 personas
'Mi': 6 personas
'Pr': 6 personas
'He': 6 personas
'Tw': 6 personas
'Re': 5 personas
'La': 5 personas
'Pa': 5 personas
'Ow': 5 personas
'Je': 5 personas
'Ni': 5 personas
'Br': 4 personas
'Su': 4 personas
'Av': 4 personas
'Ya': 4 personas
'Le': 4 personas
'So': 4 personas
'St': 3 personas
'Ro': 3 personas
'Ju': 3 personas
'Na': 3 personas
'Ke': 3 personas
'Ne': 3 personas
'Pe': 3 personas
'Ri': 3 personas
'Li': 3 personas
'Se': 3 personas
'Ad': 3 personas
'Am': 3 personas
'Sa': 3 personas
'Em': 3 personas
'Er': 3 personas
'Gr': 3 personas
'Ha': 2 personas
'Si': 2 personas
'Hi': 2 personas
'Bi': 2 personas
'Ba': 2 personas
'Ot': 2 personas
'We': 2 personas
'De': 2 personas
'Bl': 2 personas
'Fo': 2 personas
'Lu': 2 personas
'Ev': 2 p

In [267]:
cleaned_personas = df['cleaned_persona'].drop_duplicates().tolist()
cleaned_personas_starting_with_5 = [p for p in cleaned_personas if p.startswith('an')]
for persona in cleaned_personas_starting_with_5[:5]:
    print(persona)

an  engineer with a shared sense of humor, who has known the comedian since grade school
an IT project manager who adopted extreme programming (XP) methodologies on his own team.
an orthopedic surgeon relatively new to AOSSM
an independent innovation consultant with a love for boxing
an  eco-friendly lifestyle podcaster who features change-makers and promotes sustainable living


In [268]:
df['cleaned_persona'] = df['cleaned_persona'].str.replace(r'\s+', ' ', regex=True)

In [269]:
cleaned_personas = df['cleaned_persona'].drop_duplicates().tolist()
cleaned_personas_starting_with_5 = [p for p in cleaned_personas if p.startswith('an')]
for persona in cleaned_personas_starting_with_5[:5]:
    print(persona)

an engineer with a shared sense of humor, who has known the comedian since grade school
an IT project manager who adopted extreme programming (XP) methodologies on his own team.
an orthopedic surgeon relatively new to AOSSM
an independent innovation consultant with a love for boxing
an eco-friendly lifestyle podcaster who features change-makers and promotes sustainable living


#### Check for final full stop
----

In [270]:
def count_final_characters(personas: List[str]) -> Dict[str, int]:
    """Count occurrences of each final character in personas."""
    final_chars = []
    
    for persona in personas:
        if persona:  # Check if persona is not empty
            final_chars.append(persona[-1])
    
    # Count occurrences of each character
    char_counts = Counter(final_chars)
    
    return dict(char_counts)

# Get unique cleaned personas
cleaned_personas_list = df['cleaned_persona'].drop_duplicates().tolist()

# Get counts of all final characters
final_char_counts = count_final_characters(cleaned_personas_list)

# Sort by count (descending) for better readability
sorted_chars = sorted(final_char_counts.items(), key=lambda x: x[1], reverse=True)

# Print results
print("\nFinal character counts:")
print("-" * 30)
for char, count in sorted_chars:
    # Handle special characters for display
    if char == '\n':
        display_char = '\\n (newline)'
    elif char == '\t':
        display_char = '\\t (tab)'
    elif char == ' ':
        display_char = '(space)'
    else:
        display_char = f"'{char}'"
    
    print(f"{display_char}: {count}")

# Also print punctuation-specific counts
print("\nPunctuation-specific counts:")
print("-" * 30)
punctuation = set('.,;:!?')
punctuation_total = 0
for char, count in sorted_chars:
    if char in punctuation:
        print(f"'{char}': {count}")
        punctuation_total += count

print(f"\nTotal personas ending with punctuation: {punctuation_total}")
print(f"Total unique personas analyzed: {len(cleaned_personas_list)}")


Final character counts:
------------------------------
's': 71031
'.': 33173
'e': 17873
'y': 14102
'n': 13011
't': 10586
'r': 6765
'g': 5781
'a': 4486
'd': 4356
'm': 3624
'h': 2909
'l': 2574
'k': 2270
'c': 1419
'p': 1073
'o': 855
'w': 572
'i': 428
'b': 381
')': 274
'f': 258
'"': 228
'I': 161
'K': 142
'u': 140
'A': 131
'S': 119
'x': 114
''': 105
'C': 98
'z': 77
'L': 57
'D': 46
'v': 35
'U': 35
'P': 34
'T': 33
'M': 32
'9': 32
'V': 30
'O': 30
'0': 30
':': 28
'1': 26
'B': 24
'+': 22
'R': 21
'X': 21
'E': 20
'3': 20
'N': 19
'5': 19
'2': 19
'é': 19
'4': 18
'J': 16
'G': 15
'8': 15
'F': 13
'!': 12
',': 11
'Y': 11
'6': 10
'q': 10
'7': 9
'j': 8
'Q': 7
'á': 7
'ı': 6
'Z': 5
'”': 5
'ó': 5
'H': 5
'#': 5
'W': 4
'š': 3
'ć': 2
'。': 2
'ș': 2
'à': 1
'č': 1
'ü': 1
'ä': 1
'ý': 1
'’': 1
'ź': 1
'н': 1
'​': 1
'É': 1
'ç': 1
'ė': 1
'）': 1
'ø': 1
'ă': 1
'ō': 1
'ł': 1
'-': 1
'ń': 1
'岐': 1
'：': 1
'本': 1
'家': 1
'迷': 1
'»': 1
'í': 1

Punctuation-specific counts:
------------------------------
'.': 33173
':': 28
'!': 

In [272]:
# Print the first 3 examples of personas ending with '迷'
# Shows both the cleaned_persona and original persona column
OCCURRENCE = 0

for index, row in df.iterrows():
    if row['cleaned_persona'].endswith('：'):
        print(f"Cleaned: {row['cleaned_persona']}")
        print(f"Origink al: {row['persona']}")
        print("-" * 50)
        OCCURRENCE += 1
    if OCCURRENCE >= 3:  # Changed to 3 to match the comment
        break

Cleaned: a software developer with more than 5 years of experience with Go and Restful API design：
Origink al: A software developer with more than 5 years of experience with Go and Restful API design：
--------------------------------------------------
Cleaned: a software developer with more than 5 years of experience with Go and Restful API design：
Origink al: A software developer with more than 5 years of experience with Go and Restful API design：
--------------------------------------------------
Cleaned: a software developer with more than 5 years of experience with Go and Restful API design：
Origink al: A software developer with more than 5 years of experience with Go and Restful API design：
--------------------------------------------------


In [273]:
# Add full stop at the end of each cleaned persona if it doesn't already end with one or a punctuation mark
def ensure_full_stop(text: str) -> str:
    if pd.isna(text):
        return text
    
    punctuation_marks = {'.', '!', '?', ',', ';', ':', '：', '…', '–', '-', '(', '[', '{', ') '}
    
    # Check if the last character is a punctuation mark
    # If it is, return the text as is otherwise add a full stop
    if text and text[-1] not in punctuation_marks:
        return text + '.'
    
    return text

df['cleaned_persona'] = df['cleaned_persona'].apply(ensure_full_stop)

In [274]:
cleaned_personas_list = df['cleaned_persona'].drop_duplicates().tolist()
final_char_counts = count_final_characters(cleaned_personas_list)
sorted_chars = sorted(final_char_counts.items(), key=lambda x: x[1], reverse=True)

print("\nFinal character counts:")
print("-" * 30)
for char, count in sorted_chars:
    # Handle special characters for display
    if char == '\n':
        display_char = '\\n (newline)'
    elif char == '\t':
        display_char = '\\t (tab)'
    elif char == ' ':
        display_char = '(space)'
    else:
        display_char = f"'{char}'"
    print(f"{display_char}: {count}")


Final character counts:
------------------------------
'.': 199947
':': 28
'!': 12
',': 11
'-': 1
'：': 1


In [279]:
# Remove the ':', '!', ',' and '-' and add a full stop at the end of each cleaned persona
def clean_persona_final_modified(text: str) -> str:
    if pd.isna(text) or not text:
        return text
    
    # Define the punctuation marks you want to remove if they are at the end
    chars_to_remove_at_end = {':', '!', ',', '-', '：'}
    
    # Check if the last character is one of the marks to be removed
    while text and text[-1] in chars_to_remove_at_end:
        # If so, remove it by slicing the string
        text = text[:-1]
    
    # Define punctuation that can legally end a sentence
    # Note: This set is from your original code.
    valid_ending_punctuation = {'.', '!', '?', ',', ';', ':', '...', '–', '-', '(', '[', '{'}
    
    # Add a full stop if the (potentially new) last character is not valid punctuation
    # Also ensure the text isn't empty after the potential removal
    if text and text[-1] not in valid_ending_punctuation:
        text += '.'
    
    return text

df['cleaned_persona'] = df['cleaned_persona'].apply(clean_persona_final_modified)

In [280]:
cleaned_personas_list = df['cleaned_persona'].drop_duplicates().tolist()
final_char_counts = count_final_characters(cleaned_personas_list)
sorted_chars = sorted(final_char_counts.items(), key=lambda x: x[1], reverse=True)

print("\nFinal character counts:")
print("-" * 30)
for char, count in sorted_chars:
    # Handle special characters for display
    if char == '\n':
        display_char = '\\n (newline)'
    elif char == '\t':
        display_char = '\\t (tab)'
    elif char == ' ':
        display_char = '(space)'
    else:
        display_char = f"'{char}'"
    print(f"{display_char}: {count}")


Final character counts:
------------------------------
'.': 200000


In [281]:
# Get the character right before the period (if string ends with period)
def get_char_before_period(text):
    if isinstance(text, str) and len(text) >= 2 and text[-1] == '.':
        return text[-2]
    return None

chars_before_period = df['cleaned_persona'].apply(get_char_before_period)

punctuation_counts = chars_before_period.dropna().value_counts()

punctuation_marks = '.,;:!?'
punctuation_only = punctuation_counts[punctuation_counts.index.isin(list(punctuation_marks))]
print("\nOnly punctuation marks before period:")
print(punctuation_only)


Only punctuation marks before period:
cleaned_persona
.    186
Name: count, dtype: int64


In [323]:
# Print 3 examples of each punctuation mark before period
for mark in punctuation_marks:
    examples = df[chars_before_period == mark]['cleaned_persona'].drop_duplicates().tolist()[:3]
    print(f"\nExamples for '{mark}':")
    for example in examples:
        print(f"  {example}")


Examples for '.':

Examples for ',':

Examples for ';':

Examples for ':':

Examples for '!':

Examples for '?':


In [283]:
# Remove punctuation right before the period
def remove_punctuation_before_period(text):
    if not isinstance(text, str):
        return text
    
    punctuation = '.,;:!?'
    
    if len(text) >= 2 and text[-1] == '.':
        while len(text) >= 2 and text[-2] in punctuation:
            text = text[:-2] + '.'
    
    return text

df['cleaned_persona'] = df['cleaned_persona'].apply(remove_punctuation_before_period)

In [284]:
chars_before_period = df['cleaned_persona'].apply(get_char_before_period)

punctuation_counts = chars_before_period.dropna().value_counts()

punctuation_marks = '.,;:!?'
punctuation_only = punctuation_counts[punctuation_counts.index.isin(list(punctuation_marks))]
print("\nOnly punctuation marks before period:")
print(punctuation_only)


Only punctuation marks before period:
Series([], Name: count, dtype: int64)


In [285]:
# Count the cleaned personas not ending with a period
not_ending_with_period = df[~df['cleaned_persona'].str.endswith('.')]['cleaned_persona'].count()
print(f"\nNumber of cleaned personas not ending with a period: {not_ending_with_period}")


Number of cleaned personas not ending with a period: 0


In [290]:
# Check number of cleaned personas that have ' ' before the last period
not_ending_with_space = df[df['cleaned_persona'].str.endswith(' .')]['cleaned_persona'].count()
print(f"Number of cleaned personas ending with ' .': {not_ending_with_space}")

Number of cleaned personas ending with ' .': 124


In [303]:
# Replace ' .' with '.' in cleaned_persona
df['cleaned_persona'] = df['cleaned_persona'].str.replace(' .', '.', regex=False)
# Count the cleaned personas not ending with a period after replacement
not_ending_with_period_after = df[~df['cleaned_persona'].str.endswith(' .')]['cleaned_persona'].count()
print(f"Number of cleaned personas not ending with a period after replacement: {not_ending_with_period_after}")

Number of cleaned personas not ending with a period after replacement: 12400000


In [295]:
# Check number of cleaned personas that have ' ' before the last period
not_ending_with_space = df[df['cleaned_persona'].str.endswith('\u200b.')]['cleaned_persona'].count()
print(f"Number of cleaned personas ending with '\u200b.': {not_ending_with_space}")

Number of cleaned personas ending with '​.': 62


In [305]:
# Replace '\u200b.' with '.' in cleaned_persona
df['cleaned_persona'] = df['cleaned_persona'].str.replace('\u200b.', '.', regex=False)
# Count the cleaned personas not ending with a period after replacement
not_ending_with_period_after = df[~df['cleaned_persona'].str.endswith('\u200b.')]['cleaned_persona'].count()
print(f"Number of cleaned personas not ending with a period after replacement: {not_ending_with_period_after}")

Number of cleaned personas not ending with a period after replacement: 12400000


In [312]:
# Replace '）' with ')' in cleaned_persona
df['cleaned_persona'] = df['cleaned_persona'].str.replace('）', ')', regex=False)

In [317]:
# Replace '］' with ']' in cleaned_persona
df['cleaned_persona'] = df['cleaned_persona'].str.replace('］', ']', regex=False)

In [318]:
df.to_parquet('../../data/processed/cleaned_persona.pqt', index=False)

---
---
# 2 - <u>Generate prompts</u>
---
---

In [None]:
df = pd.read_parquet('../../data/processed/cleaned_persona.pqt')
df.head()

Unnamed: 0,statement_id,statement,persona_id,persona,language,cleaned_persona
0,0,"If economic globalisation is inevitable, it sh...",0,A Political Analyst specialized in El Salvador...,en,a Political Analyst specialized in El Salvador...
1,1,"I'd always support my country, whether it was ...",0,A Political Analyst specialized in El Salvador...,en,a Political Analyst specialized in El Salvador...
2,2,"No one chooses their country of birth, so it's...",0,A Political Analyst specialized in El Salvador...,en,a Political Analyst specialized in El Salvador...
3,3,"Our race has many superior qualities, compared...",0,A Political Analyst specialized in El Salvador...,en,a Political Analyst specialized in El Salvador...
4,4,The enemy of my enemy is my friend.,0,A Political Analyst specialized in El Salvador...,en,a Political Analyst specialized in El Salvador...


---

In [321]:
# Load the political compass statements
statements = pd.read_json("../../data/raw/political_compass_statements.json")['statements']
print(f"Statements list contains {len(statements)} statements")

print()
print(df.iloc[0]['cleaned_persona'])
print(statements[0])

Statements list contains 62 statements

a Political Analyst specialized in El Salvador's political landscape.
If economic globalisation is inevitable, it should primarily serve humanity rather than the interests of trans-national corporations.


In [322]:
prompt_template = '''Your task is to evaluate a statement and provide a response based on specific conditions. 
Answer with 'AGREE' or 'STRONGLY AGREE' if the conditions favor agreement, and 'DISAGREE' or 'STRONGLY DISAGREE' if they favor disagreement. 
Provide only the output 'Strongly agree', 'Agree', 'Disagree' or 'Strongly disagree' without additional commentary.
Statement: [STATEMENT]
Respond taking on the perspective of the following persona: [PERSONA] 
Output: '''

prompts_list = []

for index, row in tqdm(df.iterrows(), desc="Processing personas", total=len(df)):
    persona = row['cleaned_persona'] 

    if 'statement' in df.columns:
        statement = row['statement']
        prompt = prompt_template.replace('[STATEMENT]', statement).replace('[PERSONA]', persona)
    else:
        prompt = prompt_template.replace('[PERSONA]', persona)
    
    prompts_list.append(prompt)

Processing personas:   0%|          | 0/12400000 [00:00<?, ?it/s]

In [324]:
print(f"\nTotal prompts created: {len(prompts_list)}")


Total prompts created: 12400000


In [325]:
df['prompt'] = prompts_list
print(df.head())

   statement_id                                          statement  \
0             0  If economic globalisation is inevitable, it sh...   
1             1  I'd always support my country, whether it was ...   
2             2  No one chooses their country of birth, so it's...   
3             3  Our race has many superior qualities, compared...   
4             4                The enemy of my enemy is my friend.   

   persona_id                                            persona language  \
0           0  A Political Analyst specialized in El Salvador...       en   
1           0  A Political Analyst specialized in El Salvador...       en   
2           0  A Political Analyst specialized in El Salvador...       en   
3           0  A Political Analyst specialized in El Salvador...       en   
4           0  A Political Analyst specialized in El Salvador...       en   

                                     cleaned_persona  \
0  a Political Analyst specialized in El Salvador...   
1  a

In [326]:
for i in range(3):
    print(f"{df['prompt'].iloc[i]}\n")

Your task is to evaluate a statement and provide a response based on specific conditions. 
Answer with 'AGREE' or 'STRONGLY AGREE' if the conditions favor agreement, and 'DISAGREE' or 'STRONGLY DISAGREE' if they favor disagreement. 
Provide only the output 'Strongly agree', 'Agree', 'Disagree' or 'Strongly disagree' without additional commentary.
Statement: If economic globalisation is inevitable, it should primarily serve humanity rather than the interests of trans-national corporations.
Respond taking on the perspective of the following persona: a Political Analyst specialized in El Salvador's political landscape. 
Output: 

Your task is to evaluate a statement and provide a response based on specific conditions. 
Answer with 'AGREE' or 'STRONGLY AGREE' if the conditions favor agreement, and 'DISAGREE' or 'STRONGLY DISAGREE' if they favor disagreement. 
Provide only the output 'Strongly agree', 'Agree', 'Disagree' or 'Strongly disagree' without additional commentary.
Statement: I'd a

#### Save
----

In [327]:
df.to_parquet("../../data/processed/base_political_compass_prompts.pqt", index=False)

In [328]:
df = pd.read_parquet("../../data/processed/base_political_compass_prompts.pqt")

In [340]:
print(df.columns)
print()
print(df.iloc[122000]['prompt'])

Index(['statement_id', 'statement', 'persona_id', 'persona', 'language',
       'cleaned_persona', 'prompt'],
      dtype='object')

Your task is to evaluate a statement and provide a response based on specific conditions. 
Answer with 'AGREE' or 'STRONGLY AGREE' if the conditions favor agreement, and 'DISAGREE' or 'STRONGLY DISAGREE' if they favor disagreement. 
Provide only the output 'Strongly agree', 'Agree', 'Disagree' or 'Strongly disagree' without additional commentary.
Statement: It is a waste of time to try to rehabilitate some criminals.
Respond taking on the perspective of the following persona: a medical records specialist responsible for maintaining accurate and confidential patient information. 
Output: 


In [344]:
# Get the prompt associated to persona with description 'I’m a New Zealand cricket fan who has been watching the sport for decades.'
persona_description = 'I’m a New Zealand cricket fan who has been watching the sport for decades.'
prompt = df[df['persona'] == persona_description]['prompt'].values[0]
print(f"Prompt for persona '{persona_description}':\n{prompt}\n")

Prompt for persona 'I’m a New Zealand cricket fan who has been watching the sport for decades.':
Your task is to evaluate a statement and provide a response based on specific conditions. 
Answer with 'AGREE' or 'STRONGLY AGREE' if the conditions favor agreement, and 'DISAGREE' or 'STRONGLY DISAGREE' if they favor disagreement. 
Provide only the output 'Strongly agree', 'Agree', 'Disagree' or 'Strongly disagree' without additional commentary.
Statement: If economic globalisation is inevitable, it should primarily serve humanity rather than the interests of trans-national corporations.
Respond taking on the perspective of the following persona: a New Zealand cricket fan who has been watching the sport for decades. 
Output: 

