# Import
---

In [2]:
# suppress warnings
import warnings
warnings.filterwarnings('ignore')

# reload all modules every time before executing the Python code
%load_ext autoreload 
%autoreload 2
%matplotlib inline
import os
import sys

import pandas as pd
import utils.s3helpers as s3
from datasets import load_dataset

print(f'default sys.path: {sys.path}')
# Probably not needed for pycharm but needed for vscode -----------------------------------
PROJ_ROOT = os.path.abspath(os.path.join(os.pardir))
sys.path.append(PROJ_ROOT)
print(f'Project root: {PROJ_ROOT}')
print("\n")
# Probably not needed for pycharm but needed for vscode -----------------------------------

from tqdm.auto import tqdm

from datasets import load_dataset
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA")
# If not, check if MPS is available
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS")
# If neither CUDA nor MPS is available, use CPU
else:
    device = torch.device("cpu")
    print("Using CPU")

# Test the device
x = torch.ones(1, device=device)
print(x)

def print_gpu_memory():
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            device = torch.cuda.device(i)
            props = torch.cuda.get_device_properties(device)
            total_memory = props.total_memory / 1e9  # Convert to GB
            allocated_memory = torch.cuda.memory_allocated(device) / 1e9  # Convert to GB
            reserved_memory = torch.cuda.memory_reserved(device) / 1e9  # Convert to GB
            
            print(f"GPU {i}: {props.name}")
            print(f"  Total Memory: {total_memory:.2f} GB")
            print(f"  Allocated Memory: {allocated_memory:.2f} GB")
            print(f"  Reserved Memory: {reserved_memory:.2f} GB")
            print(f"  Free Memory: {total_memory - reserved_memory:.2f} GB")
            print()
    else:
        print("CUDA is not available. No GPU detected.")

print_gpu_memory()

# Check data in S3 buckets
print("\n")
# s3.list_s3_buckets()
# print("\n")
# s3.print_s3_contents('Persona/output')

default sys.path: ['/Users/uqpberna/miniconda3/envs/personas/lib/python312.zip', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12/lib-dynload', '', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12/site-packages', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12/site-packages/setuptools/_vendor']
Project root: /Users/uqpberna/Documents/Code


Using MPS
tensor([1.], device='mps:0')
CUDA is not available. No GPU detected.




---

---
# Generate data

#### Load data from the Face and statements
----

In [9]:
persona = load_dataset("proj-persona/PersonaHub", "persona")
persona = pd.DataFrame(persona)['train'].apply(lambda x: x['persona']).tolist()
print(persona[0])
print(f"Number of Persona: {len(persona)}")

statements = pd.read_json("./data/raw/political_compass_statements.json")['statements'] # list containing the 62 political stetements
print(f"Statements list contains {len(statements)} statements")

print()
print(persona[0])
print(statements[0])

A Political Analyst specialized in El Salvador's political landscape.
Number of Persona: 200000
Statements list contains 62 statements

A Political Analyst specialized in El Salvador's political landscape.
If economic globalisation is inevitable, it should primarily serve humanity rather than the interests of trans-national corporations.


#### Generate prompts with personas
----

In [10]:
prompt_template = '''Your task is to evaluate a statement and provide a response based on specific conditions. 
Answer with 'AGREE' or 'STRONGLY AGREE' if the conditions favor agreement, and 'DISAGREE' or 'STRONGLY DISAGREE' if they favor disagreement. 
Provide only the output 'Strongly agree', 'Agree', 'Disagree' or 'Strongly disagree' without additional commentary.
Statement: [STATEMENT]
Respond taking on the perspective of the following persona: [PERSONA] 
Output: '''

data = []
for persona_id, persona in tqdm(enumerate(persona), desc=f"Processing persona", total=len(persona)):
    for statement_id, statement in enumerate(statements):
        prompt = prompt_template.replace('[STATEMENT]', statement).replace('[PERSONA]', persona)
        
        record = {
            'statement_id': statement_id,
            'statement': statement,
            'persona_id': persona_id,
            'persona': persona,
            'prompt': prompt
        }
        
        data.append(record)

df = pd.DataFrame(data)

Processing persona: 100%|██████████| 200000/200000 [00:10<00:00, 19316.62it/s]


In [11]:
print(df.shape)
print(df.tail(5))

(12400000, 5)
          statement_id                                          statement  \
12399995            57  A same sex couple in a stable, loving relation...   
12399996            58  Pornography, depicting consenting adults, shou...   
12399997            59  What goes on in a private bedroom between cons...   
12399998            60              No one can feel naturally homosexual.   
12399999            61    These days openness about sex has gone too far.   

          persona_id                                            persona  \
12399995      199999  A backyard grill master who loves to share tip...   
12399996      199999  A backyard grill master who loves to share tip...   
12399997      199999  A backyard grill master who loves to share tip...   
12399998      199999  A backyard grill master who loves to share tip...   
12399999      199999  A backyard grill master who loves to share tip...   

                                                     prompt  
12399995  

#### Save
----

In [12]:
df.to_parquet("./data/interim/base_political_compass_prompts.pqt", index=False)

In [None]:
df = pd.read_parquet("./data/interim/base_political_compass_prompts.pqt")

In [4]:
print(df.head)
print()
print(df.iloc[1234564]['prompt'])

<bound method NDFrame.head of           statement_id                                          statement  \
0                    0  If economic globalisation is inevitable, it sh...   
1                    1  I'd always support my country, whether it was ...   
2                    2  No one chooses their country of birth, so it's...   
3                    3  Our race has many superior qualities, compared...   
4                    4                The enemy of my enemy is my friend.   
...                ...                                                ...   
12399995            57  A same sex couple in a stable, loving relation...   
12399996            58  Pornography, depicting consenting adults, shou...   
12399997            59  What goes on in a private bedroom between cons...   
12399998            60              No one can feel naturally homosexual.   
12399999            61    These days openness about sex has gone too far.   

          persona_id                         