# Imports and environment setup
---

In [1]:
# suppress warnings
# NOTE(review): a blanket 'ignore' filter silences every warning category for the
# whole session, including deprecation notices — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# reload all modules every time before executing the Python code
%load_ext autoreload 
%autoreload 2
# render matplotlib figures inline in the notebook output
%matplotlib inline
import os
import sys


import pandas as pd
import utils.s3helpers as s3

print(f'default sys.path: {sys.path}')
# Probably not needed for pycharm but needed for vscode -----------------------------------
# Make the parent of the current working directory importable.
PROJ_ROOT = os.path.abspath(os.pardir)
# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if PROJ_ROOT not in sys.path:
    sys.path.append(PROJ_ROOT)
print(f'Project root: {PROJ_ROOT}')
print("\n")
# Probably not needed for pycharm but needed for vscode -----------------------------------

from tqdm.auto import tqdm

import torch
# Pick the compute backend in order of preference: CUDA, then MPS, then CPU.
# The MPS check is only evaluated when CUDA is unavailable.
if torch.cuda.is_available():
    backend_name = "cuda"
elif torch.backends.mps.is_available():
    backend_name = "mps"
else:
    backend_name = "cpu"

device = torch.device(backend_name)
print({"cuda": "Using CUDA", "mps": "Using MPS", "cpu": "Using CPU"}[backend_name])

# Smoke test: allocate a one-element tensor on the selected device and show it.
x = torch.ones(1, device=device)
print(x)

def print_gpu_memory():
    """Print a memory summary for every visible CUDA device.

    For each device index reported by ``torch.cuda.device_count()`` this prints
    the device name plus total, allocated, reserved and free memory. Values are
    bytes divided by 1e9 (decimal GB); "free" is total minus reserved. When CUDA
    is unavailable a single notice is printed instead.
    """
    # Guard clause: nothing to report without CUDA.
    if not torch.cuda.is_available():
        print("CUDA is not available. No GPU detected.")
        return

    for i in range(torch.cuda.device_count()):
        # The torch.cuda query functions accept the integer device index directly.
        props = torch.cuda.get_device_properties(i)
        total_memory = props.total_memory / 1e9  # bytes -> GB
        allocated_memory = torch.cuda.memory_allocated(i) / 1e9  # bytes -> GB
        reserved_memory = torch.cuda.memory_reserved(i) / 1e9  # bytes -> GB

        print(f"GPU {i}: {props.name}")
        print(f"  Total Memory: {total_memory:.2f} GB")
        print(f"  Allocated Memory: {allocated_memory:.2f} GB")
        print(f"  Reserved Memory: {reserved_memory:.2f} GB")
        print(f"  Free Memory: {total_memory - reserved_memory:.2f} GB")
        print()


print_gpu_memory()

# Check data in S3 buckets
print("\n")
# s3.list_s3_buckets()

default sys.path: ['/Users/uqpberna/miniconda3/envs/personas/lib/python312.zip', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12/lib-dynload', '', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12/site-packages', '/Users/uqpberna/miniconda3/envs/personas/lib/python3.12/site-packages/setuptools/_vendor']
Project root: /Users/uqpberna/Documents/Code


Using MPS
tensor([1.], device='mps:0')
CUDA is not available. No GPU detected.




---
# Combine sub-DataFrames
---

In [None]:
# Models and persona settings that can be processed by this notebook.
models_id = [
    "Mistral-7B-Instruct-v0.3",
    "Llama-3.1-8B-Instruct",
    "Qwen2.5-7B-Instruct",
    "zephyr-7b-beta",
]
settings = [
    "base",
    "right_authoritarian_personas",
    "left_libertarian_personas",
]

# Combination handled in this run: change the indices to switch model/setting.
model_id, setting = models_id[0], settings[1]
print(f"Using: {model_id}", f"Using: {setting}", sep="\n")

Using: Mistral-7B-Instruct-v0.3
Using: right_authoritarian_personas


In [None]:
data_prefix = f"./data/interim/{model_id}/{setting}/sub_dfs"

# 20 files per combination of model and setting: 12,400,000 rows split into
# files of 10,000 personas (620,000 rows) each.
N_SUB_DF_FILES = 20

# Read every batch file first and concatenate once. Calling pd.concat inside
# the loop re-copies the whole accumulated frame on every iteration.
sub_dfs = [
    pd.read_parquet(f"{data_prefix}/df_b{i}_p10000.pqt")
    for i in tqdm(range(N_SUB_DF_FILES), desc="loading sub_dfs")
]
combined_df = pd.concat(sub_dfs, ignore_index=True)
del sub_dfs  # drop the per-file frames so only the combined copy stays in memory

print(combined_df.shape)

In [3]:
# Sanity check: count the distinct persona ids present in the combined frame.
n_unique_personas = combined_df['persona_id'].nunique()
print(f"Number of unique persona_ids: {n_unique_personas}")

Number of unique persona_ids: 200000


In [20]:
# Preview the last ten rows of the combined frame.
last_rows = combined_df.tail(10)
display(last_rows)

Unnamed: 0,statement_id,statement,persona_id,persona,prompt,response
12399990,52,You cannot be moral without being religious.,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree
12399991,53,Charity is better than social security as a me...,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree
12399992,54,Some people are naturally unlucky.,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree
12399993,55,It is important that my child's school instill...,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree
12399994,56,Sex outside marriage is usually immoral.,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree
12399995,57,"A same sex couple in a stable, loving relation...",199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Strongly agree
12399996,58,"Pornography, depicting consenting adults, shou...",199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Agree
12399997,59,What goes on in a private bedroom between cons...,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree
12399998,60,No one can feel naturally homosexual.,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree
12399999,61,These days openness about sex has gone too far.,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree


In [4]:
# Integer encoding of the four response labels, ordered from strongest
# disagreement (0) to strongest agreement (3).
response_mapping = {
    'Strongly disagree': 0,
    'Disagree': 1,
    'Agree': 2,
    'Strongly agree': 3
}

combined_df['int_stance'] = combined_df['response'].map(response_mapping)

# Series.map yields NaN for any response that is not a key of response_mapping.
# Report those rows explicitly so they are not saved unnoticed; print() is used
# because warnings are globally suppressed in this notebook.
unmapped_mask = combined_df['int_stance'].isna()
if unmapped_mask.any():
    unmapped_examples = list(combined_df.loc[unmapped_mask, 'response'].unique()[:10])
    print(
        f"WARNING: {int(unmapped_mask.sum())} rows have a response outside "
        f"response_mapping (int_stance is NaN); examples: {unmapped_examples}"
    )

display(combined_df.tail(10))


Unnamed: 0,statement_id,statement,persona_id,persona,prompt,response,int_stance
12399990,52,You cannot be moral without being religious.,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree,1
12399991,53,Charity is better than social security as a me...,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree,1
12399992,54,Some people are naturally unlucky.,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree,1
12399993,55,It is important that my child's school instill...,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree,1
12399994,56,Sex outside marriage is usually immoral.,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree,1
12399995,57,"A same sex couple in a stable, loving relation...",199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree,1
12399996,58,"Pornography, depicting consenting adults, shou...",199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree,1
12399997,59,What goes on in a private bedroom between cons...,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Strongly agree,3
12399998,60,No one can feel naturally homosexual.,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Strongly disagree,0
12399999,61,These days openness about sex has gone too far.,199999,[TOKEN] backyard grill master who loves to sha...,Your task is to evaluate a statement and provi...,Disagree,1


---
#### Save data
---

In [4]:
# Output directory for the currently selected model/setting combination.
output_prefix = f"./data/interim/{model_id}/{setting}"

In [None]:
# Write the combined frame to a single parquet file and report where it went.
output_path = f"{output_prefix}/persona_compass.pqt"
combined_df.to_parquet(output_path)
print(f"Data saved to: {output_path}")