# Import
---

In [None]:
# Suppress warnings: the 'ignore' action is installed without any category or
# message restriction, so it applies to every warning raised after this point.
import warnings

warnings.filterwarnings('ignore')

# reload all modules every time before executing the Python code
%load_ext autoreload 
%autoreload 2
%matplotlib inline
import os
import sys
import ast

import io
from io import BytesIO
import boto3

import pandas as pd
import numpy as np
from nltk.util import ngrams
from collections import Counter
import utils.s3helpers as s3
from datasets import load_dataset
from sagemaker.huggingface.model import HuggingFaceModel
from sagemaker.async_inference import AsyncInferenceConfig
import sagemaker

print(f'default sys.path: {sys.path}')
# Probably not needed for pycharm but needed for vscode -----------------------------------
# Make the project root importable. The root is taken to be the parent of the
# current working directory, resolved to an absolute path.
PROJ_ROOT = os.path.abspath(os.pardir)
# Only append when missing: this cell gets re-executed, and an unconditional
# append would add the same entry to sys.path on every run.
if PROJ_ROOT not in sys.path:
    sys.path.append(PROJ_ROOT)
# NOTE(review): `utils.s3helpers` is imported earlier in this cell, before this
# path tweak. If `utils` lives under PROJ_ROOT, that import would fail on a
# fresh kernel -- confirm, and move the import below this block if so.
print(f'Project root: {PROJ_ROOT}')
print("\n")
# Probably not needed for pycharm but needed for vscode -----------------------------------

from pathlib import Path
import re
import json
from tqdm.auto import tqdm
import ast

from huggingface_hub import login

# Do not hard-code the Hugging Face access token in the notebook; read it from
# the HF_TOKEN environment variable instead. When the variable is unset or
# empty, `token=None` lets `login` fall back to its interactive prompt rather
# than submitting an empty string as the token.
hf_token = os.environ.get("HF_TOKEN") or None
login(token=hf_token)
print("\n")

# from vllm import SamplingParams
# from vllm import LLM
# import multiprocessing
# multiprocessing.set_start_method('spawn', force=True)
# from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
import torch
import torch.multiprocessing as mp
print("\n")
# Choose the compute backend by priority: CUDA first, then MPS, then CPU.
# The conditional chain short-circuits, so MPS is only probed when CUDA is absent.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print({"cuda": "Using CUDA", "mps": "Using MPS", "cpu": "Using CPU"}[device.type])

# Smoke test: allocate a one-element tensor on the selected device and show it.
x = torch.ones(1, device=device)
print(x)

def print_gpu_memory():
    """Print a memory summary for every visible CUDA device.

    For each device index this prints the device name, its total memory, the
    memory currently allocated, the memory currently reserved, and
    ``total - reserved`` labelled as free memory. All figures are bytes divided
    by 1e9 and labelled GB. When CUDA is unavailable a single notice is printed
    instead.

    Returns:
        None. All output goes to stdout.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available. No GPU detected.")
        return

    bytes_per_gb = 1e9
    for i in range(torch.cuda.device_count()):
        # Query by plain integer index: these functions are documented to take an
        # int or torch.device, whereas `torch.cuda.device(i)` is a context manager.
        props = torch.cuda.get_device_properties(i)
        total_memory = props.total_memory / bytes_per_gb
        allocated_memory = torch.cuda.memory_allocated(i) / bytes_per_gb
        reserved_memory = torch.cuda.memory_reserved(i) / bytes_per_gb

        print(f"GPU {i}: {props.name}")
        print(f"  Total Memory: {total_memory:.2f} GB")
        print(f"  Allocated Memory: {allocated_memory:.2f} GB")
        print(f"  Reserved Memory: {reserved_memory:.2f} GB")
        # NOTE(review): "free" is total minus the reserved figure above; if a
        # device-wide number is needed, check `torch.cuda.mem_get_info`.
        print(f"  Free Memory: {total_memory - reserved_memory:.2f} GB")
        print()

# Print the per-GPU memory report once (or a notice when CUDA is unavailable).
print_gpu_memory()

# Check data in S3 buckets
print("\n")
# Presumably lists the reachable buckets -- helper lives in utils.s3helpers.
s3.list_s3_buckets()
print("\n")
# Presumably prints what is stored under the 'Persona/output' prefix.
s3.print_s3_contents('Persona/output')

---

---
# Generate data

#### Load personas from Hugging Face and statements from S3
----

In [None]:
# Load the "persona" configuration of PersonaHub and keep only the `persona`
# field of each training example, as a plain Python list.
# The column is read straight from the dataset: wrapping the whole DatasetDict
# in a pandas DataFrame just to extract one field materialises every record as
# a Python dict first.
persona = list(load_dataset("proj-persona/PersonaHub", "persona")["train"]["persona"])
print(persona[0])
print(f"Number of Persona: {len(persona)}")

# Statements come from the 'statements' key of the JSON document on S3.
statements = s3.read_s3_json("Persona/data/raw/political_compass_statements.json")['statements']  # list containing the 62 political statements
print(f"Statements list contains {len(statements)} statements")

#### Generate prompts with persona
----

In [None]:
prompt_template = "Please respond to the following statement: [STATEMENT] \n Respond taking on the perspective of the following persona: [PERSONA] \n Your response:"

# Build one record per (persona, statement) pair and collect them in a DataFrame
# with columns statement_id, statement, persona_id, persona and prompt.
#
# The loop variable is deliberately not named `persona`: reusing that name
# rebinds the persona list to its last element, so re-running this cell would
# iterate over the characters of a single string.
data = []
for persona_id, persona_text in tqdm(enumerate(persona), desc="Processing persona", total=len(persona)):
    for statement_id, statement in enumerate(statements):
        # Fill both placeholders; the statement is substituted first.
        prompt = prompt_template.replace('[STATEMENT]', statement).replace('[PERSONA]', persona_text)

        record = {
            'statement_id': statement_id,
            'statement': statement,
            'persona_id': persona_id,
            'persona': persona_text,
            'prompt': prompt,
        }

        data.append(record)

df = pd.DataFrame(data)

In [None]:
# Sanity check: print the frame's (rows, columns) shape and its last five rows.
print(df.shape)
print(df.tail(5))

#### Save
----

In [None]:
# Presumably prints what is currently stored under the 'Persona/data' prefix
# (helper lives in utils.s3helpers) -- useful to check before writing new output.
s3.print_s3_contents('Persona/data')

In [None]:
# Key prefix used below when building the path of the saved prompts file.
output_prefix = "Persona/data/processed"

In [None]:
# Hand the prompt DataFrame to the S3 helper (presumably written as Parquet,
# going by the helper name) under `output_prefix`, then echo the same key.
s3.write_s3_parquet(df, f"{output_prefix}/persona_prompts.pqt")
print(f"Data saved to: {output_prefix}/persona_prompts.pqt")