## Imports

In [None]:
# Standard library imports
import importlib
import os
import re
import time
import json
from typing import List 
import numpy as np
from numpy.linalg import norm
import pandas as pd
import ollama
from openai import OpenAI

# Local imports from tools package
from tools import brikasutils as bu
from tools import shared_utils as utils
from tools.shared_utils import systemMsg, userMsg, assistantMsg
from tools import survey
from tools import persona

# Reload the local modules so that edits made on disk take effect without
# restarting the kernel.
importlib.reload(bu)
importlib.reload(utils)
importlib.reload(survey)
importlib.reload(persona)

# importlib.reload() does not rebind names that were copied out of a module
# with ``from ... import``; re-import the message helpers so they refer to the
# freshly reloaded definitions instead of the previous ones.
from tools.shared_utils import systemMsg, userMsg, assistantMsg

## Load embeddings from file

In [None]:
# Load the pre-computed chunk texts and their embeddings, plus the optional
# side-car info file that records which embedding model produced them.
EMBEDDING_NAMEID = "airidas_finalboss_1"

import json

with open(f"embeddings/{EMBEDDING_NAMEID}_embeddings.json", "r") as f:
    data = json.load(f)
chunks = data["chunks"]
embeddings = data["embeddings"]

# The info file is best-effort: a missing or unreadable file only produces a
# warning. Only file/JSON errors are caught, so interrupts and genuine
# programming errors are no longer silently swallowed.
try:
    with open(f"embeddings/{EMBEDDING_NAMEID}_info.json", "r") as f:
        AUTO_INFO = json.load(f)
except (OSError, json.JSONDecodeError):
    print("WARNING: No Info file found. Make sure embedding model is matching.")
else:
    try:
        EMBED_MODEL = AUTO_INFO["model"]
        chunk_size = AUTO_INFO["CHUNK_SIZE"]
    except (KeyError, TypeError):
        # TypeError covers an info file whose top-level value is not a dict.
        print("WARNING: Info text does not contain model information")

print(f"Chunks:{len(chunks)}, embeds:{len(embeddings)}")

In [None]:
# Experiment configuration for a single prompt-preparation run.
SUBJECT = "Elias"
PROMPT_METHOD = "IMPERSONATE"  # one of: "ARE", "IMPERSONATE"
RETRIAVAL_METHOD = "dynamic"  # one of: "static", "dynamic", "hybrid"

# Optionally replace the in-memory chunks/embeddings with a saved checkpoint.
# CHECKPOINT is not assigned in this cell and must already exist in the session.
import json
# Earlier naming scheme, kept for reference:
# with open(f"data/4_chunks/{EMBEDDING_ID}-{CHUNKS_COUNT_IN_CTX}_{VERSION_ID}-dynamic_embeddings.json", "r") as f:
with open(f"data/4_chunks/{CHECKPOINT}-{RETRIAVAL_METHOD}_embeddings.json", "r") as f:
    data = json.load(f)
chunks, embeddings = data["chunks"], data["embeddings"]
print(f"Chunks:{len(chunks)}, embeds:{len(embeddings)}")

# Unused persona/prompt drafts, kept for reference:
# "PERSONA_TEXT": "Favorite video games are Minecraft, Fortnite, and Call of Duty.",
# "MED_MODULE": " "
# SURVEY_PROMPT = "Determine how much {subject} aggree with the statement. Guestimate how {subject} would answer to the question"

# Short persona description inserted into the "ARE" system prompt.
TINY_MODULE = "You are Elias, a 24 year old business and IT student from Copenhagen, where you now live in a dormatory."

# Identifier used in the file names of the prepared prompts.
PREP_CHECKPOINT = f"{CHECKPOINT}-{RETRIAVAL_METHOD}_{SUBJECT}-{PROMPT_METHOD}"
PREP_CHECKPOINT

In [None]:
# Build the three base messages (SYS_MSG, ASSIST_MSG, USER_MSG) for the chosen
# prompt method. Each variant is kept as Python source text in
# `pre_prompt_template` and exec'd, so its f-strings are resolved against the
# current globals (SURVEY, SUBJECT, METHOD, TINY_MODULE, surv).
#
# Inside the templates every newline meant for the final message must be
# written as a double-escaped "\\n": a single "\n" becomes a real line break
# inside a one-line string literal of the exec'd source and raises SyntaxError.
########################################### Method A ############################################
if PROMPT_METHOD == "IMPERSONATE":
    pre_prompt_template = """
SYS_MSG = {
    "role": "system", 
    "content": f"You are an expert actor, specializing in impersonation of non-famouns people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit {SURVEY} by shadowing chats between the subject and friends. You will be asked to fully immerse yourself in the role, and answer questions from the point of view of the persona. \\n\\n**The persona, which you will be tasked to mimick is named '{SUBJECT}'.** \\n#Context \\n##Chat conversations between the subject and their friends:\\n**From most to least related**\\n"
}
ASSIST_MSG = {
    "role": "assistant",
    "content": f"Understood. I will answer from the point of view of the persona, {SUBJECT}, based on what I could the deduct from the text provided."
}
USER_MSG = {
    "role": "user",
    "content": f"Persona is questioned about their {SURVEY} in an {METHOD}. The persona must choose an appropriate answer to the question below with one of these five given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Persona's answer must only contain the chosen option, without any elaboration, nor introduction."
}
"""
########################################### Method B ###########################################
elif PROMPT_METHOD == "ARE":
    # NOTE(review): "**From most to least related**" sits at the end of the
    # user message here, whereas Method A puts it under the context header of
    # the system message - confirm which placement is intended.
    pre_prompt_template = """
SYS_MSG = {
    "role": "system", 
    "content": f"**{TINY_MODULE}**. You have shared your thoughts, feelings, and experiences through text messages with friedns. Answer the following questions honestly and naturally, as you would in everyday conversations. \\n\\n#Context \\n##Conversations between persona and friends:"
}
ASSIST_MSG = {
    "role": "assistant",
    "content": f"Understood. I am {SUBJECT}, and I will answer the survey to the best of my ability."
}
USER_MSG = {   
    "role": "user",
    "content": f"The survey is about your {SURVEY}. You must choose an appropriate answer to the question below with one of these five given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Your answer must only contain the chosen option, without any elaboration, nor introduction.\\n**From most to least related**\\n"
}
"""
else:
    # Fail loudly instead of exec'ing an undefined or stale template.
    raise ValueError(f"Unknown prompt method: {PROMPT_METHOD}")

exec(pre_prompt_template)

print(f"{SYS_MSG['content']}")

In [None]:
# Snapshot of the settings that define the current experiment run.
# (The duplicated "EMBEDDING_ID" entry was removed; a repeated key in a dict
# literal is silently collapsed into one entry.)
VARIABLES = {
    "Which survey": surv,
    # "Prompt method": f"You are {SUBJECT} vs you will impersonate {SUBJECT}",
    # "Retrieval method": "Dynamic/static/hybrid",
    "RETRIEVAL_PROMPT": RETRIEVAL_PROMPT,
    "EMBEDDING_ID": EMBEDDING_ID,
    "VERSION_ID": VERSION_ID,
    "CHUNK_SIZE": CHUNK_SIZE,
    "OVERLAP_SIZE": OVERLAP_SIZE,
    "EMBED_MODEL": EMBED_MODEL,
    "CHUNKS_COUNT_IN_CTX": CHUNKS_COUNT_IN_CTX,
    "DYNAMIC_CHUNKS_COUNT": len(dynamic_chunks_most_similar),
}

# Prompt builder

In [None]:
# Batch prompt builder (V8).
# For every survey x context budget x retrieval method this cell retrieves the
# chat chunks most similar to a retrieval prompt, builds one chat prompt per
# survey question, writes the prompts to data/temp/ and writes one run
# description per repetition to data/temp/runs/.
PROMPT_METHOD = "IMPERSONATE"
SUBJECT = "airidas"
RETRIAVAL_METHODS = ["static"]  # "dynamic" is also handled below; "hybrid" is not
NUM_RUNS = 3
# MODEL = "llama3-70b"
# MODEL = "llama3-8b"
MODEL = "mixtral-8x22b"
# Context budgets in tokens; 0 means "keep only the single best chunk".
# max_tokens = [7500]
max_tokens = [0, 4000, 7500]


def _limit_chunks_by_tokens(ranked_chunks, max_token):
    """Return the leading chunks of *ranked_chunks* that fit the token budget.

    A budget of 0 keeps exactly the first chunk. Otherwise chunks are taken in
    order until adding the next one would reach or exceed *max_token*; the
    result is empty when even the first chunk does not fit.
    """
    if max_token == 0:
        return ranked_chunks[:1]
    selected = []
    used_tokens = 0
    for candidate in ranked_chunks:
        candidate_tokens = utils.count_tokens(candidate)
        if used_tokens + candidate_tokens >= max_token:
            break
        used_tokens += candidate_tokens
        selected.append(candidate)
    return selected


survs = [survey.KanoSurvey(), survey.PersonalitySurvey()]
for surv in survs:
    # Per-survey settings. SURVEY_TYPE is deliberately left as a 1-tuple
    # (trailing comma) because its consumers index it with [0].
    if isinstance(surv, survey.KanoSurvey):
        DYNAMIC_RETRIEVAL_PROMPTS = list(surv.questions)
        PROMPT_COUNT = 40
        SURVEY_TYPE = "KanoSurvey",
        WHICH_SURVEY = "kano"
        RETRIEVAL_PROMPT = "video game features"
        SURVEY = "video game preferences"
        METHOD = "a Kano survey"
    elif isinstance(surv, survey.PersonalitySurvey):
        DYNAMIC_RETRIEVAL_PROMPTS = list(surv.questions)
        PROMPT_COUNT = 50
        SURVEY_TYPE = "PersonalitySurvey",
        WHICH_SURVEY = "pers"
        RETRIEVAL_PROMPT = "openess conciousness extrovert aggreableness neuroticism"
        SURVEY = "personality traits"
        METHOD = "an OCEAN test"

    for max_token in max_tokens:
        for RETRIAVAL_METHOD in RETRIAVAL_METHODS:
            if RETRIAVAL_METHOD == "dynamic":
                # Dynamic retrieval: every survey question is its own
                # retrieval prompt and gets its own group of chunks.
                dynamic_retrieval_prompts = list(surv.questions)
                dynamic_chunks_most_similar: List[List[str]] = []
                lenn = len(dynamic_retrieval_prompts)

                for progress, prompt in enumerate(dynamic_retrieval_prompts, start=1):
                    print(f"\rPrompt {progress}/{lenn}", end="")

                    prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=prompt)["embedding"]

                    # Rank all chunks by similarity, then cut to the budget.
                    chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)
                    chunks_most_similar = [chunks[embedding[1]] for embedding in chunks_most_similar_embeddings]
                    chunks_most_similar = _limit_chunks_by_tokens(chunks_most_similar, max_token)
                    dynamic_chunks_most_similar.append(chunks_most_similar)
                print(end="\n")

                # Total tokens over all groups, reported as an average per group.
                tokens_in_chunks = sum(
                    utils.count_tokens(chunk)
                    for chunk_group in dynamic_chunks_most_similar
                    for chunk in chunk_group
                )
                # max(..., 1) avoids ZeroDivisionError for a survey without questions.
                print(f"Tokens in average chunk group: {tokens_in_chunks/max(len(dynamic_chunks_most_similar), 1)}")
                final_prompts = []
                # The template is exec'd further down and also stored verbatim
                # in the run description, so its text must not be reformatted.
                prompt_template = """
for question, chunks_most_similar in zip(surv.questions, dynamic_chunks_most_similar):
    p = [
        systemMsg("\\n".join([
            f"You are an expert actor, specializing in impersonation of non-famous people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit {SURVEY} by shadowing chats between the subject and friends. You will be asked to fully immerse yourself in the role, and answer questions from the point of view of the persona. \\n#Context \\n##Chat conversations between the subject and their friends:\\n",
            "\\n\\nNEW CONVERSATION:\\n".join(chunks_most_similar)
        ])),     
        assistantMsg("Understood. I will answer from the point of view of the persona, based on what I could the deduct from the text provided."),
        userMsg("\\n".join([
            f"Persona is questioned about their {SURVEY} in {METHOD}. The persona must choose an appropriate answer to the question below with one of these five given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Persona's answer must only contain the chosen option, without any elaboration, nor introduction.\\n\\n**Your question is:**\\n",
            question,
            "\\nThe persona chooses:"
        ]))]
    final_prompts.append(p)
                """

            elif RETRIAVAL_METHOD == "static":
                # Static retrieval: one survey-wide retrieval prompt, so the
                # same chunks are used for every question.
                prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=RETRIEVAL_PROMPT)["embedding"]
                chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)
                chunks_most_similar = [chunks[embedding[1]] for embedding in chunks_most_similar_embeddings]
                chunks_most_similar = _limit_chunks_by_tokens(chunks_most_similar, max_token)

                # Count total tokens in all selected chunks.
                tokens_in_chunks = sum(utils.count_tokens(chunk) for chunk in chunks_most_similar)
                # max(..., 1) avoids ZeroDivisionError when no chunk fits the budget.
                print(f"Tokens in average chunk group: {tokens_in_chunks/max(len(chunks_most_similar), 1)}")

                del chunks_most_similar_embeddings  # free memory
                final_prompts = []
                # The template is exec'd further down and also stored verbatim
                # in the run description, so its text must not be reformatted.
                prompt_template = """
for question in surv.questions:
    p = [
        systemMsg("\\n".join([
            f"You are an expert actor, specializing in impersonation of non-famous people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit {SURVEY} by shadowing chats between the subject and friends. You will be asked to fully immerse yourself in the role, and answer questions from the point of view of the persona. \\n#Context \\n##Chat conversations between the subject and their friends:\\n",
            "\\n\\nNEW CONVERSATION:\\n".join(chunks_most_similar)
        ])),     
        assistantMsg("Understood. I will answer from the point of view of the persona, based on what I could the deduct from the text provided."),
        userMsg("\\n".join([
            f"Persona is questioned about their {SURVEY} in {METHOD}. The persona must choose an appropriate answer to the question below with one of these five given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Persona's answer must only contain the chosen option, without any elaboration, nor introduction.\\n\\n**Your question is:**\\n",
            question,
            "\\nThe persona chooses:"
        ]))]
    final_prompts.append(p)
                """
            else:
                raise ValueError(f"Unknown retrieval method: {RETRIAVAL_METHOD}")

            # Running the template fills `final_prompts` with one prompt per question.
            exec(prompt_template)

            # Save prompts
            SIM_ID = f"{SUBJECT}-{WHICH_SURVEY}-{RETRIAVAL_METHOD}-{max_token}_{MODEL}_V8"
            bu.quickJSON(final_prompts, f"data/temp/{SIM_ID}_prompts.json")

            # One run description per repetition; all repetitions share the prompts.
            for num_run in range(NUM_RUNS):
                instructions = {
                    "prompt_file": f"batch/prompts/{SIM_ID}_prompts.json",
                    "survey_type": f"{SURVEY_TYPE[0]}",
                    "isLocal": True,
                    "LIMIT": None
                }
                settings = {
                    "model": MODEL,
                    "timeout": 300}
                # chunk_size and overlap_size are not assigned in this cell;
                # they must already exist in the session.
                # In dynamic mode "chunk_count" is the size of the last
                # question's chunk group.
                AUTO_INFO = {
                    "CHUNK_SIZE": chunk_size,
                    "OVERLAP_SIZE": overlap_size,
                    "CTX_limit": max_token,
                    "chunk_count": len(chunks_most_similar),
                    "EMBED_MODEL": EMBED_MODEL,
                    "prompt method": PROMPT_METHOD,
                    "retrieval method": RETRIAVAL_METHOD,
                    "RETRIEVAL_PROMPT": RETRIEVAL_PROMPT,
                    "prompt_count": PROMPT_COUNT,
                    "survey": WHICH_SURVEY,
                    "SUBJECT": SUBJECT,
                    "prompt_template": prompt_template,
                    **utils.describe_prompts(final_prompts)
                }
                bu.quickJSON({"instructions": instructions, "settings": settings, "info": AUTO_INFO}, f"data/temp/runs/{SIM_ID}_{num_run}.json")

### V6 & Hybrid legacy versions

In [None]:
#HYBRID

#             elif RETRIAVAL_METHOD == "hybrid":
#                 # Access global static chunks
#                 hybrid_chunks_most_similar: List[List[str]] = []

#                 # Retrieve chunks for RETRIEVAL_PROMPT
#                 prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=RETRIEVAL_PROMPT)["embedding"]
#                 global_static_chunks_most_similar = [chunks[embedding[1]] for embedding in utils.find_most_similar(prompt_embedding, embeddings)]
#                 # retrieval_prompt_chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)
#                 # static_chunks_most_similar = [chunks[embedding[1]] for embedding in retrieval_prompt_chunks_most_similar_embeddings]
#                 for chunk in global_static_chunks_most_similar:
#                     p.append(chunk)

#                 # Retrieve chunks for prompt
#                 prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=prompt)["embedding"]
#                 prompt_chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)
#                 dynamic_chunks_most_similar = [chunks[embedding[1]] for embedding in prompt_chunks_most_similar_embeddings]

#                 if max_token == 0:
#                     p.append([static_chunks_most_similar[0], dynamic_chunks_most_similar[0]])  # Edge case: 1 chunk from each
                
#                 # Combine chunks while limiting by max_token
#                 combined_chunks = []
#                 cur_tc = 0  # current token count
#                 for chunk in global_static_chunks_most_similar + dynamic_chunks_most_similar:
#                     tk_in_chunk = utils.count_tokens(chunk)
#                     if cur_tc + tk_in_chunk >= max_token:
#                         break
#                     cur_tc += tk_in_chunk
#                     combined_chunks.append(chunk)

#                 hybrid_chunks_most_similar.append(combined_chunks)
#                 tokens_in_chunks = sum(utils.count_tokens(chunk) for chunk in combined_chunks)
#                 print(f"Tokens in average chunk group: {tokens_in_chunks/len(hybrid_chunks_most_similar)}")
#                 #Retrieve^
#                 # for question in surv.questions:
#                 #     p = [question]
#                 # for chunk in chunks_most_similar[i]:
#                 #     p.append(chunk)
#                 #     i += 1
#                 final_prompts = []
#                 prompt_template = """
# p = []
# for chunk in chunks_most_similar:
#     p.append(chunk)

#     p = [
#         systemMsg("\\n".join([
#             f"You are an expert actor, specializing in impersonation of non-famous people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit {SURVEY} by shadowing chats between the subject and friends. You will be asked to fully immerse yourself in the role, and answer questions from the point of view of the persona. \\n#Context \\n##Chat conversations between the subject and their friends:\\n",
#             "\\n\\nNEW CONVERSATION:\\n".join(chunks_most_similar)
#         ])),     
#         assistantMsg("Understood. I will answer from the point of view of the persona, based on what I could the deduct from the text provided."),
#         userMsg("\\n".join([
#             f"Persona is questioned about their {SURVEY} in {METHOD}. The persona must choose an appropriate answer to the question below with one of these five given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Persona's answer must only contain the chosen option, without any elaboration, nor introduction.\\n\\n**Your question is:**\\n",
#             question,
#             "\\nThe persona chooses:"
#         ]))]
#     final_prompts.append(p)
#                 """

In [None]:
# Legacy V6 pipeline: chunk the selected chats, embed every chunk, retrieve
# chunks per retrieval method ("dynamic" / "hybrid"), build one prompt per
# survey question and write the prompts plus a batch schema to
# data/5_monster_prep/.
# Names such as et, surv, EMBED_MODEL, RETRIEVAL_PROMPT, SURVEY, METHOD,
# SURVEY_TYPE, WHICH_SURVEY, PROMPT_COUNT, PROMPT_METHOD and SUBJECT are not
# assigned here and must already exist in the session.
CHUNK_SIZES = [250]
MODELS = ["mixtral:8x22b-instruct-v0.1-q2_K"]
# OVERLAP_SIZES = [5]
overlap_size = 5
RETRIAVAL_METHODS = ["dynamic", "hybrid"]
max_tokens = 10000
for chunk_size in CHUNK_SIZES:
    for MODEL in MODELS:
        # NOTE(review): chunking and embedding below use only chunk_size and
        # EMBED_MODEL, yet are repeated for every entry of MODELS.
        chunks = []
        chunk_token_counts = []
        for chat in et.selectedChats.values():
            messages = list(chat)  # Convert chat iterator to list for easier slicing
            num_messages = len(messages)
            # Sliding window of chunk_size messages, stepping by chunk_size - overlap_size.
            # A chat with fewer than chunk_size messages yields no chunk at all.
            for i in range(0, num_messages - chunk_size + 1, chunk_size - overlap_size):
                chunk = messages[i:i + chunk_size]  # Extract chunk of messages
                # NOTE(review): "\\n" in plain source is a literal backslash followed by "n",
                # not a newline - confirm this separator is intended.
                chunk_text = "\\n".join(str(msg) for msg in chunk)  # Concatenate msgs into a single string
                chunks.append(chunk_text)  # Append chunk to list of chunks
                chunk_token_counts.append(utils.count_tokens(chunk_text))  # Append token count of the chunk
        # Raises ZeroDivisionError when no chunk was produced.
        avg_chunk_token_count = sum(chunk_token_counts) / len(chunk_token_counts)
        # Embed every chunk with the embedding model (one request per chunk).
        embeddings = []
        progress, chunks_len = 0, len(chunks)
        for chunk_text in chunks:
            progress += 1
            print(f"\rChunk {progress}/{chunks_len}", end="")
            embedding = ollama.embeddings(model=EMBED_MODEL, prompt=chunk_text)["embedding"]
            embeddings.append(embedding)
        for RETRIAVAL_METHOD in RETRIAVAL_METHODS:
            if RETRIAVAL_METHOD == "dynamic":
                # Dynamic: each survey question retrieves its own group of chunks.
                dynamic_retrieval_prompts = list(surv.questions)
                dynamic_chunks_most_similar: List[List[str]] = [] 
                progress = 0
                lenn = len(dynamic_retrieval_prompts)
                for prompt in dynamic_retrieval_prompts:
                    progress += 1
                    print(f"\rPrompt {progress}/{lenn}", end="")
                    prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=prompt)["embedding"]
                    # Chunk budget estimated from the average chunk size, minus one chunk.
                    max_chunks_count = int((max_tokens / avg_chunk_token_count)-1)
                    chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)[:max_chunks_count]
                    chunks_most_similar = [chunks[embedding[1]] for embedding in chunks_most_similar_embeddings]
                    tokens_in_chunks = sum(utils.count_tokens(chunk) for chunk in chunks_most_similar)
                    dynamic_chunks_most_similar.append(chunks_most_similar)
                print(end="\n")
                # Recount the tokens over all groups (replaces the per-prompt value above).
                tokens_in_chunks = 0
                for chunks_most_similar in dynamic_chunks_most_similar:
                    for chunk in chunks_most_similar:
                        tokens_in_chunks += utils.count_tokens(chunk)
                del chunks_most_similar_embeddings  # free memory
                print(f"Tokens in average chunk group: {tokens_in_chunks/len(dynamic_chunks_most_similar)}")
                final_prompts = []
                # Source text exec'd below; it is also stored verbatim in AUTO_INFO.
                prompt_template = """
for question, chunks_most_similar in zip(surv.questions, dynamic_chunks_most_similar):
    p = [
        systemMsg("\\n".join([
            f"You are an expert actor, specializing in impersonation of non-famouns people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit {SURVEY} by shadowing chats between the subject and friends. You will be asked to fully immerse yourself in the role, and answer questions from the point of view of the persona. \\n#Context \\n##Chat conversations between the subject and their friends:\\n**From most to least related**\\n",
            "\\n\\nNEW CONVERSATION:\\n".join(chunks_most_similar)
        ])),      
        assistantMsg("Understood. I will answer from the point of view of the persona, based on what I could the deduct from the text provided."),
        userMsg("\\n".join([
            f"Persona is questioned about their {SURVEY} in an {METHOD}. The persona must choose an appropriate answer to the question below with one of these five given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Persona's answer must only contain the chosen option, without any elaboration, nor introduction.\\n\\n**Your question is:**\\n",
            question,
            "\\nThe persona chooses:"
        ]))]
    final_prompts.append(p)
                """
            elif RETRIAVAL_METHOD == "hybrid":
                # Hybrid: half of the chunk budget comes from the survey-wide
                # retrieval prompt, the other half from each individual question.
                prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=RETRIEVAL_PROMPT)["embedding"]
                max_chunks_count = int((max_tokens / avg_chunk_token_count))
                chunks_most_similar_embeddings_static = utils.find_most_similar(prompt_embedding, embeddings)[:max_chunks_count // 2]
                chunks_most_similar_static = [chunks[embedding[1]] for embedding in chunks_most_similar_embeddings_static]

                dynamic_retrieval_prompts = list(surv.questions)
                dynamic_chunks_most_similar: List[List[str]] = []
                progress = 0
                lenn = len(dynamic_retrieval_prompts)
                for prompt in dynamic_retrieval_prompts:
                    progress += 1
                    print(f"\rPrompt {progress}/{lenn}", end="")
                    prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=prompt)["embedding"]
                    chunks_most_similar_embeddings_dynamic = utils.find_most_similar(prompt_embedding, embeddings)[:max_chunks_count // 2]
                    chunks_most_similar_dynamic = [chunks[embedding[1]] for embedding in chunks_most_similar_embeddings_dynamic]
                    dynamic_chunks_most_similar.append(chunks_most_similar_dynamic)
                print(end="\n")

                # Static chunks followed by the flattened dynamic chunks of all questions;
                # used here only for the token statistics.
                chunks_most_similar = chunks_most_similar_static + [chunk for sublist in dynamic_chunks_most_similar for chunk in sublist]
                tokens_in_chunks = sum(utils.count_tokens(chunk) for chunk in chunks_most_similar)
                del chunks_most_similar_embeddings_static, chunks_most_similar_embeddings_dynamic  # free memory

                print(f"Tokens in average chunk group: {tokens_in_chunks / len(chunks_most_similar)}")
                final_prompts = []
                # Source text exec'd below; it is also stored verbatim in AUTO_INFO.
                prompt_template = """
for question, chunks_most_similar_dynamic in zip(surv.questions, dynamic_chunks_most_similar):
    p = [
        systemMsg("\\n".join([
            f"You are an expert actor, specializing in impersonation of non-famouns people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit {SURVEY} by shadowing chats between the subject and friends. You will be asked to fully immerse yourself in the role, and answer questions from the point of view of the persona. \\n#Context \\n##Chat conversations between the subject and their friends:\\n**From most to least related**\\n",
            "\\n\\nNEW CONVERSATION RELATED TO THE SURVEY OVERALL:\\n".join(chunks_most_similar_static),
            "\\n\\nNEW CONVERSATION RELATED TO THE PARTICULAR QUESTION:\\n".join(chunks_most_similar_dynamic)
        ])),      
        assistantMsg("Understood. I will answer from the point of view of the persona, based on what I could the deduct from the text provided."),
        userMsg("\\n".join([
            f"Persona is questioned about their {SURVEY} in an {METHOD}. The persona must choose an appropriate answer to the question below with one of these five given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Persona's answer must only contain the chosen option, without any elaboration, nor introduction.\\n\\n**Your question is:**\\n",
            question,
            "\\nThe persona chooses:"
        ]))]
    final_prompts.append(p)
                """
            # NOTE(review): an unknown method only prints a message; the exec below then
            # runs whatever prompt_template was assigned previously.
            else: print("neither hybrid, nor dynamic")
            # Running the template fills final_prompts with one prompt per question.
            exec(prompt_template)
            prompt_info = utils.describe_prompts_and_print(final_prompts) # Vanity print
            SIM_ID = f"{SUBJECT}-{WHICH_SURVEY}-{RETRIAVAL_METHOD}-{chunk_size}-{str(overlap_size).zfill(2)}-{str(max_chunks_count).zfill(2)}_{MODEL}_V6"
            bu.quickJSON(final_prompts, f"data/5_monster_prep/{SIM_ID}_prompts.json")
            # Description of the batch job that consumes the prompt file.
            instructions = {
                "prompt_file": f"batch/prompts/{SIM_ID}_prompts.json",
                "survey_type": f"{SURVEY_TYPE[0]}",
                "isLocal": True,
                "LIMIT": None
            }
            settings = {
                "model": MODEL,
                "timeout": 300}
            # NOTE(review): describe_prompts([]) is unpacked after prompt_info, so any key the
            # two share takes the value computed for an empty prompt list - confirm intended.
            AUTO_INFO = {
                "CHUNK_SIZE": chunk_size,
                "OVERLAP_SIZE": overlap_size,
                "CHUNKS_COUNT_IN_CTX": max_chunks_count,#chunks_count_in_ctx,
                "CTX_limit": max_tokens,
                "tokens_in_chunks": tokens_in_chunks,
                "model": EMBED_MODEL,
                "prompt method": PROMPT_METHOD,
                "retrieval method": RETRIAVAL_METHOD,
                "retrieval prompt": RETRIEVAL_PROMPT,
                "prompt_count": PROMPT_COUNT,
                "survey": WHICH_SURVEY,
                "subject": SUBJECT,
                "prompt_template": prompt_template,
                **prompt_info,
                **utils.describe_prompts([])
            }
            bu.quickJSON({"instructions": instructions, "settings": settings, "info": AUTO_INFO}, f"data/5_monster_prep/batch-schema/{SIM_ID}_schema.json")

## Cleaning above

### Dynamic

In [None]:
# Dynamic retrieval: each question is paired with its own chunk group.
# The template is source text that is exec'd below; it reads SYS_MSG,
# ASSIST_MSG and USER_MSG and appends one prompt per question to final_prompts.
prompt_template = """
for question, chunks_most_similar in zip(surv.questions, dynamic_chunks_most_similar):
    p = [
        systemMsg("\\n".join([
            f"{SYS_MSG['content']}",
            "\\nNEW CONVERSATION:\\n".join(chunks_most_similar)
        ])),      
        assistantMsg(ASSIST_MSG['content']),
        userMsg("\\n".join([
            f"{USER_MSG['content']}\\n\\n**Your question is:**\\n\\n",
            question,
            "\\nThe persona chooses:"
        ]))]
    final_prompts.append(p)
"""

final_prompts = []  # filled by the exec'd template
exec(prompt_template)

# Print prompt statistics, save the prompts, and report how many were built.
prompt_info = utils.describe_prompts_and_print(final_prompts)
bu.quickJSON(final_prompts, f"data/5_prep/{PREP_CHECKPOINT}_prompts.json")
print(len(final_prompts))

### Static

In [None]:
# Static retrieval: every question shares the same chunks_most_similar.
# The template is source text that is exec'd below; it reads SYS_MSG,
# ASSIST_MSG and USER_MSG and appends one prompt per question to final_prompts.
prompt_template = """
for question in surv.questions:
    p = [
        systemMsg("\\n".join([
            f"{SYS_MSG['content']}",
            "\\nNEW CONVERSATION:\\n".join(chunks_most_similar)
        ])),  
        assistantMsg(ASSIST_MSG['content']),
        userMsg("\\n".join([
            f"{USER_MSG['content']}\\n\\n**Your question is:**\\n\\n",
            question,
            "\\nThe persona chooses:"
        ]))]
    final_prompts.append(p)
"""

final_prompts = []  # filled by the exec'd template
exec(prompt_template)

# Print prompt statistics, save the prompts, and report how many were built.
prompt_info = utils.describe_prompts_and_print(final_prompts)
bu.quickJSON(final_prompts, f"data/5_prep/{PREP_CHECKPOINT}_prompts.json")
print(len(final_prompts))

### Base (no persona)

In [None]:
# Baseline prompts without any persona or retrieved context: one prompt per
# survey question, answered from the model's own point of view.
final_prompts = []
# The template is source text that is exec'd below. The first system-message
# string needs its f-prefix, otherwise "{SURVEY}" reaches the model literally.
# NOTE(review): systemMsg/userMsg receive two positional arguments here, while
# every other cell passes a single joined string - confirm the helpers accept
# multiple parts.
prompt_template = """
for question in surv.questions:
    p = [
        systemMsg(
            f"You are participating in a survey. You will be presented with a series of questions about your {SURVEY}.",
            f"You must choose answer to the question below with one of the five options: {', '.join(surv.POSSIBLE_ANSWERS)}. The answer must only contain the chosen option. "
        ),
        assistantMsg('Understood. I will answer the question below with one of the given options.'),
        userMsg(
            question,
            "Your choice: "
        )]
    final_prompts.append(p)
"""
exec(prompt_template)
prompt_info = utils.describe_prompts_and_print(final_prompts) # Vanity print
bu.quickJSON(final_prompts, f"data/5_prep/{WHICH_SURVEY}_base_prompts.json")

In [None]:
# Batch schema for the baseline (no persona) run.
MODEL = "llama3"

# The prompt-builder cell assigns SURVEY_TYPE with a trailing comma, which
# makes it a 1-tuple; unwrap it so the schema holds the bare survey name
# rather than the tuple's repr. A plain string is passed through unchanged.
survey_type = SURVEY_TYPE if isinstance(SURVEY_TYPE, str) else SURVEY_TYPE[0]

instructions = {
    # File name matches the "<survey>_base_prompts.json" written by the
    # base-prompt cell (previously the singular "_base_prompt.json").
    "prompt_file": f"batch/prompts/{WHICH_SURVEY}_base_prompts.json",
    "survey_type": f"{survey_type}",
    "isLocal": True,
    "LIMIT": None
}
settings = {
    "model": MODEL,
    "timeout": 300
}
# NOTE(review): describe_prompts([]) describes an empty prompt list; the V8
# builder passes final_prompts instead - confirm which is intended here.
AUTO_INFO = {
    "survey": WHICH_SURVEY,
    "prompt_template": prompt_template,
    **utils.describe_prompts([])
    }
bu.quickJSON({"instructions": instructions, "settings": settings, "info": AUTO_INFO}, f"data/5_prep/{WHICH_SURVEY}_base_batch-schema.json")