# Setup

## import

In [None]:
# Standard library imports
import importlib
import os
import re
import time
import json
from typing import List 
import numpy as np
from numpy.linalg import norm
import pandas as pd
import ollama
from openai import OpenAI

# Local imports from tools package
from tools import brikasutils as bu
from tools import shared_utils as utils
from tools.shared_utils import systemMsg, userMsg, assistantMsg
from tools import survey
from tools import persona

# Reload modules
importlib.reload(bu)
importlib.reload(utils)
importlib.reload(survey)
importlib.reload(persona)

## personal data

In [None]:
# Encoder object used below to parse, filter, pseudonymize and select chats.
et = persona.PersonaEncoder()

# Chat exports keyed by chat name; each value is the list of paths handed to
# et.parse_fb_messages() for that chat.
path = 'data/01_chats/'
JsonChatsFB = {
    "x": [path + 'x'],
    "y": [path + 'y'],
    "z": [path + 'z']
}

# Sender name that is relabelled "Persona"; every other sender becomes "Friend".
# Kept in one named constant so the raw name is not buried inside the loop.
PERSONA_SENDER_NAME = "Elias Salvador Smidt Torjani"

################ ROUGH PREP ####################
for name, texts in JsonChatsFB.items():
    et.parse_fb_messages(texts, name)

# Filter
et.filter_chats_empty()
et.filter_chats_regex(utils.BLACKLIST_CHAT_REGEX_FILTERS)

# Pseudonymize (only the messages are needed, so iterate the values directly)
for chat in et.chats.values():
    for msg in chat:
        msg.sender = "Persona" if msg.sender == PERSONA_SENDER_NAME else "Friend"
################ ROUGH PREP ####################

for name in JsonChatsFB:
    et.select_chat_full(name)

##subset/mask data to handle too old chats or dominating friend[s]##
# select_chat_limited_by_tokens("elias", 6000)
# if single ^ friend, or if more below.
# for nameid, chat in et.chats.items():
#     et.chats[nameid] = chat[int(len(chat)/3 * 2):]

token_counts = et.count_all_selected_chat_tokens()  # for statistics
print(f"Combined tokens: {sum(token_counts.values())}")

#Optional checkpoint save
# 01=et.output()
# bu.quickTXT(01, filename=f"data/checkpoints/01_{bu.get_timestamp()}")

## creating knowledge base

### settings 

In [1]:
# Model name passed to ollama.embeddings() when embedding chunks.
EMBED_MODEL = "nomic-embed-text"  # alternative: gte-Qwen2-7B-instruct-Q5_K_M-GGUF-8k
# TODO: Handle cases where messages are too far apart in time.
# TODO: Avoid having messages from multiple people in one chunk.

# Chunking parameters
chunkSize = 40     # N of msgs per chunk: 10-90?
overlapSize = 10   # N of overlapping msgs between consecutive chunks: 5-50?

# The chunking loops step through each chat by (chunkSize - overlapSize).
# A zero step makes range() raise and a negative step silently yields no
# chunks, so reject an invalid pair here where the cause is obvious.
if not 0 <= overlapSize < chunkSize:
    raise ValueError(
        f"overlapSize ({overlapSize}) must satisfy 0 <= overlapSize < chunkSize ({chunkSize})"
    )

## generate multiple different chunk versions (e.g., for grid search) ##
# chunkSizes = [75]
# chunkSize = chunkSizes[0]
# overlapSizes = [3]
# overlapSize = overlapSizes[0]

### create chunks

In [None]:
# Initialize lists for storing chunks – and embeddings later
# Chunk texts and their token counts (embeddings are produced in a later cell).
chunks = []
chunkTokenCounts = []

for chat in et.selectedChats.values():
    messages = list(chat)  # materialize so the chat can be sliced by index
    num_messages = len(messages)

    # Slide a fixed-size window over the chat; consecutive windows share
    # `overlapSize` messages.
    # NOTE: only full windows are emitted, so a chat shorter than chunkSize
    # contributes nothing and trailing messages that do not fill a window are
    # left out.
    for i in range(0, num_messages - chunkSize + 1, chunkSize - overlapSize):
        chunk = messages[i:i + chunkSize]
        chunkText = "\n".join(str(msg) for msg in chunk)  # one message per line
        chunks.append(chunkText)
        chunkTokenCounts.append(utils.count_tokens(chunkText))
        #TODO This is where we should handle chunks more intelligently ^.

# Guard the average: with no chunks the plain division raises ZeroDivisionError.
avgChunkTokenCount = (
    sum(chunkTokenCounts) / len(chunkTokenCounts) if chunkTokenCounts else 0
)
embeddings = []
progress, chunksLen = 0, len(chunks)

### create embeddings

In [None]:
# Start from a clean state inside this cell: the list and counter were only
# initialised in the chunking cell, so re-running this cell alone used to
# append duplicate embeddings and keep counting past the chunk total.
embeddings = []
chunksLen = len(chunks)
progress = 0  # stays 0 when there are no chunks
for progress, chunkText in enumerate(chunks, start=1):
    print(f"\rChunk {progress}/{chunksLen}", end="")  # in-place progress line
    embedding = ollama.embeddings(model=EMBED_MODEL, prompt=chunkText)["embedding"]
    embeddings.append(embedding)


#optional extra info
# total_messages = sum(len(chat) for chat in et.selectedChats.values())
# chunks_count = len(chunks)
# avg_chunk_char_len = np.mean([len(chunk) for chunk in chunks])

# print(
#     f"Chunk count: {chunks_count}",
#     # f"Average chunk character length: {round( avg_chunk_char_len)}",
#     f"Rough estimate of tokens per chunk: {round(avg_chunk_char_len / 4)} (4 characters per token)",
#     f"Messagees in input count: {total_messages}",
#     f"Messages in chunks count: {stat_total_msgs_in_chunks}",
#     f"Chunk \ Input ratio: {round(stat_total_msgs_in_chunks / total_messages,2)} (OVERLAP_SIZE={OVERLAP_SIZE})",
#     f"Chunk Python type: {type(chunks[0])}",
#     sep="\n"
# ) 

### **Survey** ################

In [None]:
# Pick the survey object used by the cells below; alternatives are kept
# commented out so they can be swapped in by hand.
# surv = survey.KanoSurvey()
surv = survey.PersonalitySurvey()
# The Five Factors of personality are:
# Openness - How open a person is to new ideas and experiences
# Conscientiousness - How goal-directed, persistent, and organized a person is
# Extraversion - How much a person is energized by the outside world
# Agreeableness - How much a person puts others' interests and needs ahead of their own
# Neuroticism - How sensitive a person is to stress and negative emotional triggers

# surv = survey.buildFairnessPrompts()
# surv = survey.DictatorGameSurvey()
# Preview the first two questions of the selected survey as the cell output.
surv.questions[:2]

___

In [None]:
# TODO: several near-duplicate versions of this chunk/embed cell exist; consolidate them.

chunks = []
chunk_token_counts = []
for chat in et.selectedChats.values():
    messages = list(chat)  # materialize so the chat can be sliced by index
    num_messages = len(messages)
    # Fixed-size sliding window; consecutive windows share `overlapSize` messages.
    for i in range(0, num_messages - chunkSize + 1, chunkSize - overlapSize):
        chunk = messages[i:i + chunkSize]
        # Join with a real newline. The previous "\\n" inserted the two literal
        # characters backslash + n, unlike the other chunking cell.
        chunk_text = "\n".join(str(msg) for msg in chunk)
        chunks.append(chunk_text)
        chunk_token_counts.append(utils.count_tokens(chunk_text))

# Guard the average: with no chunks the plain division raises ZeroDivisionError.
avg_chunk_token_count = (
    sum(chunk_token_counts) / len(chunk_token_counts) if chunk_token_counts else 0
)

# Embed every chunk, printing an in-place progress line.
embeddings = []
chunks_len = len(chunks)
progress = 0  # stays 0 when there are no chunks
for progress, chunk_text in enumerate(chunks, start=1):
    print(f"\rChunk {progress}/{chunks_len}", end="")
    embedding = ollama.embeddings(model=EMBED_MODEL, prompt=chunk_text)["embedding"]
    embeddings.append(embedding)

# Timestamped id so each run writes to its own pair of files.
EMBEDDING_NAMEID = f"03_{bu.get_timestamp()}"
AUTO_INFO = {
    "model": EMBED_MODEL,
    "CHUNK_SIZE": chunkSize,
    "OVERLAP_SIZE": overlapSize,
    "chunks_count": len(chunks),
    "modules_chat": token_counts,
    # Duplicate of "OVERLAP_SIZE"; kept so readers of existing info files keep working.
    "overlap_size": overlapSize,
}
# bu.quickTXT(01, filename=f"data/checkpoints/01_{bu.get_timestamp()}")

# Ensure the output directory exists before writing, as the other
# serialization cell does.
bu.if_dir_not_exist_make("data/03_embeddings")
bu.quickJSON(AUTO_INFO, f"data/03_embeddings/{EMBEDDING_NAMEID}_info.json")
bu.quickJSON({"chunks": chunks, "embeddings": embeddings}, f"data/03_embeddings/{EMBEDDING_NAMEID}_embeddings.json")


# Survey-specific retrieval and prompt settings.
# NOTE(review): the personality RETRIEVAL_PROMPT contains misspellings
# ("openess", "conciousness", "aggreableness"). It is left untouched because it
# is runtime text fed to retrieval; correcting it would change results.
if isinstance(surv, survey.KanoSurvey):
    RETRIEVAL_PROMPT = "video game features"
    DYNAMIC_RETRIEVAL_PROMPTS = list(surv.questions)
    # One-element tuple on purpose: the batch-prep cell reads SURVEY_TYPE[0].
    SURVEY_TYPE = ("KanoSurvey",)
    SURVEY = "video game preferences"
    METHOD = "Kano survey"
    WHICH_SURVEY = "kano"
    PROMPT_LENGTH = 40
elif isinstance(surv, survey.PersonalitySurvey):
    RETRIEVAL_PROMPT = "openess conciousness extrovert aggreableness neuroticism"
    DYNAMIC_RETRIEVAL_PROMPTS = list(surv.questions)
    SURVEY_TYPE = ("PersonalitySurvey",)
    SURVEY = "personality traits"
    METHOD = "OCEAN test"
    WHICH_SURVEY = "pers"
    PROMPT_LENGTH = 50
else:
    # Without this branch the names above stay unset and the code below fails
    # with a NameError that hides the real cause.
    raise TypeError(f"Unsupported survey type: {type(surv).__name__}")

CHUNKS_COUNT_IN_CTX = 10 # Number of nearby chunks to put in context window

########### Serialization ###########
# These values were previously read without ever being assigned in this cell
# (EMBEDDING_ID, CHUNK_SIZE, OVERLAP_SIZE, chunks_count, total_messages,
# stat_total_msgs_in_chunks), so they are derived here from the chunking state.
EMBEDDING_ID = f"{chunkSize}-{overlapSize}"
VERSION_ID = f"8k_{WHICH_SURVEY}" # pers/kano_{ctx tokens}k
CHECKPOINT = f"{EMBEDDING_ID}-{CHUNKS_COUNT_IN_CTX}-{VERSION_ID}"
chunks_count = len(chunks)
total_messages = sum(len(list(chat)) for chat in et.selectedChats.values())
# Assumes `chunks` came from this notebook's fixed-size windowing loop, where
# every chunk holds exactly chunkSize messages - TODO confirm if chunking changes.
stat_total_msgs_in_chunks = chunks_count * chunkSize
AUTO_INFO = {
    "CHUNKS_COUNT_IN_CTX": CHUNKS_COUNT_IN_CTX,
    "EMBEDDING_ID": EMBEDDING_ID,
    "VERSION_ID": VERSION_ID,
    "model": EMBED_MODEL,
    "CHUNK_SIZE": chunkSize,
    "OVERLAP_SIZE": overlapSize,
    "chunks_count": chunks_count,
    "total_messages": total_messages,
    "stat_total_msgs_in_chunks": stat_total_msgs_in_chunks,
    "modules_chat": token_counts,
    "SURVEY and method": f"{SURVEY} and {METHOD}",
    "RETRIEVAL_PROMPT": RETRIEVAL_PROMPT,
    "DYNAMIC_RETRIEVAL_PROMPTS": DYNAMIC_RETRIEVAL_PROMPTS,
}
########### Serialization ###########
########### Serialization ###########


#/



########### Serialization ###########
EMBEDDING_ID = f"{chunkSize}-{overlapSize}"
# These three statistics were previously referenced without being assigned.
chunksCount = len(chunks)
totalMsgs = sum(len(list(chat)) for chat in et.selectedChats.values())
# Assumes `chunks` came from this notebook's fixed-size windowing loop, where
# every chunk holds exactly chunkSize messages - TODO confirm if chunking changes.
stat_total_msgs_in_chunks = chunksCount * chunkSize
AUTO_INFO = {
    "model": EMBED_MODEL,
    "CHUNK_SIZE": chunkSize,
    "OVERLAP_SIZE": overlapSize,
    "chunks_count": chunksCount,
    "total_messages": totalMsgs,
    "stat_total_msgs_in_chunks": stat_total_msgs_in_chunks,
    "modules_chat": token_counts,
}

# Generate embeddings for each chunk
embeddings = []
chunks_len = len(chunks)  # for the progress line
progress = 0  # stays 0 when there are no chunks
for progress, chunk_text in enumerate(chunks, start=1):
    print(f"\rChunk {progress}/{chunks_len}", end="")
    embedding = ollama.embeddings(model=EMBED_MODEL, prompt=chunk_text)["embedding"]
    embeddings.append(embedding)
####################################################
# token counts in all similar chunks
# tokens_in_chunks = 0
# for chunk in chunks_most_similar:
#     tokens_in_chunks += utils.count_tokens(chunk)
# print(f"Tokens in chunks: {tokens_in_chunks}")
####################################################
# Make sure the output directory exists, then write the run info and the
# chunk/embedding pairs side by side under the same id.
bu.if_dir_not_exist_make("data/03_embeddings")
bu.quickJSON(AUTO_INFO, f"data/03_embeddings/{EMBEDDING_ID}_info.json")
bu.quickJSON({"chunks": chunks, "embeddings": embeddings}, f"data/03_embeddings/{EMBEDDING_ID}_embeddings.json")



#/



# --- Simulation run configuration ---
SUBJECT = "eli"
PROMPT_METHOD = "IMPERSONATE"
MODEL = "llama3-70b"  # other options: "llama3-8b", "mixtral-8x22b"
NUM_RUNS = 3
RETRIAVAL_METHODS = ["static"]  # other options: "hybrid", "dynamic"
# presumably the context-token limits to sweep over - TODO confirm
max_tokens = [0, 4000, 7500]



## Base

In [None]:
# MODEL = "llama3-70b"
# MODEL = "llama3-8b"
MODEL = "mixtral-8x22b"
RETRIAVAL_METHODS = ["base"]#, "hybrid"]
SUBJECT = "base"
PROMPT_METHOD ="IMPERSONATE"
NUM_RUNS = 3

CHUNK_SIZES = [75]
chunk_size = CHUNK_SIZES[0]
OVERLAP_SIZES = [3]
overlap_size = OVERLAP_SIZES[0]

# These two names were read below (SIM_ID, AUTO_INFO) without being assigned
# in this cell, so a fresh kernel failed with NameError.
# NOTE(review): assumes a "base" run puts no retrieved chunks in context, hence
# 0 for both - confirm against the retrieval cells.
max_chunks_count = 0
max_token = 0

# Ensure the output directories exist before writing (parent first, in case
# the helper does not create intermediate directories).
bu.if_dir_not_exist_make("data/5_monster_prep")
bu.if_dir_not_exist_make("data/5_monster_prep/batch")

survs = [survey.KanoSurvey(), survey.PersonalitySurvey()]
for surv in survs:
    # Survey-specific settings.
    if isinstance(surv, survey.KanoSurvey):
        DYNAMIC_RETRIEVAL_PROMPTS = list(surv.questions)
        PROMPT_COUNT = 40
        SURVEY_TYPE = ("KanoSurvey",)  # one-element tuple; read as SURVEY_TYPE[0] below
        WHICH_SURVEY = "kano"
        RETRIEVAL_PROMPT = "video game features"
        SURVEY = "video game preferences"
        METHOD = "a Kano survey"
    elif isinstance(surv, survey.PersonalitySurvey):
        DYNAMIC_RETRIEVAL_PROMPTS = list(surv.questions)
        PROMPT_COUNT = 50
        SURVEY_TYPE = ("PersonalitySurvey",)
        WHICH_SURVEY = "pers"
        RETRIEVAL_PROMPT = "openess conciousness extrovert aggreableness neuroticism"
        SURVEY = "personality traits"
        METHOD = "an OCEAN test"
    else:
        raise TypeError(f"Unsupported survey type: {type(surv).__name__}")

    for RETRIAVAL_METHOD in RETRIAVAL_METHODS:
        if RETRIAVAL_METHOD == "base":
            final_prompts = []
            # The prompt-building code is kept as a string so the exact code can
            # be stored in AUTO_INFO["prompt_template"]; it is a trusted literal.
            # Fix: the first systemMsg string is now an f-string. Before,
            # "{SURVEY}" was sent to the model as literal text.
            prompt_template = """
for question in surv.questions:
    p = [
        systemMsg(
            f"You are participating in a survey. You will be presented with a series of questions about your {SURVEY}.",
            f"You must choose answer to the question below with one of the five options: {', '.join(surv.POSSIBLE_ANSWERS)}. The answer must only contain the chosen option. "
        ),
        assistantMsg('Understood. I will answer the question below with one of the given options.'),
        userMsg(
            question,
            "Your choice: "
        )]
    final_prompts.append(p)
            """
        else:
            # Previously this only printed and then exec'd whatever template was
            # left over from an earlier iteration (or none at all).
            raise ValueError(f"Unsupported retrieval method: {RETRIAVAL_METHOD!r}")

        # Runs in the notebook's global namespace, filling `final_prompts`.
        exec(prompt_template)

        SIM_ID = f"{SUBJECT}-{WHICH_SURVEY}-{str(max_chunks_count).zfill(2)}_{MODEL}_V7"
        bu.quickJSON(final_prompts, f"data/5_monster_prep/{SIM_ID}_prompts.json")

        # One batch file per repetition; all repetitions share the same prompts file.
        for num_run in range(NUM_RUNS):
            instructions = {
                "prompt_file": f"batch/prompts/{SIM_ID}_prompts.json",
                "survey_type": SURVEY_TYPE[0],
                "isLocal": True,
                "LIMIT": None
            }
            settings = {
                "model": MODEL,
                "timeout": 300}
            AUTO_INFO = {
                "CHUNK_SIZE": chunk_size,
                "OVERLAP_SIZE": overlap_size,
                "CTX_limit": max_token,
                "EMBED_MODEL": EMBED_MODEL,
                "prompt method": PROMPT_METHOD,
                "retrieval method": RETRIAVAL_METHOD,
                "RETRIEVAL_PROMPT": RETRIEVAL_PROMPT,
                "prompt_count": PROMPT_COUNT,
                "survey": WHICH_SURVEY,
                "SUBJECT": SUBJECT,
                "prompt_template": prompt_template,
                "CHUNKS_COUNT_IN_CTX": max_chunks_count,
                **utils.describe_prompts(final_prompts)
            }
            bu.quickJSON({"instructions": instructions, "settings": settings, "info": AUTO_INFO}, f"data/5_monster_prep/batch/{SIM_ID}_{num_run}.json")

## Round up

In [None]:
# Rank the stored embeddings against `prompt_embedding`.
# NOTE(review): `prompt_embedding` is not assigned anywhere in the visible
# notebook, and the result is stored as `chunks_most_similar_embeddings` while
# the loop below iterates `chunks_most_similar` - a different name. Confirm
# what utils.find_most_similar returns and which variable the loop should read.
chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)

# Greedily take chunks in order until the token budget would be reached.
# NOTE(review): this rebinds `max_tokens`, which an earlier cell sets to a list.
max_tokens = 8000
cur_tc = 0 # current token count
selected_chunks = []
for chunk in chunks_most_similar:
    tk_in_chunk = utils.count_tokens(chunk)
    # Stop at the first chunk that would bring the total to max_tokens or more;
    # later (possibly smaller) chunks are not considered.
    if cur_tc + tk_in_chunk >= max_tokens:
        break
    cur_tc += tk_in_chunk
    selected_chunks.append(chunk)
print(f"Tokens in chunks: {cur_tc}")
# Cell output: number of chunks that fit in the budget.
len(selected_chunks)


#/



# Display results
# bu.if_dir_not_exist_make("data/4_chunks")
# bu.quickJSON(AUTO_INFO, f"data/4_chunks/{CHECKPOINT}-dynamic_info.json")
# bu.quickJSON({"chunks": chunks, "embeddings": embeddings}, f"data/4_chunks/{CHECKPOINT}-dynamic_embeddings.json")
############################################ VANITY BELOW ########################################
# Total token count over every chunk in every retrieved chunk group.
tokens_in_chunks = 0
for chunks_most_similar in dynamic_chunks_most_similar:
    for chunk in chunks_most_similar:
        tokens_in_chunks += utils.count_tokens(chunk)

# Free memory. Tolerate the name already being gone (e.g. this cell is re-run),
# which would otherwise raise NameError.
try:
    del chunks_most_similar_embeddings
except NameError:
    pass

# Guard the average: an empty group list would raise ZeroDivisionError.
group_count = len(dynamic_chunks_most_similar)
avg_tokens_per_group = tokens_in_chunks / group_count if group_count else 0
print(f"Tokens in average chunk group: {avg_tokens_per_group}")
# bu.quickJSON(dynamic_chunks_most_similar, filename=f"data/4_chunks/{CHECKPOINT}-dynamic_chunks.json")

In [None]:
import os
import json


def rewrite_model_field(directory, model):
    """Set ``settings.model`` to *model* in every ``.json`` file of *directory*.

    Only files whose top-level JSON object has a ``"settings"`` object with a
    ``"model"`` key are touched, and a file is rewritten only when the value
    actually changes, so unrelated or already-correct files are left as-is.

    Args:
        directory: Path of the folder to scan (not recursive).
        model: New value for ``data["settings"]["model"]``.

    Returns:
        The number of files that were rewritten.

    Raises:
        FileNotFoundError: If *directory* does not exist.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    updated = 0
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(directory, filename)

        # Explicit encoding so the result does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Skip anything that does not have the expected {"settings": {"model": ...}} shape.
        settings = data.get("settings") if isinstance(data, dict) else None
        if not isinstance(settings, dict) or "model" not in settings:
            continue
        if settings["model"] == model:
            continue

        settings["model"] = model
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        updated += 1
    return updated


# Set the directory where the JSON files are located
directory = 'batch/done/monster_7b'  # Replace with the actual directory path if needed

if os.path.isdir(directory):
    print(f"Updated {rewrite_model_field(directory, 'llama3')} file(s) in {directory}")
else:
    print(f"Directory not found, nothing to update: {directory}")

## Scratch

In [None]:
# Rank the stored embeddings against `prompt_embedding`.
# NOTE(review): `prompt_embedding` is not assigned in the visible notebook, and
# the return shape of utils.find_most_similar is not shown here - confirm both.
chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)

In [None]:
# Greedily take chunks in order until the token budget would be reached.
# NOTE(review): iterates `chunks_most_similar`, which is not the name assigned
# by the find_most_similar cell (`chunks_most_similar_embeddings`) - confirm
# which variable is intended. Also rebinds `max_tokens`, which an earlier cell
# sets to a list.
max_tokens = 8000
cur_tc = 0 # current token count
selected_chunks = []
for chunk in chunks_most_similar:
    tk_in_chunk = utils.count_tokens(chunk)
    # Stop at the first chunk that would bring the total to max_tokens or more.
    if cur_tc + tk_in_chunk >= max_tokens:
        break
    cur_tc += tk_in_chunk
    selected_chunks.append(chunk)
print(f"Tokens in chunks: {cur_tc}")
# Cell output: number of chunks that fit in the budget.
len(selected_chunks)