# Enclave ID 

This notebook is solely for testing the inputs and outputs of a candidate workflow. Some components will either be completely changed or improved soon.

In [None]:
from gradio_client import Client
import json
import json_repair
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI

In [None]:
# Gradio clients for the HuggingFace Spaces that the ask_* helpers below use as LLM APIs.
LLAMA2_7B_CLIENT = Client("huggingface-projects/llama-2-7b-chat")
MISTRAL_7B_CLIENT = Client("hysts/mistral-7b")


def ask_llama2_7b(
    prompt,
    system_prompt="",
    max_new_tokens=1024,
    temperature=0.6,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
):
    """
    Query the Llama2 7B HuggingFace Space through its "/chat" endpoint.

    Every argument is forwarded positionally, in signature order, to
    ``LLAMA2_7B_CLIENT.predict``; whatever that call returns is returned
    unchanged.
    """
    # The endpoint receives its inputs by position, so the order matters.
    endpoint_inputs = (
        prompt,
        system_prompt,
        max_new_tokens,
        temperature,
        top_p,
        top_k,
        repetition_penalty,
    )
    return LLAMA2_7B_CLIENT.predict(*endpoint_inputs, api_name="/chat")


def ask_mistral_7b(
    prompt,
    max_new_tokens=1024,
    temperature=0.6,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
):
    """
    Query the unofficial Mistral 7B HuggingFace Space through its "/chat" endpoint.

    Every argument is forwarded positionally, in signature order, to
    ``MISTRAL_7B_CLIENT.predict``; whatever that call returns is returned
    unchanged. Unlike the Llama helper, no system prompt is sent.
    """
    # The endpoint receives its inputs by position, so the order matters.
    endpoint_inputs = (
        prompt,
        max_new_tokens,
        temperature,
        top_p,
        top_k,
        repetition_penalty,
    )
    return MISTRAL_7B_CLIENT.predict(*endpoint_inputs, api_name="/chat")


def extract_json(gpt_answer):
    """
    Parse the JSON payload embedded in a model answer.

    Escaped newlines (a backslash followed by ``n``) are first converted to
    real newlines. The span from the first ``{`` to the last ``}`` is then
    handed to ``json_repair.loads``; when no such span exists, the whole
    text is parsed instead.
    """
    unescaped = gpt_answer.replace("\\n", "\n")
    first_brace = unescaped.find("{")
    last_brace = unescaped.rfind("}")

    # Without an opening brace that precedes a closing one, fall back to the full text.
    if first_brace == -1 or last_brace == -1 or first_brace >= last_brace:
        candidate = unescaped
    else:
        candidate = unescaped[first_brace : last_brace + 1]

    return json_repair.loads(candidate)

## Generated generic markers using GPT-4 

These pro/cons markers are supposed to be generic as an indication of what GPT should look at to score the OCEAN traits. 

In [None]:
# This is not finetuned. Only a sample.
# Maps each OCEAN trait name to a "positive" and a "negative" free-text marker.
# The whole dict is interpolated into the classification prompt further down.
# NOTE(review): for the first four traits "positive" describes evidence that the
# trait is present, whereas for neuroticism "positive" describes calm behaviour —
# confirm which polarity the prompt is expected to assume.
markers = {
    "openness": {
        "positive": "Searches and conversations about diverse topics, cultural interests, artistic activities, and philosophical discussions.",
        "negative": "Limited variety in search topics, avoidance of abstract or theoretical discussions in conversations."
    },
    "conscientiousness": {
        "positive": "Searches related to planning, organization, and goal-setting. Conversations about personal achievements, detailed planning, and adherence to schedules.",
        "negative": "Disorganized or last-minute search patterns, conversations indicating procrastination or disinterest in planning."
    },
    "extraversion": {
        "positive": "Online engagement with social events, searches about networking opportunities. Conversations that are energetic, involve many social topics, or planning social events.",
        "negative": "Limited searches about social activities, conversations that are short, infrequent, or reveal a preference for solitude."
    },
    "agreeableness": {
        "positive": "Searches about volunteering, social causes, or how to help others. Conversations that are empathetic, understanding, and cooperative.",
        "negative": "Searches indicating conflict, hostility, or self-centered interests. Conversations that are argumentative, lack empathy, or are overly competitive."
    },
    "neuroticism": {
        "positive": "Lack of excessive searches about worries, fears, or health anxieties. Balanced and calm nature in conversations.",
        "negative": "Frequent searches about health anxieties, fears, or negative outcomes. Conversations that are often worried, stressed, or pessimistic."
    },
}

## Handle data input in a way we can select date ranges

In [None]:
import csv
from datetime import datetime
from collections import defaultdict
from sortedcontainers import SortedList
import os

class Conversation:
    """
    A dated batch of messages.

    ``date`` is given as a ``YYYY-MM-DD`` string and stored as a ``datetime``;
    ``messages`` is stored as passed in. Instances order by ``date``.
    """

    def __init__(self, date, messages):
        parsed_date = datetime.strptime(date, "%Y-%m-%d")
        self.date, self.messages = parsed_date, messages

    def __lt__(self, other):
        # Earlier date sorts first.
        return other.date > self.date

    def __repr__(self):
        date, messages = self.date, self.messages
        return f"Conversation(date='{date}', messages={messages})"

class SearchHistory:
    """
    A dated batch of search entries.

    ``date`` is given as a ``YYYY-MM-DD`` string and stored as a ``datetime``;
    ``searches`` is stored as passed in. Instances order by ``date``.
    """

    def __init__(self, date, searches):
        parsed_date = datetime.strptime(date, "%Y-%m-%d")
        self.date, self.searches = parsed_date, searches

    def __lt__(self, other):
        # Earlier date sorts first.
        return other.date > self.date

    def __repr__(self):
        date, searches = self.date, self.searches
        return f"SearchHistory(date='{date}', searches={searches})"
        

class DataManager:
    """
    Keeps conversations and search histories in sorted containers and
    answers inclusive date-range queries over them.
    """

    def __init__(self):
        self.conversations = SortedList()
        self.search_history = SortedList()

    def add_conversation(self, conversation):
        """Insert one conversation into the sorted store."""
        self.conversations.add(conversation)

    def add_search_history(self, search_history):
        """Insert one search-history item into the sorted store."""
        self.search_history.add(search_history)

    def query_conversations(self, start_date, end_date):
        """Return the conversations dated within [start_date, end_date] (``YYYY-MM-DD`` strings)."""
        lower, upper = (datetime.strptime(bound, "%Y-%m-%d") for bound in (start_date, end_date))
        return [item for item in self.conversations if lower <= item.date <= upper]

    def query_search_history(self, start_date, end_date):
        """Return the search histories dated within [start_date, end_date] (``YYYY-MM-DD`` strings)."""
        lower, upper = (datetime.strptime(bound, "%Y-%m-%d") for bound in (start_date, end_date))
        return [item for item in self.search_history if lower <= item.date <= upper]
        

def read_conversation_csv(file_path, as_str=False):
    """
    Load a conversation CSV and build one ``Conversation`` per distinct date.

    The file is read with ``csv.DictReader`` and must provide the columns
    ``date``, ``sender_name``, ``content`` and ``time``. Rows sharing a
    ``date`` value are grouped, in file order, into that date's messages.

    ``as_str`` is accepted but not used by this function.
    """
    kept_fields = ('sender_name', 'content', 'time')
    messages_by_date = defaultdict(list)

    with open(file_path, newline='') as handle:
        for record in csv.DictReader(handle):
            messages_by_date[record['date']].append(
                {field: record[field] for field in kept_fields}
            )

    conversations_found = []
    for day, day_messages in messages_by_date.items():
        conversations_found.append(Conversation(day, day_messages))
    return conversations_found

def read_search_history_csv(file_path, date):
    """
    Load a search-history CSV and wrap all of its rows in one ``SearchHistory``.

    The file is read with ``csv.DictReader`` and must provide the columns
    ``hour`` and ``title``. Every row is attributed to the caller-supplied
    ``date`` (a ``YYYY-MM-DD`` string). A file without data rows yields an
    empty list.
    """
    with open(file_path, newline='') as handle:
        entries = [
            {'hour': record['hour'], 'title': record['title']}
            for record in csv.DictReader(handle)
        ]

    # All rows belong to the same date, so there is at most one item to return.
    return [SearchHistory(date, entries)] if entries else []

In [None]:
# use a couple of files for testing
data_manager = DataManager()

# One Conversation per distinct date found in this single export.
conversation_data = read_conversation_csv('/home/betogaona7/almendra/Sensity/enclave-llm/assets/conversations_samples/Nicolò Magnante.csv')
for conv in conversation_data:
    data_manager.add_conversation(conv)

# NOTE(review): the file is named 2020-02-07.csv but its rows are tagged with the
# date "2023-11-22" — presumably so they fall inside the 2023 query range used in
# the next cell; confirm.
search_history_data = read_search_history_csv("/home/betogaona7/almendra/Sensity/enclave-llm/assets/search_history_samples/2020-02/2020-02-07.csv", "2023-11-22")
for hist in search_history_data:
    data_manager.add_search_history(hist)

In [None]:
# Inclusive query bounds in YYYY-MM-DD format, as expected by the DataManager query methods.
start_date = '2023-02-22'
end_date = '2023-12-31'
conversations = data_manager.query_conversations(start_date, end_date)
search_history = data_manager.query_search_history(start_date, end_date)

# Dump every matching item (via its __repr__) for a visual sanity check.
print(f"Conversations between {start_date} and {end_date}")
for day_conv in conversations:
    print(day_conv)
    print("\n")

print(f"Search history between {start_date} and {end_date}")
for day_search in search_history:
    print(day_search)
    print("\n")

In [None]:
def get_file_paths(dir_path):
    """
    Recursively collect the paths of all files under ``dir_path`` whose
    name ends with ``.csv``.

    Paths are built by joining each walked directory with the file name, in
    the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(dir_path)
        for name in names
        if name.endswith('.csv')
    ]

In [None]:
# add all data
# Start from a fresh manager so the sample files loaded earlier are not counted twice.
data_manager = DataManager()

conversation_data_dir_path = "/home/betogaona7/almendra/Sensity/enclave-llm/assets/conversations_samples"
search_history_data_dir_path = "/home/betogaona7/almendra/Sensity/enclave-llm/assets/search_history_samples"


conversation_files = get_file_paths(conversation_data_dir_path)
search_history_files = get_file_paths(search_history_data_dir_path)

for file_path in conversation_files:
    file_data = read_conversation_csv(file_path)
    # Add the item read from THIS file. The loop variable must be the one that is
    # inserted; using a name left over from an earlier cell would insert the same
    # stale conversation once per item.
    for conv in file_data:
        data_manager.add_conversation(conv)

for file_path in search_history_files:
    # Search-history files are named after their date (e.g. 2020-02-07.csv), so
    # the file stem is what gets passed as the date.
    search_date = os.path.splitext(os.path.basename(file_path))[0]
    file_data = read_search_history_csv(file_path, search_date)
    for search in file_data:
        data_manager.add_search_history(search)

In [None]:
# Inclusive query bounds (YYYY-MM-DD) for the data set used in the rest of the notebook.
start_date = '2023-04-01'
end_date = '2023-11-30'
conversations = data_manager.query_conversations(start_date, end_date)
search_history = data_manager.query_search_history(start_date, end_date)

# Only the counts are printed here; the per-item dumps are left disabled below.
print(f"Conversations between {start_date} and {end_date}: {len(conversations)}")
#for day_conv in conversations:
    #print(day_conv)
    #print("\n")

print(f"Search history between {start_date} and {end_date}: {len(search_history)}")
#for day_search in search_history:
    #print(day_search)
    #print("\n")

## Prepare for chunking

Since the items in the `conversations` and `search_history` lists are of the `Conversation` and `SearchHistory` types, respectively, we have the flexibility to construct the text strings we wish to use as data, selecting from the available fields as needed. (This also gives us some implicit data cleaning, e.g. by leaving the time out of each message and search entry.)

However, some of these items can be too large, so we need to divide them into smaller segments. Since the goal of this notebook is only to generate inputs and outputs for testing a workflow, I will select only those conversation and search history items that fall below a specified token threshold. The chunking strategy will be defined later.

In [None]:
# Render each Conversation as text, one "sender_name: content" line per message.
conversations_str = [
    "".join(
        f"{message['sender_name']}: {message['content']} \n"
        for message in conversation.messages
    )
    for conversation in conversations
]

In [None]:
# Peek at one rendered conversation; raises IndexError when fewer than 11 exist.
print(conversations_str[10])

In [None]:
# Render each SearchHistory as text: a header line carrying its date, followed by
# one search title per line.
searches_str = [
    f"For the date {search_history_item.date}. The user history is: \n"
    + "".join(f"{search['title']} \n" for search in search_history_item.searches)
    for search_history_item in search_history
]

In [None]:
# Peek at one rendered search history; raises IndexError when fewer than 11 exist.
print(searches_str[10])

In [None]:
import tiktoken

# Look up the tokenizer tiktoken associates with the gpt-3.5-turbo-16k model.
# NOTE(review): _get_number_of_tokens below requests "cl100k_base" by name rather
# than using this object — confirm both refer to the same encoding.
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-16k')
# Bare expression: shows the encoding object as the cell output.
encoding

In [None]:
def _get_number_of_tokens(input):
    """
    Return how many tokens ``input`` encodes to with tiktoken's
    ``cl100k_base`` encoding (the encoding for gpt-4 and gpt-3.5).

    The count can be cross-checked by pasting the text into
    https://platform.openai.com/tokenizer
    """
    encoder = tiktoken.get_encoding("cl100k_base")
    return len(encoder.encode(input))

In [None]:
# Token count of one sample search history; raises IndexError when fewer than 11 exist.
tokens = _get_number_of_tokens(searches_str[10])
# Bare expression: shows the count as the cell output.
tokens

In [None]:
# Maximum token count (inclusive) an item may have to be kept.
tokens_threshold = 1024

print(f"before - conversations: {len(conversations_str)} searches: {len(searches_str)}")

# Items above the threshold are dropped rather than split; no chunking happens here.
filtered_conversations = [conv for conv in conversations_str if _get_number_of_tokens(conv) <= tokens_threshold]
filtered_searches = [search for search in searches_str if _get_number_of_tokens(search) <= tokens_threshold]

print(f"after - conversations: {len(filtered_conversations)} searches: {len(filtered_searches)}")

## Classify each conversation or search history

In [None]:
# Maps the index of a search-history chunk (its position in filtered_searches) to
# the trait levels the model assigned to it. Chunks whose request or parsing
# failed are left out, so the keys may be sparse.
labeled_data = {}
for idx, chunk in enumerate(filtered_searches[:5]):
    # Note that the prompt is not fine-tuned. Our aim is to simulate receiving an output for now, almost like mocking a unit test.
    # This component will be updated later.
    prompt = f"""
    Please analyze the given search history and classify the associated OCEAN personality traits \
    (Openness, Conscientiousness, Extraversion, Agreeableness, Neuroticism) based on predefined markers. 
    
    Assign each trait a level: high, medium, low, or none.
    
    Search History:
    {chunk}
    
    Markers for OCEAN Traits:
    {markers}
    
    Your task is to evaluate the search history against these markers and classify the level of each \
    OCEAN trait. 
    
    Format your response as a JSON object, using the trait names as keys and the assigned levels as values. 
    
    Expected JSON Output Format:
    {{
        "openness": "high/medium/low/none",
        "conscientiousness": "high/medium/low/none",
        "extraversion": "high/medium/low/none",
        "agreeableness": "high/medium/low/none",
        "neuroticism": "high/medium/low/none"
    }}
    
    Note:
    - Replace "high/medium/low/none" with the appropriate level based on your analysis.
    - Ensure that the response strictly adheres to the JSON format specified.
    """
    try:
        response = extract_json(ask_llama2_7b(prompt=prompt))
    except Exception as e:
        # Best-effort labeling: report the failure and skip this chunk. Storing
        # anything here would record an undefined or stale value as its label.
        print(f"llama failed to respond: {e}")
        continue

    if not isinstance(response, dict):
        # The parser can hand back a non-object value when the answer contains
        # no JSON object; such a value cannot be used as a trait mapping.
        print(f"llama returned a non-JSON-object answer for chunk {idx}: {response!r}")
        continue

    print(response)
    labeled_data[idx] = response

In [None]:
# Bare expression: shows the collected labels as the cell output.
labeled_data

## Score according to the labeled data

We truncated the labeled data generation to include only five items, corresponding to the date range from 2023-04-01 to 2023-11-30.

Our goal is to create a score dictionary for each OCEAN trait using the labeled data that contains at least one trait labeled as high and to save these results. Following this, we will replicate the process with a different date range, score the new set, and then update our initial scores to reflect the average score, continuing this process subsequently.

In [None]:
def get_chunks(labeled_data, source_chunks=None):
    """
    Select the chunks for which at least one OCEAN trait was labeled "high".

    Args:
        labeled_data: mapping of chunk index -> mapping of trait name -> level.
        source_chunks: sequence the indices refer to. When omitted, the
            notebook-level ``filtered_searches`` list is used, which keeps
            existing one-argument calls working.

    Returns:
        The selected chunks, each at most once, in ``labeled_data`` order.

    Labels that are not mappings or that miss some traits are tolerated
    instead of raising, and the level comparison ignores case and
    surrounding whitespace.
    """
    if source_chunks is None:
        source_chunks = filtered_searches

    trait_names = ("openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism")

    chunks = []
    for idx, traits in labeled_data.items():
        if not isinstance(traits, dict):
            # A label that is not a trait mapping carries no usable levels.
            continue
        if any(str(traits.get(name, "")).strip().lower() == "high" for name in trait_names):
            chunks.append(source_chunks[idx])
    return chunks

In [None]:
# Keep only the chunks that get_chunks selects from the labeled data.
selected_chunks = get_chunks(labeled_data)
print(f"chunks: {len(selected_chunks)}")
# Bare expression: shows the selected chunks as the cell output.
selected_chunks

In [None]:
import os

# Paste a key here for local runs only (never commit it). When this is left
# empty, an OPENAI_API_KEY already exported in the environment is kept instead of
# being overwritten with an empty string.
_openai_api_key = ""
if _openai_api_key:
    os.environ['OPENAI_API_KEY'] = _openai_api_key

In [None]:
# TBD may need an additional chunk strategy here

# Prompt for the scoring step. "{chunks}" is the only template variable; the
# doubled braces are literal braces, so the JSON example survives formatting.
# The answer is requested as a JSON object only: an instruction to answer with
# "a single number" would contradict the five-key JSON format requested below.
SCORE_TEMPLATE = """
Please assign a score between 0.0 and 1.0 to represent the level of each OCEAN trait in the following text. 
A score of 0 indicates the complete absence of the trait, while a score of 1 indicates the highest expression of the trait.

text: {chunks}

Output only the JSON object described below, without giving any explanation.

Format your response as a JSON object, using the trait names as keys and the assigned scores as values. 
    
    Expected JSON Output Format:
    {{
        "openness": "float_number",
        "conscientiousness": "float_number",
        "extraversion": "float_number",
        "agreeableness": "float_number",
        "neuroticism": "float_number"
    }}
    
Ensure that the response strictly adheres to the JSON format specified.
"""

In [None]:
# Wrap SCORE_TEMPLATE in a LangChain prompt whose only input variable is "chunks".
score_prompt = PromptTemplate(
    input_variables=["chunks"],
    template=SCORE_TEMPLATE,
)

# Chain that fills the prompt and sends it to the gpt-3.5-turbo-16k chat model.
chain = LLMChain(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo-16k"),
    prompt=score_prompt
)

In [None]:
# NOTE(review): selected_chunks is a list, so it is presumably rendered into the
# prompt with its list repr — confirm, or join the chunks into one text first.
result = chain.run({"chunks": selected_chunks})
# Parse the JSON object out of the model's reply.
result = extract_json(result)
# Bare expression: shows the parsed scores as the cell output.
result