In [69]:
import json
import os
import re
from getpass import getpass

import json_repair
from gradio_client import Client
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI

LLAMA2_7B_CLIENT = Client("huggingface-projects/llama-2-7b-chat")
MISTRAL_7B_CLIENT = Client("hysts/mistral-7b")


def ask_llama2_7b(prompt, 
                  system_prompt="",
                  max_new_tokens=1024, 
                  temperature=0.6,
                  top_p=0.9,
                  top_k=50,
                  repetition_penalty=1.2):
    """
    Query the Llama2 7B HuggingFace Space and return whatever it answers.

    Every argument is forwarded positionally, in signature order, to the
    "/chat" endpoint of LLAMA2_7B_CLIENT.
    """
    generation_args = (
        prompt,
        system_prompt,
        max_new_tokens,
        temperature,
        top_p,
        top_k,
        repetition_penalty,
    )
    return LLAMA2_7B_CLIENT.predict(*generation_args, api_name="/chat")


def ask_mistral_7b(
      prompt, 
      max_new_tokens=1024, 
      temperature=0.6,
      top_p=0.9,
      top_k=50,
      repetition_penalty=1.2):
    """
    Query the unofficial Mistral 7B HuggingFace Space and return whatever it answers.

    Every argument is forwarded positionally, in signature order, to the
    "/chat" endpoint of MISTRAL_7B_CLIENT.
    """
    generation_args = (
        prompt,
        max_new_tokens,
        temperature,
        top_p,
        top_k,
        repetition_penalty,
    )
    return MISTRAL_7B_CLIENT.predict(*generation_args, api_name="/chat")


def extract_json(gpt_answer):
    """
    Parse the JSON object embedded in a model answer.

    Literal backslash-n sequences are first turned into real newlines. When
    the text contains a "{" located before its last "}", only that span is
    parsed; otherwise the whole text is handed to json_repair.loads.
    """
    text = gpt_answer.replace("\\n", "\n")
    opening = text.find("{")
    closing = text.rfind("}")

    # find/rfind return -1 when nothing is found, so this single chained
    # comparison covers "both present" and "opening comes first".
    has_object = 0 <= opening < closing
    candidate = text[opening : closing + 1] if has_object else text
    return json_repair.loads(candidate)

Loaded as API: https://huggingface-projects-llama-2-7b-chat.hf.space ✔
Loaded as API: https://hysts-mistral-7b.hf.space ✔


## Generated generic markers using GPT4

This data only needs to be generated once; it does not need to be dynamic.

In [81]:
# Textual markers for each OCEAN trait, keyed by trait name. Each trait has a
# "positive" and a "negative" description; the whole mapping is interpolated
# into the classification prompts.
# NOTE(review): for neuroticism the "positive" marker describes the *absence*
# of anxious behaviour, whereas for the other traits "positive" describes the
# presence of the trait -- confirm this is the intended reading.
markers = dict(
    openness=dict(
        positive="Searches and conversations about diverse topics, cultural interests, artistic activities, and philosophical discussions.",
        negative="Limited variety in search topics, avoidance of abstract or theoretical discussions in conversations.",
    ),
    conscientiousness=dict(
        positive="Searches related to planning, organization, and goal-setting. Conversations about personal achievements, detailed planning, and adherence to schedules.",
        negative="Disorganized or last-minute search patterns, conversations indicating procrastination or disinterest in planning.",
    ),
    extraversion=dict(
        positive="Online engagement with social events, searches about networking opportunities. Conversations that are energetic, involve many social topics, or planning social events.",
        negative="Limited searches about social activities, conversations that are short, infrequent, or reveal a preference for solitude.",
    ),
    agreeableness=dict(
        positive="Searches about volunteering, social causes, or how to help others. Conversations that are empathetic, understanding, and cooperative.",
        negative="Searches indicating conflict, hostility, or self-centered interests. Conversations that are argumentative, lack empathy, or are overly competitive.",
    ),
    neuroticism=dict(
        positive="Lack of excessive searches about worries, fears, or health anxieties. Balanced and calm nature in conversations.",
        negative="Frequent searches about health anxieties, fears, or negative outcomes. Conversations that are often worried, stressed, or pessimistic.",
    ),
)

## Mock test: trait classification for a single chunk

In [6]:
# Single search-history chunk used to try the classification prompt on one
# example. Each line is "time,action"; the same text is also the first entry
# of mocked_data_chunks further down.
chunk_test_search_history = """
7:26,Searched for how to stay motivated big corp reddit
7:27,Searched for site:news.ycombinator.com large company
7:27,Visited Think before you join any large company. Nothing here is unique to ...
7:28,Visited Why do giant companies do so many hilariously dumb things? As ...
7:29,Visited Ask HN: Startup acquired by a large company and it sucks. What to ...
7:30,"Visited Ask HN: Moving from a startup to a big co, what should I be aware of ..."
7:35,"Visited Large company? Literally every company I ever worked for, the ..."
"""

In [36]:
def build_classification_prompt(search_history, trait_markers):
    """Build the OCEAN classification prompt for one search-history chunk.

    Taking the chunk as a parameter lets the same instructions be reused for
    any chunk instead of copy-pasting the prompt text.

    Args:
        search_history: Text of the search history to classify.
        trait_markers: Markers describing each trait; interpolated into the
            prompt through its ``str()`` form.

    Returns:
        The complete prompt string, including the expected JSON output format.
    """
    # The literal starts at column 0 on purpose: any indentation here would
    # become part of the text sent to the model.
    return f"""
Please analyze the given search history and classify the associated OCEAN personality traits \
(Openness, Conscientiousness, Extraversion, Agreeableness, Neuroticism) based on predefined markers. 

Assign each trait a level: high, medium, low, or none.

Search History:
{search_history}

Markers for OCEAN Traits:
{trait_markers}

Your task is to evaluate the search history against these markers and classify the level of each \
OCEAN trait. 

Format your response as a JSON object, using the trait names as keys and the assigned levels as values. 

Expected JSON Output Format:
{{
    "openness": "high/medium/low/none",
    "conscientiousness": "high/medium/low/none",
    "extraversion": "high/medium/low/none",
    "agreeableness": "high/medium/low/none",
    "neuroticism": "high/medium/low/none"
}}

Note:
- Replace "high/medium/low/none" with the appropriate level based on your analysis.
- Ensure that the response strictly adheres to the JSON format specified.
"""


# Prompt for the single test chunk; same text as before, now built by the helper.
prompt = build_classification_prompt(chunk_test_search_history, markers)

In [21]:
# Label the test chunk with Mistral 7B. `response` first holds the raw model
# answer and is then rebound to the parsed JSON object, which the cell displays.
# NOTE(review): the recorded output below spells the level as 'None' (capital N),
# so downstream comparisons should not assume lowercase levels.
response = ask_mistral_7b(prompt=prompt)
response = extract_json(response)
response

{'openness': 'None',
 'conscientiousness': 'None',
 'extraversion': 'None',
 'agreeableness': 'None',
 'neuroticism': 'None'}

In [23]:
# Label the same test chunk with Llama2 7B. `response` first holds the raw model
# answer and is then rebound to the parsed JSON object, which the cell displays.
response = ask_llama2_7b(prompt=prompt)
response = extract_json(response)
response

{'openness': 'medium',
 'conscientiousness': 'medium',
 'extraversion': 'high',
 'agreeableness': 'high',
 'neuroticism': 'low'}

## Classify mocked data 

This is done by using a non-openAI based model

**Note: Models hosted on HuggingFace Spaces can be shut down at any time.**

In [84]:
# Mocked search-history chunks, one multi-line string per chunk. The position
# of a chunk in this list is the index used as key in labeled_data.
# Each "#" label comment below describes the chunk that FOLLOWS it.
mocked_data_chunks = [
# same text as chunk_test_search_history (no expected label was written for it)
"""
7:26,Searched for how to stay motivated big corp reddit
7:27,Searched for site:news.ycombinator.com large company
7:27,Visited Think before you join any large company. Nothing here is unique to ...
7:28,Visited Why do giant companies do so many hilariously dumb things? As ...
7:29,Visited Ask HN: Startup acquired by a large company and it sucks. What to ...
7:30,"Visited Ask HN: Moving from a startup to a big co, what should I be aware of ..."
7:35,"Visited Large company? Literally every company I ever worked for, the ..."
""",
# high agreeableness and high neuroticism
"""
8:15, Searched for local charity events near me
8:17, Searched for how to deal with feeling overwhelmed at work
8:19, Visited forum post on tips for better understanding others' feelings
8:20, Read article on managing anxiety and stress in personal relationships
8:23, Searched for ways to help a friend in need
8:25, Visited blog on coping with fears of failure and rejection
""",
# high agreeableness and high neuroticism
"""
9:10, Searched for how to volunteer for local homeless shelters
9:12, Searched for articles on overcoming personal insecurities
9:14, Visited a page on empathetic communication skills
9:16, Read about techniques for managing social anxiety
9:18, Searched for upcoming community service events
9:20, Visited a support group forum for dealing with chronic worry
""",
# high conscientiousness, low everything else
"""
10:05, Searched for best time management tools for professionals
10:07, Searched for efficient filing systems for office use
10:09, Visited article on optimizing daily routines for productivity
10:11, Read tips on maintaining focus in a busy work environment
10:14, Searched for courses on advanced project management techniques
10:17, Visited blog on personal discipline and self-improvement in the workplace
"""
]

In [85]:
# Prompt template used to label one chunk. It is written at column 0, outside
# the loop, so the model does not receive four extra leading spaces on every
# instruction line (which is what happens when the literal is indented with
# the loop body). Doubled braces are literal braces for str.format.
CHUNK_PROMPT_TEMPLATE = """
Please analyze the given search history and classify the associated OCEAN personality traits \
(Openness, Conscientiousness, Extraversion, Agreeableness, Neuroticism) based on predefined markers.

Assign each trait a level: high, medium, low, or none.

Search History:
{chunk}

Markers for OCEAN Traits:
{markers}

Your task is to evaluate the search history against these markers and classify the level of each \
OCEAN trait.

Format your response as a JSON object, using the trait names as keys and the assigned levels as values.

Expected JSON Output Format:
{{
    "openness": "high/medium/low/none",
    "conscientiousness": "high/medium/low/none",
    "extraversion": "high/medium/low/none",
    "agreeableness": "high/medium/low/none",
    "neuroticism": "high/medium/low/none"
}}

Note:
- Replace "high/medium/low/none" with the appropriate level based on your analysis.
- Ensure that the response strictly adheres to the JSON format specified.
"""

# labeled_data maps chunk index -> {trait: level}; unparsed_answers keeps the
# raw model answer of every chunk whose reply did not yield a JSON object.
labeled_data = {}
unparsed_answers = {}
for idx, chunk in enumerate(mocked_data_chunks):
    prompt = CHUNK_PROMPT_TEMPLATE.format(chunk=chunk, markers=markers)
    answer = ask_llama2_7b(prompt=prompt)
    response = extract_json(answer)
    if not isinstance(response, dict):
        # Storing a non-dict here would only fail later, far from the cause,
        # when the labels are looked up by trait name.
        unparsed_answers[idx] = answer
        continue
    # Models do not always respect the requested casing (e.g. 'None'), so
    # levels are normalized to stripped lowercase strings.
    labeled_data[idx] = {
        trait: str(level).strip().lower() for trait, level in response.items()
    }

if unparsed_answers:
    print(f"No JSON object could be extracted for chunks: {sorted(unparsed_answers)}")

In [86]:
# Display the labels returned by the model for each chunk index.
# NOTE(review): the recorded output below shows identical labels for all four
# chunks even though their content differs -- verify the model output before
# relying on it.
labeled_data

{0: {'openness': 'high',
  'conscientiousness': 'medium',
  'extraversion': 'medium',
  'agreeableness': 'medium',
  'neuroticism': 'low'},
 1: {'openness': 'high',
  'conscientiousness': 'medium',
  'extraversion': 'medium',
  'agreeableness': 'medium',
  'neuroticism': 'low'},
 2: {'openness': 'high',
  'conscientiousness': 'medium',
  'extraversion': 'medium',
  'agreeableness': 'medium',
  'neuroticism': 'low'},
 3: {'openness': 'high',
  'conscientiousness': 'medium',
  'extraversion': 'medium',
  'agreeableness': 'medium',
  'neuroticism': 'low'}}

## Score traits according to the mocked data

In the case of search history, we want to score only **openness, neuroticism, and conscientiousness**. For each of these traits, we score only the chunks where the trait was labeled as high.

In [65]:
def get_chunks(labeled_data, selected_traits, source_chunks=None):
    """Collect, for each selected trait, the chunks labeled "high" for it.

    Args:
        labeled_data: Mapping of chunk index -> {trait: level}.
        selected_traits: Trait names to collect chunks for.
        source_chunks: Sequence the chunk indexes refer to. Defaults to the
            notebook-level ``mocked_data_chunks`` so existing two-argument
            calls keep working.

    Returns:
        Dict mapping every selected trait to the list of chunks whose label
        for that trait is "high" (empty list when there is none).
    """
    if source_chunks is None:
        source_chunks = mocked_data_chunks

    chunks = {trait: [] for trait in selected_traits}

    for idx, traits in labeled_data.items():
        if not isinstance(traits, dict):
            # A model answer that could not be parsed into an object carries
            # no labels; skip it instead of failing on the lookup below.
            continue
        for trait in selected_traits:
            # Missing traits count as "not high"; casing and surrounding
            # whitespace are ignored because models do not always return the
            # level exactly as requested.
            level = str(traits.get(trait, "")).strip().lower()
            if level == "high":
                chunks[trait].append(source_chunks[idx])
    return chunks

In [87]:
# Traits that are scored from search history.
selected_traits = ["openness", "neuroticism", "conscientiousness"]

# Hand-written labels, keyed by chunk index. Assigning them here replaces
# whatever labels the model produced for `labeled_data` earlier.
labeled_data = {
    0: dict(
        openness="medium",
        conscientiousness="medium",
        extraversion="low",
        agreeableness="low",
        neuroticism="medium",
    ),
    1: dict(
        openness="medium",
        conscientiousness="medium",
        extraversion="low",
        agreeableness="high",
        neuroticism="high",
    ),
    2: dict(
        openness="medium",
        conscientiousness="medium",
        extraversion="low",
        agreeableness="high",
        neuroticism="high",
    ),
    3: dict(
        openness="low",
        conscientiousness="high",
        extraversion="low",
        agreeableness="none",
        neuroticism="low",
    ),
}

# For every selected trait, gather the chunks labeled "high" for it.
selected_chunks = get_chunks(labeled_data, selected_traits)
selected_chunks

{'openness': [],
 'neuroticism': ["\n8:15, Searched for local charity events near me\n8:17, Searched for how to deal with feeling overwhelmed at work\n8:19, Visited forum post on tips for better understanding others' feelings\n8:20, Read article on managing anxiety and stress in personal relationships\n8:23, Searched for ways to help a friend in need\n8:25, Visited blog on coping with fears of failure and rejection\n",
  '\n9:10, Searched for how to volunteer for local homeless shelters\n9:12, Searched for articles on overcoming personal insecurities\n9:14, Visited a page on empathetic communication skills\n9:16, Read about techniques for managing social anxiety\n9:18, Searched for upcoming community service events\n9:20, Visited a support group forum for dealing with chronic worry\n'],
 'conscientiousness': ['\n10:05, Searched for best time management tools for professionals\n10:07, Searched for efficient filing systems for office use\n10:09, Visited article on optimizing daily routin

In [89]:
# The key is never written into the notebook. An OPENAI_API_KEY already
# present in the environment is kept as is (assigning "" here would wipe it);
# it is only asked for, without echo, when missing or empty.
# `os` and `getpass` come from the notebook's import cell.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


# Prompt asking the model for a single 0.0-1.0 score of one trait in one chunk.
SCORE_TEMPLATE = """
Please assign a score between 0.0 and 1.0 to represent the level of the trait {trait} in the following text. 
A score of 0 indicates the complete absence of the trait, while a score of 1 indicates the highest expression of the trait.

Output your answer as a single number without giving any explanation.

Text: {chunk}
"""


score_prompt = PromptTemplate(
    input_variables=["trait", "chunk"],
    template=SCORE_TEMPLATE,
)

# temperature=0 keeps the scores as repeatable as the API allows; with the
# default sampling temperature the same chunk can get a different score on
# every run.
chain = LLMChain(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    prompt=score_prompt
)


def _parse_score(reply):
    """Turn a model reply into a float, tolerating text around the number."""
    text = str(reply).strip()
    try:
        return float(text)
    except ValueError:
        # The model does not always answer with the bare number ("Score: 0.8",
        # "0.8."), so fall back to the first numeric token in the reply.
        match = re.search(r"[-+]?(?:\d+\.?\d*|\.\d+)", text)
        if match is None:
            raise ValueError(f"No numeric score found in model reply: {reply!r}") from None
        return float(match.group())


def score_trait(selected_chunks, scorer=None):
    """
    Score every trait from the chunks selected for it.

    If a trait does not have chunks (because none of them were labeled as high) we score the trait with a 0.
    Otherwise, we score the trait with the average score of its chunks.

    Args:
        selected_chunks: Dict mapping trait name -> list of chunks.
        scorer: Optional callable ``scorer(trait, chunk)`` returning the
            model reply for one chunk. Defaults to running the notebook-level
            ``chain`` with the trait and the chunk.

    Returns:
        Dict mapping trait name -> {"n_chunks", "scores", "average_score"}.

    Raises:
        ValueError: If a reply contains no number at all.
    """
    if scorer is None:
        def scorer(trait, chunk):
            return chain.run({"trait": trait, "chunk": chunk})

    scored_traits = {}
    for trait, chunks in selected_chunks.items():
        scores = [_parse_score(scorer(trait, chunk)) for chunk in chunks]
        # The average is computed once, after all the chunks were scored.
        average = sum(scores) / len(scores) if scores else 0
        scored_traits[trait] = {
            "n_chunks": len(chunks),
            "scores": scores,
            "average_score": average
        }
    return scored_traits

In [90]:
# Score each selected trait from its chunks; traits without chunks get 0.
score_output = score_trait(selected_chunks)

In [91]:
# Display the per-trait summary: number of chunks, individual scores and average.
score_output

{'openness': {'n_chunks': 0, 'scores': [], 'average_score': 0},
 'neuroticism': {'n_chunks': 2, 'scores': [0.8, 0.6], 'average_score': 0.7},
 'conscientiousness': {'n_chunks': 1, 'scores': [0.8], 'average_score': 0.8}}

In [96]:
# Print one line per trait; \033[1m / \033[0m switch bold on and off in the terminal output.
for trait, result in score_output.items():
    print(
        "The trait:\033[1m {} \033[0m has a score of: \033[1m{}\033[0m".format(
            trait, result["average_score"]
        )
    )

The trait:[1m openness [0m has a score of: [1m0[0m
The trait:[1m neuroticism [0m has a score of: [1m0.7[0m
The trait:[1m conscientiousness [0m has a score of: [1m0.8[0m
