In [1]:
!pip install strands-agents[mistral] python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.13.0-py3-none-any.whl.metadata (13 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.19.0-py3-none-any.whl.metadata (85 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.59b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_sdk-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mistralai>=1.8.2 (from strands-agent

In [12]:
import json
import os
import sys
import dotenv
import boto3
import requests

import pprint

import yaml
from pathlib import Path
from typing import Dict, List, Optional, Tuple, TypeVar
from tqdm import tqdm

from collections import Counter

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

# AWS authentication
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest

sys.path.append('..')

from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
	chat_with_persona,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

from src.my_utils import (
    display_markdown_file,
    call_mistral,
    get_agent,
    batch_extract,
    compute_stat_for_multi_items
)

from src.models.persona_info import PersonaInfo, InitialPersonaInfo
from src.models.activity_domain_info import ActivityDomainInfo, ListOfActivityDomains
from src.models.skill_domain_info import SkillDomainInfo, ListSkillsDomains
from src.prompts.persona_extraction_prompt import (
    PERSONA_EXTRACTION_PROMPT,
    PERSONA_INITIAL_EXTRACTION_PROMPT,
    PERSONA_EXTEND_SKILL_DOMAIN_PROMPT,
    PERSONA_SKILL_DOMAINS_CLASSIFICATION_PROMPT,
    PERSONA_ACTIVITY_DOMAINS_CLASSIFICATION_PROMPT
)

# Read environment variables (the Mistral API key among them) from the file at ../env
dotenv.load_dotenv("../env")

# Report up front whether the key is available, before any agent call is attempted
if os.getenv("MISTRAL_API_KEY"):
    print("‚úÖ API key found, we're ready to roll")
else:
    print("‚ùå No MISTRAL_API_KEY found!")
    print("Create an env file with your API key")

‚úÖ API key found, we're ready to roll


In [13]:
# All data folders live next to the notebook's parent directory.
PROJECT_ROOT = Path('..')

DATA_JOBS_DIR = PROJECT_ROOT / 'data_jobs'
DATA_TRAININGS_DIR = PROJECT_ROOT / 'data_trainings'
DATA_INTERVIEWS_DIR = PROJECT_ROOT / 'data_interviews'
DATA_ACTIVITIES_DOMAINS_DIR = PROJECT_ROOT / 'data_activities_domains'
DATA_SKILLS_DOMAINS_DIR = PROJECT_ROOT / 'data_skills_domains'
DATA_PERSONAS_INFO_DIR = PROJECT_ROOT / 'data_personas_info'

In [14]:
# Parse the project configuration; it holds one version tag per dataset.
with open("../src/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)


def _read_version(key):
    """Return config[key] and echo it, so the run log records which version was used."""
    value = config[key]
    print(f"{key} version: {value}")
    return value


job_data_version = _read_version("job_data_version")

training_data_version = _read_version("training_data_version")

interview_data_version = _read_version("interview_data_version")

activity_domains_version = _read_version("activity_domains_version")

personas_info_data_version = _read_version("personas_info_data_version")

skill_domains_version = _read_version("skill_domains_version")

job_data_version version: v4
training_data_version version: v7
interview_data_version version: v8
activity_domains_version version: v4
personas_info_data_version version: v14
skill_domains_version version: v3


In [15]:
# Read the jobs cluster map that matches the configured job data version
filename = f"map_clusters_jobs_{job_data_version}.json"
save_path = DATA_JOBS_DIR.joinpath(filename)
jobs_map = read_json(save_path)

In [16]:
# Read the final trainings cluster map (reported below as "skills domains")
filename = f"final_map_clusters_trainings_{training_data_version}.json"
save_path = DATA_TRAININGS_DIR.joinpath(filename)
trainings_map = read_json(save_path)

# Echo how many entries were loaded, followed by a separator rule
print(f"‚úÖ Loaded {len(trainings_map)} skills domains")
print("\n" + "="*50)

‚úÖ Loaded 12 skills domains



In [17]:
# Read the interviews file for the configured interview data version
filename = f"interviews_{interview_data_version}.json"
interviews_save_path = DATA_INTERVIEWS_DIR.joinpath(filename)
interviews = read_json(interviews_save_path)

# Persona info parsing loop

In [18]:
def extract_initial_persona_info(
    conversation: List[str],
    model: str = "mistral-small-latest",
    print_prompt: bool = False
) -> PersonaInfo:
    """Extract the initial persona profile from an interview conversation.

    The conversation turns are joined with newlines and substituted into
    PERSONA_INITIAL_EXTRACTION_PROMPT. The prompt is sent to an agent obtained
    from ``get_agent`` (temperature 0.0) with ``InitialPersonaInfo`` as the
    structured-output model, and the result is copied into a new ``PersonaInfo``.

    Args:
        conversation: Interview turns, one string per turn.
        model: Model id forwarded to ``get_agent`` and to cost tracking.
        print_prompt: When truthy, print the fully formatted prompt before the call.

    Returns:
        A ``PersonaInfo`` whose ``name``, ``age``, ``location``, ``goals`` and
        ``recommendation_type`` are set here; every other field keeps whatever
        ``PersonaInfo()`` initialises it to. ``recommendation_type`` is
        ``'jobs_trainings'`` when the person is interested in a job, otherwise
        ``'trainings_only'`` when interested in training, otherwise ``'awareness'``.
    """
    prompt = PERSONA_INITIAL_EXTRACTION_PROMPT.format(
        conversation='\n'.join(conversation)
    )

    if print_prompt:
        print(prompt)

    extraction_agent = get_agent(model_id=model, temperature=0.0)
    result = extraction_agent.structured_output(output_model=InitialPersonaInfo, prompt=prompt)

    persona_info = PersonaInfo()
    persona_info.name = result.name
    persona_info.age = result.age
    persona_info.location = result.location
    persona_info.goals = result.goals

    # Job interest takes precedence over training interest; with neither flag set
    # the persona only receives awareness content.
    if result.interested_by_job is True:
        persona_info.recommendation_type = 'jobs_trainings'
    elif result.interested_by_training is True:
        persona_info.recommendation_type = 'trainings_only'
    else:
        persona_info.recommendation_type = 'awareness'

    # Track cost only when the agent object exposes the last response
    if hasattr(extraction_agent, 'last_response'):
        track_api_call(extraction_agent.last_response, model)

    return persona_info

In [9]:
MAX_LOOPS = 1
cache_period = 5  # checkpoint personas_data to disk after every `cache_period` new extractions

# Prepare personas info: resume from the existing results file, creating an empty one on first run
filename = f"personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
if not personas_save_path.exists():
    save_json(personas_save_path, {})
personas_data = read_json(personas_save_path)

new_items_processed = 0

print("START CLASSIFICATION LOOP")
try:
    for i in range(MAX_LOOPS):
        print("ITERATE CLASSIFICATION LOOP")

        # Only interviews without a stored result still need extraction,
        # so a rerun picks up where the previous run stopped.
        filt_personas_ids = [
            candidate_id for candidate_id in interviews
            if candidate_id not in personas_data
        ]

        print(f'Personas to process: {len(filt_personas_ids)}')

        if len(filt_personas_ids) == 0:
            break

        # Track how many new personas we process in this pass
        new_personas_processed = 0

        for persona_id in tqdm(filt_personas_ids):
            # Every id comes from iterating `interviews`, so the lookup cannot miss.
            new_personas_processed += 1
            conversation = interviews[persona_id]['interview']

            # Echo the prompt once per pass (first persona only) to keep the output readable
            print_prompt = new_personas_processed == 1

            # Extract initial persona information
            persona_info = extract_initial_persona_info(
                conversation,
                model="mistral-small-latest",
                print_prompt=print_prompt
            )
            personas_data[persona_id] = persona_info.model_dump_json()

            # Periodic checkpoint, driven by cache_period instead of a duplicated literal
            if new_personas_processed % cache_period == 0:
                save_json(personas_save_path, personas_data)
finally:
    # Runs on normal completion and when an extraction raises, so results gathered
    # since the last checkpoint are not lost.
    save_json(personas_save_path, personas_data)

# Convert to PersonaInfo objects
# personas = {
#     pid: PersonaInfo.model_validate_json(data)
#     for pid, data in persona_infos.items()
# }

# print(f"\n‚úÖ Interviewed {len(personas)} personas total ({new_personas_processed} new)")


START CLASSIFICATION LOOP
ITERATE CLASSIFICATION LOOP
Personas to process: 100


  0%|          | 0/100 [00:00<?, ?it/s]

From the interview conversation, extract the following fields and return a structured profile with these fields:
- Age: The person's age as a number.
    - if age is not given but you know user has below 16, set age = 10
    - if age is not given but you know user has at least 16 or above, set age = 20
- Location: The city where the person lives
- Interested by job: True if the person expresses interest in employment, even if they are undecided or considering other options.
- Interested by training: True if Person is interested to find a training. Even if not completely sure
- Goals: Their stated career or learning objectives.
    - Make sure to capture all meaningful informations that will help understand acticity domain or skills domain in relatiob to candidate profile or goal.

If the person is exploring career paths like comparing job vs training, consider this as interest in job and training

For any information not explicitly mentioned:
- If they're just seeking information rathe

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [03:39<00:00,  2.20s/it]


# Patch personas with age under 16

In [10]:
# Reload the extracted personas from disk as the input of the age patch
filename = f"personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR.joinpath(filename)
initial_personas_data = read_json(personas_save_path)

In [11]:
# Reset the age-filtered results file to an empty mapping, then read it back as the dict to fill
filename = f"aging_filter_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR.joinpath(filename)
save_json(personas_save_path, {})
personas_data = read_json(personas_save_path)

for persona_id, raw_persona in initial_personas_data.items():
    # Each stored value is a JSON string; decode, patch, and re-encode it
    persona_record = json.loads(raw_persona)

    # Personas younger than 16 are forced to the 'awareness' recommendation type.
    # NOTE(review): a record with age 0 also satisfies this test, and the statistics
    # cell treats 0 as an invalid age. Confirm that downgrading those is intended.
    if persona_record['age'] < 16:
        persona_record['recommendation_type'] = 'awareness'

    personas_data[persona_id] = json.dumps(persona_record, ensure_ascii=False)

save_json(personas_save_path, personas_data)

---

# For Debug Only

---

# Redo persona

In [21]:
# Re-run the extraction for a single persona, echoing the prompt for inspection
persona_id = 'persona_062'
conversation = interviews[persona_id]['interview']

persona_info = extract_initial_persona_info(
    conversation, model="mistral-small-latest", print_prompt=True
)

print(persona_info)
# personas_data[persona_id] = persona_info.model_dump_json()

From the interview conversation, extract the following fields and return a structured profile with these fields:
- Age: The person's age as a number.
    - if age is not given but you know user has below 16, set age = 10
    - if age is not given but you know user has at least 16 or above, set age = 20
- Location: The city where the person lives
- Interested by job: True if the person expresses interest in employment, even if they are undecided or considering other options.
- Interested by training: True if Person is interested to find a training. Even if not completely sure
- Goals: Their stated career or learning objectives.
    - Make sure to capture all meaningful informations that will help understand acticity domain or skills domain in relatiob to candidate profile or goal.

If the person is exploring career paths like comparing job vs training, consider this as interest in job and training

For any information not explicitly mentioned:
- If they're just seeking information rathe

In [9]:
# Write the re-extracted persona (persona_id / persona_info from the redo cell) into the stored results
filename = f"personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR.joinpath(filename)
personas_data = read_json(personas_save_path)
personas_data[persona_id] = persona_info.model_dump_json()
save_json(personas_save_path, personas_data)

# Debug

In [None]:
if False:
    # Disabled scratch cell: flip the condition to reload the interviews and
    # re-run the extraction for one persona with the prompt echoed.
    filename = f"interviews_{interview_data_version}.json"
    interviews_save_path = DATA_INTERVIEWS_DIR.joinpath(filename)
    interviews = read_json(interviews_save_path)

    persona_id = "persona_011"
    conversation = interviews[persona_id]['interview']
    print(conversation)

    persona_info = extract_initial_persona_info(
        conversation, model="mistral-small-latest", print_prompt=True
    )
    print(persona_info)

    # if(persona_info.recommendation_type != "awareness"):


In [None]:
if False:
    # Disabled scratch cell. NOTE(review): this repeats the preceding debug cell
    # (same persona id, same calls); one of the two can be removed.
    filename = f"interviews_{interview_data_version}.json"
    interviews_save_path = DATA_INTERVIEWS_DIR.joinpath(filename)
    interviews = read_json(interviews_save_path)

    persona_id = "persona_011"
    conversation = interviews[persona_id]['interview']
    print(conversation)

    persona_info = extract_initial_persona_info(
        conversation, model="mistral-small-latest", print_prompt=True
    )
    print(persona_info)

    # if(persona_info.recommendation_type != "awareness"):


# Persona info statistics

In [28]:
# Read the age-filtered personas file that the statistics below are computed from
filename = f"aging_filter_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR.joinpath(filename)
persona_infos = read_json(personas_save_path)

In [29]:
# Decode every stored persona once; each value in persona_infos is a JSON string.
persona_records = [json.loads(raw_persona) for raw_persona in persona_infos.values()]

# 1. Age distribution, skipping non-positive ages (0 marks an invalid age)
ages = [record['age'] for record in persona_records if record['age'] > 0]
age_counts = Counter(ages)
# Print in ascending order of age
for age in sorted(age_counts):
    print(f"Age {age}: {age_counts[age]} occurrences")

# 2. Recommendation types and locations, taken from all records (no age filter)
recommendation_types = [record['recommendation_type'] for record in persona_records]
locations = [record['location'] for record in persona_records]

type_counts = Counter(recommendation_types)
print("\nRecommendation Type Counts:")
for type_name, count in type_counts.most_common():
    print(f"{type_name}: {count}")
print(f"\nTotal count: {sum(type_counts.values())}")

# Separate counter for locations, so type_counts keeps meaning "recommendation types"
location_counts = Counter(locations)
print("\nLocation Counts:")
for location, count in location_counts.most_common():
    print(f"{location}: {count}")
print(f"\nTotal count: {sum(location_counts.values())}")

Age 13: 2 occurrences
Age 14: 1 occurrences
Age 15: 5 occurrences
Age 16: 10 occurrences
Age 17: 3 occurrences
Age 18: 12 occurrences
Age 19: 7 occurrences
Age 20: 3 occurrences
Age 21: 7 occurrences
Age 22: 7 occurrences
Age 23: 4 occurrences
Age 24: 4 occurrences
Age 25: 6 occurrences
Age 26: 8 occurrences
Age 27: 9 occurrences
Age 28: 7 occurrences
Age 30: 2 occurrences

Recommendation Type Counts:
jobs_trainings: 61
trainings_only: 21
awareness: 18

Total count: 100

Location Counts:
São Paulo: 12
Belo Horizonte: 12
Recife: 11
Brasília: 11
Curitiba: 10
Salvador: 8
Porto Alegre: 8
Rio: 6
: 6
Rio de Janeiro: 6
Fortaleza: 4
Brazil: 3
Fortaleza, Brazil: 1
Recife, Brazil: 1
Salvador, Brazil: 1

Total count: 100


In [19]:
# Correct Persona Info Extraction
# Re-runs the extraction for one persona and overwrites its entry in the
# personas_info_<version>.json file (not the age-filtered file).
# NOTE(review): extract_persona_info, activities_domains and skills_domains are
# neither defined nor imported anywhere in the visible notebook. On a fresh kernel
# this cell presumably raises NameError — confirm where they are meant to come from.
print("üíº Testing Persona information Extraction Agent...")
print("Reading a sample job file...\n")

# Reload the interviews for the configured interview data version
filename = f"interviews_{interview_data_version}.json"
interviews_save_path = DATA_INTERVIEWS_DIR / filename
interviews = read_json(interviews_save_path)

# Reload the stored personas that will receive the corrected entry
filename = f"personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
personas_data = read_json(personas_save_path)

# Persona whose stored info is re-extracted
persona_id = "persona_015"

if persona_id in interviews:
    # NOTE(review): other cells pass interviews[persona_id]['interview']; here the
    # whole interview entry is passed. Verify which one extract_persona_info expects.
    conversation = interviews[persona_id]

    # Extract, store as a JSON string, and write the file back immediately
    persona_info = extract_persona_info(activities_domains, skills_domains, conversation)
    personas_data[persona_id] = persona_info.model_dump_json()
    save_json(personas_save_path, personas_data)

    print(persona_info)
    #persona_infos[persona_id] = persona_info.model_dump_json()
    
    # persona_info = extract_persona_info(activities_domains, skills_domains, conversation)
   

üíº Testing Persona information Extraction Agent...
Reading a sample job file...

name='Camila' age=22 location='Fortaleza' recommendation_type='trainings_only' open_to_relocate_for_work=False work_type_preference='onsite' target_domains=['UNKNOWN'] education_level='T√©cnico' years_of_experience=0 skills_domains=['UNKNOWN'] skills={} languages={'Portuguese', 'English'} goals='Learn all basics of live event production: lights, sound, stage setup.'
