In [1]:
!pip install strands-agents[mistral] python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.13.0-py3-none-any.whl.metadata (13 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.19.0-py3-none-any.whl.metadata (85 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.59b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_sdk-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mistralai>=1.8.2 (from strands-agent

In [1]:
import json
import os
import sys
import dotenv
import boto3
import requests

import pprint

import yaml
from pathlib import Path
from typing import Dict, List, Optional, Tuple, TypeVar
from tqdm import tqdm

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

# AWS authentication
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest

sys.path.append('..')

from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
	chat_with_persona,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

from src.my_utils import (
    display_markdown_file,
    call_mistral,
    get_agent,
    batch_extract,
    send_message_to_chat
)

from src.models.persona_info import PersonaInfo, PersonaSkillsInterest
from src.models.generic_models import ListOfStrs

from src.models.interview_info import(
    InterviewInfo,
    InverviewQualityInfo,
    InterviewAgentMessage
)
from src.prompts.interview_prompt import(
    TRAINING_SKILLS_EXTENSION_INTERVIEW_PROMPT,
    TRAINING_SKILLS_INTERVIEW_QUALITY_CHECK_PROMPT
)
from src.prompts.persona_extraction_prompt import(
    PERSONA_SKILL_DOMAINS_CLASSIFICATION_PROMPT,
    PERSONA_SKILL_DOMAINS_CLASSIFICATION_PROMPT_ALT
)

# Pull environment variables (including the Mistral API key) from ../env
dotenv.load_dotenv("../env")

# Tell the user up front whether model calls can work at all
api_key_present = bool(os.getenv("MISTRAL_API_KEY"))
if api_key_present:
    print("‚úÖ API key found, we're ready to roll")
else:
    print("‚ùå No MISTRAL_API_KEY found!")
    print("Create an env file with your API key")

‚úÖ API key found, we're ready to roll


In [2]:
# Notebook-level setting: intended maximum number of turns in one interview
MAX_TURNS_IN_INTERVIEW = 10

In [3]:
# Data folders, all located one level above the notebook directory
(
    DATA_PERSONAS_INFO_DIR,
    DATA_JOBS_DIR,
    DATA_TRAININGS_DIR,
    DATA_INTERVIEWS_DIR,
) = (
    Path('..') / folder
    for folder in (
        'data_personas_info',
        'data_jobs',
        'data_trainings',
        'data_interviews',
    )
)

In [4]:
# Read the dataset version tags from the shared project config.
# The encoding is pinned because the default text encoding of open() depends
# on the platform locale, which could otherwise change how the file is decoded.
with open("../src/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Each version tag is interpolated into the data file names read by later cells.
personas_info_data_version = config["personas_info_data_version"]
print(f"personas_info_data_version version: {personas_info_data_version}")

job_data_version = config["job_data_version"]
print(f"job_data_version version: {job_data_version}")

training_data_version = config["training_data_version"]
print(f"training_data_version version: {training_data_version}")

interview_data_version = config["interview_data_version"]
print(f"interview_data_version version: {interview_data_version}")

personas_info_data_version version: v14
job_data_version version: v4
training_data_version version: v7
interview_data_version version: v8


In [5]:
# Read the jobs map JSON for the configured job data version
filename = f"map_clusters_jobs_{job_data_version}.json"
save_path = DATA_JOBS_DIR.joinpath(filename)
jobs_map = read_json(save_path)

In [6]:
# Read the trainings map (keyed by skills domain) for the configured version
filename = f"final_map_clusters_trainings_{training_data_version}.json"
save_path = DATA_TRAININGS_DIR.joinpath(filename)
trainings_map = read_json(save_path)

# Report how many domains were found, followed by a separator line
print(f"‚úÖ Loaded {len(trainings_map)} skills domains")
print("\n" + "=" * 50)

‚úÖ Loaded 12 skills domains



In [7]:
# Read the serialized personas for the configured personas data version
filename = f"final_with_jobs_trainings_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR.joinpath(filename)

initial_personas_data = read_json(personas_save_path)

# Each value is a JSON string; validate it into a PersonaInfo object
personas = dict(
    (pid, PersonaInfo.model_validate_json(raw_json))
    for pid, raw_json in initial_personas_data.items()
)

print(f"‚úÖ Loaded {len(personas)} personas")
print("\n" + "=" * 50)

‚úÖ Loaded 100 personas



In [8]:
# Read the trainings map (keyed by skills domain) for the configured version
filename = f"final_map_clusters_trainings_{training_data_version}.json"
save_path = DATA_TRAININGS_DIR.joinpath(filename)
trainings_map = read_json(save_path)

# Same content indexed by lower-cased domain name, for case-insensitive lookups
trainings_map_lower = dict(
    (domain_name.lower(), entry) for domain_name, entry in trainings_map.items()
)

print(f"‚úÖ Loaded {len(trainings_map)} skills domains")
print("\n" + "=" * 50)

‚úÖ Loaded 12 skills domains



In [9]:
# Read the training-domain extension interviews for the configured version
filename = f"training_domain_extension_interviews_{interview_data_version}.json"
interviews_save_path = DATA_INTERVIEWS_DIR.joinpath(filename)
initial_interviews = read_json(interviews_save_path)

# Extract skill domains

In [50]:
def extract_skill_domain_info(
    formatted_domains,
    interview,
    model: str = "mistral-small-latest",
    print_prompt: bool = False
) -> PersonaSkillsInterest:
    """Classify a persona's skill-domain interest from an interview transcript.

    Args:
        formatted_domains: Text listing the existing skill domains; inserted
            into the classification prompt as-is.
        interview: Iterable of transcript lines, joined with newlines before
            being inserted into the prompt.
        model: Model id handed to ``get_agent``.
        print_prompt: When exactly ``True``, the rendered prompt is printed.

    Returns:
        The ``PersonaSkillsInterest`` object produced by the agent's
        structured output.
    """
    prompt = PERSONA_SKILL_DOMAINS_CLASSIFICATION_PROMPT.format(
        formatted_domains=formatted_domains,
        conversation='\n'.join(interview),
    )

    if print_prompt is True:
        print(prompt)

    agent = get_agent(model_id=model, temperature=0.0)
    skills_interest = agent.structured_output(
        output_model=PersonaSkillsInterest,
        prompt=prompt,
    )

    # Cost tracking only happens when the agent exposes its last response
    if hasattr(agent, 'last_response'):
        track_api_call(agent.last_response, model)

    return skills_interest

In [51]:
# Manual smoke test of the extraction on one persona; flip the guard to True
# to run it.
if False:
    person_id = 'persona_001'

    # One "- <domain>" line per known skills domain
    domains_str = "".join(f"- {domain}\n" for domain in trainings_map)

    initial_interview = initial_interviews[person_id]['interview']

    result = extract_skill_domain_info(
        domains_str,
        initial_interview,
        print_prompt=True
    )

    print(result)
    # The classification cells read the domains from `list_of_skills`;
    # `list_of_strs` is not an attribute they use on this result type.
    print(result.list_of_skills)
    

In [52]:
# Classify, for every "trainings_only" persona, which skill domains the
# interview shows interest in. Results are cached in a JSON file so the cell
# can be re-run without repeating model calls that already finished.
filename = f"training_domain_classified_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
if not personas_save_path.exists():
    save_json(personas_save_path, {})
personas_data = read_json(personas_save_path)

# Flush the cache to disk every `cache_period` newly classified personas
cache_period = 5

# The domain list is the same for every persona, so build it once
domains_str = "".join(f"- {domain}\n" for domain in trainings_map)

new_personas_processed = 0
for person_id in tqdm(initial_personas_data):
    # Parsed once; this fresh dict is also the one updated and re-serialized below
    persona_data_dict = json.loads(initial_personas_data[person_id])

    # Other recommendation types are copied through unchanged
    if persona_data_dict['recommendation_type'] != 'trainings_only':
        personas_data[person_id] = initial_personas_data[person_id]
        continue

    # Already present in the cache from a previous run
    if person_id in personas_data:
        continue

    # No interview to classify: keep the persona as it was
    if person_id not in initial_interviews:
        personas_data[person_id] = initial_personas_data[person_id]
        continue

    new_personas_processed += 1

    result = extract_skill_domain_info(
        domains_str,
        initial_interviews[person_id]['interview'],
        model="mistral-medium-latest",
        print_prompt=False
    )

    if result.interested_by_training is False:
        # Not interested in any training: switch the persona to "awareness"
        print(person_id)
        print('awareness')
        persona_data_dict['recommendation_type'] = 'awareness'
    else:
        persona_data_dict['skills_domains'] = result.list_of_skills

    personas_data[person_id] = json.dumps(persona_data_dict, ensure_ascii=False)

    if new_personas_processed % cache_period == 0:
        save_json(personas_save_path, personas_data)

# Final flush for results since the last periodic save
save_json(personas_save_path, personas_data)



 30%|‚ñà‚ñà‚ñà       | 30/100 [00:21<00:43,  1.61it/s]

persona_030
awareness


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 62/100 [00:54<00:34,  1.10it/s]

persona_062
awareness


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 86/100 [00:59<00:04,  3.29it/s]

persona_086
awareness


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [01:09<00:00,  1.44it/s]


---

# For Debug Only

---

In [15]:
# Sanity check: every skills domain stored for a "trainings_only" persona must
# exist (case-insensitively) in the trainings map; offenders are printed.
filename = f"training_domain_classified_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
personas_data = read_json(personas_save_path)

for person_id in tqdm(personas_data):
    persona_data_dict = json.loads(personas_data[person_id])
    if persona_data_dict['recommendation_type'] != 'trainings_only':
        continue
    for domain in persona_data_dict['skills_domains']:
        if domain.lower() not in trainings_map_lower:
            print(f"{person_id} - {domain} not in trainings_map")


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 79754.78it/s]

FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT
FIT





# Redo skill domain extraction for a persona

In [40]:
# Re-run the skill-domain classification for one persona and overwrite its
# entry in the classified-personas file.
if True:
    person_id = 'persona_036'

    filename = f"training_domain_classified_personas_info_{personas_info_data_version}.json"
    personas_save_path = DATA_PERSONAS_INFO_DIR.joinpath(filename)
    personas_data = read_json(personas_save_path)

    initial_persona_data_dict = json.loads(initial_personas_data[person_id])
    print(initial_persona_data_dict)

    is_trainings_only = initial_persona_data_dict['recommendation_type'] == 'trainings_only'
    if not is_trainings_only:
        # Nothing to classify: restore the persona from the initial data
        print("Not training only")
        personas_data[person_id] = initial_personas_data[person_id]
        print(personas_data[person_id])
        save_json(personas_save_path, personas_data)
    elif person_id not in initial_interviews:
        # Without an interview the entry is only reported, not modified
        print("Interview missing")
    else:
        print("processing")
        domains_str = "".join(f"- {domain}" + "\n" for domain in trainings_map)

        conversation_id = initial_interviews[person_id]['conversation_id']
        initial_interview = initial_interviews[person_id]['interview']

        result = extract_skill_domain_info(
            domains_str,
            initial_interview,
            model="mistral-medium-latest",
            print_prompt=True
        )
        print(result)

        # Start again from the initial persona and apply the new classification
        persona_data_dict = json.loads(initial_personas_data[person_id])
        if result.interested_by_training is False:
            persona_data_dict['recommendation_type'] = 'awareness'
        else:
            persona_data_dict['skills_domains'] = result.list_of_skills

        print(persona_data_dict)

        personas_data[person_id] = json.dumps(persona_data_dict, ensure_ascii=False)

        save_json(personas_save_path, personas_data)

{'name': '', 'age': 16, 'location': 'Recife', 'recommendation_type': 'trainings_only', 'open_to_relocate_for_work': False, 'work_type_preference': '', 'target_domains': [], 'education_level': '', 'years_of_experience': 0, 'skills_domains': [], 'skills': {}, 'languages': [], 'goals': 'Understand career-related information, specifically about skills and what employers look for in the insurance industry.', 'hard_filtered_jobs_ids': [], 'proposed_job_ids': []}
processing

You are an expert in skill taxonomy, training analysis and candidate profile description analysis.

Your task is to evaluate whether the candidate is interesting by any of the existing skill domains based on the interview conversation.

# Existing Skill Domains:
- Financial Risk Management And Compliance
- Electrical And Electronic Systems Engineering
- Food Safety And Management
- Fiber And Paper Industry Operations
- Industrial Equipment Maintenance And Optimization
- Procurement And Supply Chain Management
- Hospitalit

# Skills interview

In [30]:
# Read the personas with their classified training domains
filename = f"training_domain_classified_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR.joinpath(filename)
personas_data = read_json(personas_save_path)

In [11]:
def conduct_persona_interview(
    persona_id: str,
    skills_str: str,
    confirm_age: bool = False,
    conversation_id: Optional[str] = None,
    max_turns: int = 5,
    model: str = "mistral-medium-latest",
    print_conversation: bool = False
) -> "InterviewInfo":
    """Interview a persona about a list of skills and return the transcript.

    The first assistant message asks which of the listed skills the persona is
    interested in and at what proficiency level. Each following turn sends the
    current assistant message to the persona chat, then asks the interview
    agent for the next question based on the transcript so far.

    Args:
        persona_id: Id of the persona to chat with.
        skills_str: Pre-formatted list of skills inserted into the opening
            question.
        confirm_age: Accepted for interface compatibility; not used by the body.
        conversation_id: Id of an existing conversation to continue, or
            ``None``; it is replaced by the id returned with each persona reply.
        max_turns: Maximum number of persona replies to collect.
        model: Model id handed to ``get_agent`` for the interview agent.
        print_conversation: When truthy, print each message as it is added.

    Returns:
        The ``InterviewInfo`` holding the transcript lines (prefixed with
        ``"Assistant: "`` / ``"User: "``) and the last conversation id.
    """
    interview = InterviewInfo()

    interview_agent = get_agent(TRAINING_SKILLS_EXTENSION_INTERVIEW_PROMPT, model_id=model)

    # Opening question: the skills list, then an explicit opt-out sentence
    agent_message = "From following list of skills, for which you are interested in and what is your current proficiency level (None, Basic, Intermediate, Advanced):\n"
    agent_message += skills_str
    # Keep the opt-out sentence on its own line even if the caller's list has
    # no trailing newline
    if skills_str and not skills_str.endswith("\n"):
        agent_message += "\n"
    agent_message += "If none of them or not interested by a training, just say it"

    if print_conversation:
        print(f"Assistant: {agent_message}")
    interview.interview.append(f"Assistant: {agent_message}")

    # Conduct interview
    for _ in range(max_turns):
        resp = send_message_to_chat(agent_message, persona_id, conversation_id)

        # No reply from the persona chat: stop with what has been collected
        if resp is None:
            break

        user_response, conversation_id = resp
        interview.conversation_id = conversation_id
        interview.interview.append(f"User: {user_response}")
        if print_conversation:
            print(f"User: {user_response}")

        # Generate the next question from the whole transcript so far
        conversation_str = '\n'.join(interview.interview)
        agent_response = interview_agent.structured_output(output_model=InterviewAgentMessage, prompt=conversation_str)

        if agent_response.conversation_finished is True:
            break

        agent_message = agent_response.message
        interview.interview.append(f"Assistant: {agent_message}")
        if print_conversation:
            print(f"Assistant: {agent_message}")

    return interview

In [18]:
# Interview all personas
#persona_ids = [f'persona_{i:03}' for i in range(1, 4)]
# Run the skills interview for every persona that has an initial interview,
# passing along the conversation id recorded with that initial interview.
# Finished interviews are stored in a JSON file and skipped on re-run.
filename = f"training_skills_extension_interviews_{interview_data_version}.json"
interviews_save_path = DATA_INTERVIEWS_DIR / filename
if not interviews_save_path.exists():
    save_json(interviews_save_path, {})
interviews = read_json(interviews_save_path)

# Track how many new personas we process
new_personas_processed = 0

for persona_id in tqdm(initial_interviews):

    # Already interviewed in a previous run
    if persona_id in interviews:
        continue

    conversation_id = initial_interviews[persona_id]['conversation_id']

    new_personas_processed += 1

    personas_data_dict = json.loads(personas_data[persona_id])
    skill_domains = personas_data_dict['skills_domains']

    # Domains are looked up case-insensitively, like the single-persona redo
    # cell and the debug check do, so a casing difference in the classified
    # domain name does not raise KeyError.
    skills_str = "".join(
        f"- {domain} : {skill}\n"
        for domain in skill_domains
        for skill in trainings_map_lower[domain.lower()]
    )

    # Interview
    conversation = conduct_persona_interview(
        persona_id,
        skills_str,
        conversation_id=conversation_id,
        max_turns=6,
        print_conversation=False)
    interviews[persona_id] = conversation.model_dump()

    # Save after every interview
    save_json(interviews_save_path, interviews)

save_json(interviews_save_path, interviews)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 29/29 [00:09<00:00,  3.20it/s]


# Redo persona interview

In [19]:
# Re-run the skills interview for a single persona and append the new turns to
# that persona's full interview file.
filename = f"training_domain_classified_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
personas_data = read_json(personas_save_path)

if True:
    persona_id = 'persona_025'

    # Continue the conversation stored in the persona's full interview file
    filename = f"full_interviews_{persona_id}.json"
    interview_save_path = DATA_INTERVIEWS_DIR / filename
    interview = read_json(interview_save_path)
    conversation_id = interview['conversation_id']

    personas_data_dict = json.loads(personas_data[persona_id])
    skill_domains = personas_data_dict['skills_domains']

    # One "- <domain> : <skill>" line per skill, with case-insensitive lookup
    skills_str = "".join(
        f"- {domain} : {skill}\n"
        for domain in skill_domains
        for skill in trainings_map_lower[domain.lower()]
    )
    if not skills_str:
        # The opening question would list no skills at all; make that visible
        print(f"WARNING: no skills found for {persona_id}; the interview question will contain an empty list")

    # Interview
    conversation = conduct_persona_interview(
        persona_id,
        skills_str,
        conversation_id=conversation_id,
        max_turns=6,
        print_conversation=True)

    # Append the new turns to the stored transcript and persist it
    new_interview = conversation.model_dump()
    interview['interview'].extend(new_interview['interview'])
    save_json(interview_save_path, interview)


Assistant: From following list of skills, for which you are interested in and what is your current proficiency level (None, Basic, Intermediate, Advanced):
If none of them or not interested by a training, just say it
User: I don‚Äôt know any of these yet, but I‚Äôd love to learn about machine operation or basic maintenance‚Äîmy level is none, but I‚Äôm really curious!


  agent_response = interview_agent.structured_output(output_model=InterviewAgentMessage, prompt=conversation_str)


# Check interview quality

In [10]:
def check_interview_quality(
    interview: str,
    model: str = "mistral-small-latest",
    print_prompt: bool = False
) -> InverviewQualityInfo:
    """Ask a model to assess the quality of a skills interview transcript.

    Args:
        interview: Transcript text inserted into the quality-check prompt.
        model: Model id handed to ``get_agent``.
        print_prompt: When exactly ``True``, the rendered prompt is printed.

    Returns:
        The ``InverviewQualityInfo`` produced by the agent's structured output.
    """
    quality_prompt = TRAINING_SKILLS_INTERVIEW_QUALITY_CHECK_PROMPT.format(interview=interview)

    if print_prompt is True:
        print(quality_prompt)

    # Temperature 0.0 is requested for the quality assessment
    agent = get_agent(model_id=model, temperature=0.0)
    return agent.structured_output(
        output_model=InverviewQualityInfo,
        prompt=quality_prompt,
    )

In [11]:
# Read the skills-extension interviews for the configured version
filename = f"training_skills_extension_interviews_{interview_data_version}.json"
interviews_save_path = DATA_INTERVIEWS_DIR.joinpath(filename)
interviews = read_json(interviews_save_path)

In [12]:
# Flush the verdicts to disk every `cache_period` newly checked interviews
cache_period = 5

filename = f"quality_trainings_skills_interviews_{interview_data_version}.json"
save_path = DATA_INTERVIEWS_DIR.joinpath(filename)
if not save_path.exists():
    save_json(save_path, {})
quality_interviews = read_json(save_path)

new_items_processed = 0
for persona_id in tqdm(initial_personas_data):
    persona_data_dict = json.loads(initial_personas_data[persona_id])

    # Only personas whose initial recommendation type is 'trainings_only' are checked
    if persona_data_dict['recommendation_type'] != 'trainings_only':
        continue

    # A stored OK verdict is final; any other stored verdict is re-evaluated
    if persona_id in quality_interviews:
        quality = json.loads(quality_interviews[persona_id])
        if quality['quality_level'] == 'OK':
            continue

    if persona_id not in interviews:
        # No transcript: record a NOK verdict and persist it immediately
        quality = InverviewQualityInfo(
            quality_level='NOK',
            rationale='interview missing',
        )
        quality_str = json.dumps(quality.model_dump(), ensure_ascii=False)
        quality_interviews[persona_id] = quality_str
        save_json(save_path, quality_interviews)
        continue

    new_items_processed += 1

    interview = interviews[persona_id]['interview']
    interview_str = "\n".join(interview)

    quality = check_interview_quality(interview_str, print_prompt=False)
    quality_str = json.dumps(quality.model_dump(), ensure_ascii=False)
    quality_interviews[persona_id] = quality_str

    if new_items_processed % cache_period == 0:
        save_json(save_path, quality_interviews)

# Final flush for verdicts since the last periodic save
save_json(save_path, quality_interviews)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:23<00:00,  4.18it/s]


In [13]:
# List every persona whose stored interview verdict is not OK, with the reason
filename = f"quality_trainings_skills_interviews_{interview_data_version}.json"
save_path = DATA_INTERVIEWS_DIR.joinpath(filename)
quality_interviews = read_json(save_path)

for persona_id, quality_json in quality_interviews.items():
    quality = json.loads(quality_json)
    if quality['quality_level'] == 'OK':
        continue
    print(persona_id)
    print(quality['rationale'])
    print('---')

# Redo interview of a Persona

In [12]:
persona_id = "persona_098"

# conduct_persona_interview requires the skills list as its second positional
# argument, so build it from the persona's classified skill domains
# (case-insensitive lookup in the trainings map).
skill_domains = json.loads(personas_data[persona_id])['skills_domains']
skills_str = "".join(
    f"- {domain} : {skill}\n"
    for domain in skill_domains
    for skill in trainings_map_lower[domain.lower()]
)

# Interview
print("üé§ Conduct Interview...")
conversation = conduct_persona_interview(
    persona_id,
    skills_str,
    max_turns=MAX_TURNS_IN_INTERVIEW,
    print_conversation=True,
)
print(conversation)

üé§ Conduct Interview...


NameError: name 'conduct_persona_interview' is not defined

In [11]:
# Store the redone interview under its persona id and write the file back.
# NOTE(review): this targets the "job_extension_interviews" file, whereas the
# batch interview cell of this notebook writes
# "training_skills_extension_interviews" — confirm which file is intended.
filename = f"job_extension_interviews_{interview_data_version}.json"
interviews_save_path = DATA_INTERVIEWS_DIR.joinpath(filename)
interviews = read_json(interviews_save_path)
interviews.update({persona_id: conversation.model_dump()})
save_json(interviews_save_path, interviews)

NameError: name 'conversation' is not defined

# Translate interviews into English

In [None]:
def translate_interview(
    interview,
    model: str = "mistral-small-latest",
    print_prompt=False
) -> InterviewInfo:
    """Ask a model to translate an interview, returned as an ``InterviewInfo``.

    NOTE(review): ``TRANSLATE_INTERVIEW_PROMPT`` is not among the names imported
    in the notebook's import cell — confirm where it is defined before running.

    Args:
        interview: Interview content inserted into the translation prompt.
        model: Model id handed to ``get_agent``.
        print_prompt: When exactly ``True``, the rendered prompt is printed.

    Returns:
        The ``InterviewInfo`` produced by the agent's structured output.
    """
    translation_prompt = TRANSLATE_INTERVIEW_PROMPT.format(interview=interview)

    if print_prompt is True:
        print(translation_prompt)

    agent = get_agent(model_id=model, temperature=0.0)
    return agent.structured_output(
        output_model=InterviewInfo,
        prompt=translation_prompt,
    )

In [None]:
# Read the interviews to translate for the configured version
filename = f"interviews_{interview_data_version}.json"
interviews_save_path = DATA_INTERVIEWS_DIR.joinpath(filename)
interviews = read_json(interviews_save_path)

In [None]:
# Flush the translations to disk every `cache_period` newly translated interviews
cache_period = 5

filename = f"en_interviews_{interview_data_version}.json"
save_path = DATA_INTERVIEWS_DIR.joinpath(filename)
if not save_path.exists():
    save_json(save_path, {})
en_interviews = read_json(save_path)

new_items_processed = 0
for interview_id in tqdm(interviews):
    # Translations stored by a previous run are kept as they are
    already_translated = interview_id in en_interviews
    if already_translated:
        continue

    new_items_processed += 1

    interview = interviews[interview_id]
    translated_interview = translate_interview(interview, print_prompt=False)

    # Only the translated transcript lines are stored
    en_interviews[interview_id] = translated_interview.interview

    if new_items_processed % cache_period == 0:
        save_json(save_path, en_interviews)

# Final flush for translations since the last periodic save
save_json(save_path, en_interviews)