In [3]:
!pip install strands-agents[mistral] python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.13.0-py3-none-any.whl.metadata (13 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.19.0-py3-none-any.whl.metadata (85 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.59b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_sdk-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mistralai>=1.8.2 (from strands-agent

In [53]:
import json
import os
import sys
import dotenv
import boto3
import requests

import pprint

import yaml
from pathlib import Path
from typing import Dict, List, Optional, Tuple, TypeVar
from tqdm import tqdm

from collections import Counter

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

# AWS authentication
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest

sys.path.append('..')

from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
	chat_with_persona,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

from src.my_utils import (
    display_markdown_file,
    call_mistral,
    get_agent,
    batch_extract,
    compute_stat_for_multi_items
)

from src.models.persona_info import PersonaInfo, PersonaSkills
from src.models.activity_domain_info import ActivityDomainInfo, ListOfActivityDomains
from src.models.skill_domain_info import SkillDomainInfo, ListSkillsDomains
from src.prompts.persona_extraction_prompt import (
    PERSONA_SKILLS_EXTRACTION_PROMPT
)

# Load API key from .env file
dotenv.load_dotenv("../env")

# Check if we're good to go
if not os.getenv("MISTRAL_API_KEY"):
    print("‚ùå No MISTRAL_API_KEY found!")
    print("Create an env file with your API key")
else:
    print("‚úÖ API key found, we're ready to roll")

‚úÖ API key found, we're ready to roll


In [54]:
DATA_JOBS_DIR = Path('../data_jobs')
DATA_TRAININGS_DIR = Path('../data_trainings')
DATA_INTERVIEWS_DIR = Path('../data_interviews')
DATA_ACTIVITIES_DOMAINS_DIR = Path('../data_activities_domains')
DATA_SKILLS_DOMAINS_DIR = Path('../data_skills_domains')
DATA_PERSONAS_INFO_DIR = Path('../data_personas_info')

In [55]:
with open("../src/config.yaml", "r") as f:
    config = yaml.safe_load(f)

job_data_version = config["job_data_version"]
print(f"job_data_version version: {job_data_version}")

training_data_version = config["training_data_version"]
print(f"training_data_version version: {training_data_version}")

interview_data_version = config["interview_data_version"]
print(f"interview_data_version version: {interview_data_version}")

activity_domains_version = config["activity_domains_version"]
print(f"activity_domains_version version: {activity_domains_version}")

personas_info_data_version = config["personas_info_data_version"]
print(f"personas_info_data_version version: {personas_info_data_version}")

skill_domains_version = config["skill_domains_version"]
print(f"skill_domains_version version: {skill_domains_version}")

job_data_version version: v4
training_data_version version: v7
interview_data_version version: v8
activity_domains_version version: v4
personas_info_data_version version: v14
skill_domains_version version: v3


In [56]:
# Load jobs domains map data
filename = f"map_clusters_jobs_{job_data_version}.json"
save_path = DATA_JOBS_DIR / filename
jobs_map = read_json(save_path)

In [57]:
# Load skills domains map data
filename = f"final_map_clusters_trainings_{training_data_version}.json"
save_path = DATA_TRAININGS_DIR / filename
trainings_map = read_json(save_path)
trainings_map_lower = {key.lower(): value for key, value in trainings_map.items()}

print(f"‚úÖ Loaded {len(trainings_map)} skills domains")
print("\n" + "="*50)

‚úÖ Loaded 12 skills domains



In [58]:
# Load interviews
filename = f"training_skills_extension_interviews_{interview_data_version}.json"
interviews_save_path = DATA_INTERVIEWS_DIR / filename
initial_interviews = read_json(interviews_save_path)

In [59]:
# Load Personas data
filename = f"training_domain_classified_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename

initial_personas_data = read_json(personas_save_path)

# Convert to PersonaInfo objects
personas = {
    pid: PersonaInfo.model_validate_json(data)
    for pid, data in initial_personas_data.items()
}

print(f"‚úÖ Loaded {len(personas)} personas")
print("\n" + "="*50)

‚úÖ Loaded 100 personas



# Extract informations

In [60]:
def extract_training_persona_info(
    skills_str: str,
    conversation: List[str],
    model: str = "mistral-small-latest",
    print_prompt=False
) -> PersonaSkills:
    text = '\n'.join(conversation)
    #print(text)

    prompt = PERSONA_SKILLS_EXTRACTION_PROMPT.format(
        skills_list=skills_str,
        conversation=text
    )

    if print_prompt is True:
        print("\n" + "="*50)
        print(prompt)
        print("\n" + "="*50)

    extraction_agent = get_agent(model_id=model, temperature=0.0)
    result = extraction_agent.structured_output(output_model=PersonaSkills, prompt=prompt)

    return result

In [None]:
MAX_LOOPS = 1
cache_period = 5

# Prepare personas info
filename = f"last_final_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
if not personas_save_path.exists():
    save_json(personas_save_path, {})
personas_data = read_json(personas_save_path)

# personas_data = {}

new_items_processed = 0

print("START CLASSIFICATION LOOP")
for i in range(MAX_LOOPS):
    print("ITERATE CLASSIFICATION LOOP")

    new_personas_processed = 0

    for persona_id in tqdm(initial_personas_data):
        #print(persona_id)
        
        if persona_id in personas_data:
            initial_persona_data_dict = json.loads(initial_personas_data[persona_id])
            if initial_persona_data_dict['recommendation_type'] != 'trainings_only':
                personas_data[persona_id] = initial_personas_data[persona_id]
            continue

        if personas[persona_id].recommendation_type != 'trainings_only':
            personas_data[persona_id] = initial_personas_data[persona_id]
            continue

        if persona_id not in initial_interviews:
            personas_data[persona_id] = initial_personas_data[persona_id]
            continue

        new_personas_processed += 1

        conversation = initial_interviews[persona_id]['interview']

        personas_data_dict = json.loads(initial_personas_data[persona_id])
        skill_domains = personas_data_dict['skills_domains']

        if len(skill_domains) < 1:
            print(f"Persona {persona_id} has no skill domain")
            persona_data_dict = json.loads(initial_personas_data[persona_id])
            print(persona_data_dict)
            persona_data_dict['skills'] = {}
        else:
            skills_str = ""
            for domain in skill_domains:
                if domain.lower() not in trainings_map_lower:
                #if domain not in trainings_map:
                    print(f"{persona_id} - {domain} not in trainings_map")
                    continue
                for skill in trainings_map_lower[domain.lower()]:
                    skills_str += f"- {domain} : {skill}" + "\n"
    
            print_prompt = False
            if new_personas_processed == 1:
                print_prompt = True

            #if persona_id == 'persona_067':
            #    print_prompt = True
    
            result = extract_training_persona_info(
                skills_str,
                conversation,
                model="mistral-medium-latest",
                print_prompt=print_prompt
            )
    
            # if len(result.target_domains) == 0:
            #     print(f"Activity domains empty : for {persona_id}")
            #     continue
    
            # domain_issue = False
            # for domain in result.target_domains:
            #     if domain not in jobs_map:
            #         domain_issue = True
            #         print(f"{persona_id} : {result.target_domains} not in domains list")
    
            # if domain_issue is True:
            #     continue
    
            persona_data_dict = json.loads(initial_personas_data[persona_id])

            if result.interested_by_training is False:
                persona_data_dict['recommendation_type'] = 'awareness'
                print(persona_id)
                print('recommendation_type = awareness')
            else:            
                persona_data_dict['skills'] = result.skills
        
        personas_data[persona_id] = json.dumps(persona_data_dict, ensure_ascii=False)

        # Save every 5 personas
        if new_personas_processed % 5 == 0:
            save_json(personas_save_path, personas_data)

        # Show cost update every 20 personas
        # if new_personas_processed > 0 and new_personas_processed % 20 == 0:
        #     print(f"\nüí∞ Cost update after {new_personas_processed} new personas:")
        #     print_cost_summary()
        #     print()

        # if new_personas_processed > 0:break

save_json(personas_save_path, personas_data)

# Convert to PersonaInfo objects
# personas = {
#     pid: PersonaInfo.model_validate_json(data)
#     for pid, data in persona_infos.items()
# }

# print(f"\n‚úÖ Interviewed {len(personas)} personas total ({new_personas_processed} new)")


---

# For Debug Only

---

In [61]:
if False:
    persona_id = "persona_059"

    conversation = initial_interviews[persona_id]['interview']

    personas_data_dict = json.loads(initial_personas_data[persona_id])
    skill_domains = personas_data_dict['skills_domains']

    skills_str = ""
    for domain in skill_domains:
        for skill in trainings_map[domain]:
            skills_str += f"- {domain} : {skill}" + "\n"

    result = extract_training_persona_info(
        skills_str,
        conversation,
        model="mistral-medium-latest",
        print_prompt=True
    )

    print(result)

    #persona_data_dict = json.loads(initial_personas_data[persona_id])
    #persona_data_dict['open_to_relocate_for_work'] = result.open_to_relocate_for_work
    #persona_data_dict['work_type_preference'] = result.work_type_preference
    #persona_data_dict['education_level'] = result.education_level
    #persona_data_dict['years_of_experience'] = result.years_of_experience
    #persona_data_dict['languages'] = result.languages
    #persona_data_dict['goals'] = result.goals
    #persona_data_dict['target_domains'] = result.target_domains

    #print(persona_data_dict)

    # print(persona_info)

    # if(persona_info.recommendation_type != "awareness"):


# Redo Persona

In [42]:
if True:
    persona_id = 'persona_067'

    filename = f"last_final_personas_info_{personas_info_data_version}.json"
    personas_save_path = DATA_PERSONAS_INFO_DIR / filename
    personas_data = read_json(personas_save_path)

    if personas[persona_id].recommendation_type != 'trainings_only':
        print("Not classified as training only")
        personas_data[persona_id] = initial_personas_data[persona_id]
        save_json(personas_save_path, personas_data)
        print(personas_data[persona_id])
    elif persona_id not in initial_interviews:
        print("ISSUE")
    else:
        conversation = initial_interviews[persona_id]['interview']

        personas_data_dict = json.loads(initial_personas_data[persona_id])
        skill_domains = personas_data_dict['skills_domains']

        if len(skill_domains) < 1:
            print("Persona has no skill domain")
            persona_data_dict = json.loads(initial_personas_data[persona_id])
            print(persona_data_dict)
            persona_data_dict['skills'] = {}
        else:
            skills_str = ""
            for domain in skill_domains:
                if domain.lower() not in trainings_map_lower:
                #for skill in trainings_map[domain]:
                    for skill in trainings_map_lower[domain.lower()]:
                        skills_str += f"- {domain} : {skill}" + "\n"

            result = extract_training_persona_info(
                skills_str,
                conversation,
                model="mistral-medium-latest",
                print_prompt=True
            )

            print(result)

            persona_data_dict = json.loads(initial_personas_data[persona_id])
            if result.interested_by_training is False:
                persona_data_dict['recommendation_type'] = 'awareness'
            else:            
                persona_data_dict['skills'] = result.skills

        personas_data[persona_id] = json.dumps(persona_data_dict, ensure_ascii=False)
        print(persona_data_dict)
        
        save_json(personas_save_path, personas_data)




You are an advanced AI assistant specializing in skill extraction from text. Your task is to analyze a conversation and a corresponding list of skills to identify which skills a persona is interested in and their current proficiency level.

You will be given the following inputs:
- a list of required skills for a job application : [LIST OF AVAILABLE SKILLS]
- an interview conversation with a persona. : [CONVERSATION]

[LIST OF AVAILABLE SKILLS]:

[END OF LIST OF AVAILABLE SKILLS]

[CONVERSATION]
Assistant: From following list of skills, for which you are interested in and what is your current proficiency level (None, Basic, Intermediate, Advanced):
- Financial Risk Management And Compliance : Financial Software Proficiency
- Financial Risk Management And Compliance : Cost Analysis And Financial Evaluation
- Financial Risk Management And Compliance : Financial Data Analysis
- Financial Risk Management And Compliance : Financial Compliance Reporting
- Financial Risk Management And Comp

# Persona skills quality

In [13]:
filename = f"last_final_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
persona_infos = read_json(personas_save_path)

In [14]:
for persona_id in tqdm(persona_infos):
    if personas[persona_id].recommendation_type != 'trainings_only':
        continue

    persona_data_dict = json.loads(persona_infos[persona_id])

    if len(persona_data_dict['skills']) == 0:
        continue

    for domain_skill in persona_data_dict['skills']:
        level = persona_data_dict['skills'][domain_skill]
        parts = domain_skill.split(" : ")
        if len(parts) < 2:
            print(f"ERROR {persona_id} : format error '{parts}'")
            continue

        domain = parts[0]
        skill = parts[1]

        if domain not in trainings_map:
            print(f"ERROR {persona_id} : domain not knwon '{domain}'")
            continue

        if skill not in trainings_map[domain]:
            print(f"ERROR {persona_id} : skill not knwon '{skill}'")
            continue



100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 84990.96it/s]


In [19]:
# Correct Persona Info Extraction
print("üíº Testing Persona information Extraction Agent...")
print("Reading a sample job file...\n")

filename = f"interviews_{interview_data_version}.json"
interviews_save_path = DATA_INTERVIEWS_DIR / filename
interviews = read_json(interviews_save_path)

filename = f"personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
personas_data = read_json(personas_save_path)

# Get first job file
persona_id = "persona_015"

if persona_id in interviews:
    conversation = interviews[persona_id]

    # Extract
    persona_info = extract_persona_info(activities_domains, skills_domains, conversation)
    personas_data[persona_id] = persona_info.model_dump_json()
    save_json(personas_save_path, personas_data)

    print(persona_info)
    #persona_infos[persona_id] = persona_info.model_dump_json()
    
    # persona_info = extract_persona_info(activities_domains, skills_domains, conversation)
   

üíº Testing Persona information Extraction Agent...
Reading a sample job file...

name='Camila' age=22 location='Fortaleza' recommendation_type='trainings_only' open_to_relocate_for_work=False work_type_preference='onsite' target_domains=['UNKNOWN'] education_level='T√©cnico' years_of_experience=0 skills_domains=['UNKNOWN'] skills={} languages={'Portuguese', 'English'} goals='Learn all basics of live event production: lights, sound, stage setup.'
