In [1]:
!pip install strands-agents[mistral] python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.13.0-py3-none-any.whl.metadata (13 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.19.0-py3-none-any.whl.metadata (85 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.59b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_sdk-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mistralai>=1.8.2 (from strands-agent

In [36]:
import json
import os
import sys
import dotenv
import boto3
import requests
from collections import Counter
import yaml

from datetime import datetime

import pprint

import yaml
from pathlib import Path
from typing import Dict, List, Optional, Tuple, TypeVar
from tqdm import tqdm

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

# AWS authentication
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest

sys.path.append('..')

from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
    chat_with_persona,
    validate_submission_format,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

from src.my_utils import (
    display_markdown_file,
    call_mistral,
    get_agent,
    batch_extract
)

from src.models.persona_info import PersonaInfo
from src.models.job_info import JobInfo
from src.models.training_info import TrainingInfo
from src.models.generic_models import (
    BooleanModel,
    BooleanModelWithRationale,
    ListOfIds,
    ListOfStrs
)

from src.prompts.find_training_matches_prompt import (
    FIND_TRAINING_MATCHES_PROMPT,
    FIND_TRAINING_MATCHES_PROMPT_BY_NAME,
    CHECK_PERSONA_TRAINING_MATCH,
    FIND_TRAINING_MATCHES_FOR_JOB_PROMPT
)

from src.prompts.find_job_matches_prompt import (
    FIND_JOB_MATCHES_PROMPT
)

# Load environment variables (including the API key) from the ../env file
dotenv.load_dotenv("../env")

# Report whether MISTRAL_API_KEY is now available in the environment
if os.getenv("MISTRAL_API_KEY"):
    print("✅ API key found, we're ready to roll")
else:
    print("❌ No MISTRAL_API_KEY found!")
    print("Create an env file with your API key")

✅ API key found, we're ready to roll


In [37]:
# Filename prefix of the JSON file in which this notebook stores its per-persona results
output_prefix = 'job_filtered_personas_info'

In [38]:
# Every data folder sits in the parent of the notebook's working directory
_DATA_ROOT = Path('..')

DATA_PERSONAS_INFO_DIR = _DATA_ROOT / 'data_personas_info'
DATA_JOBS_DIR = _DATA_ROOT / 'data_jobs'
DATA_TRAININGS_DIR = _DATA_ROOT / 'data_trainings'
DATA_SKILLS_DOMAINS_DIR = _DATA_ROOT / 'data_skills_domains'
DATA_MATCH_JOBS_TRAININGS_DIR = _DATA_ROOT / 'data_match_jobs_trainings'

In [39]:
# Read the project configuration; it holds the version tag of each dataset
with open("../src/config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Version tag used in the personas-info filenames
personas_info_data_version = config["personas_info_data_version"]
print(f"personas_info_data_version version: {personas_info_data_version}")

# Version tag used in the jobs filenames
job_data_version = config["job_data_version"]
print(f"job_data_version version: {job_data_version}")

# Version tag used in the trainings filenames
training_data_version = config["training_data_version"]
print(f"training_data_version version: {training_data_version}")

skill_domains_version = config["skill_domains_version"]
print(f"skill_domains_version version: {skill_domains_version}")

# Composite version tags built by joining two dataset versions with "_"
match_skills_domains_trainings_data_version = f"{skill_domains_version}_{training_data_version}"
match_jobs_trainings_data_version = f"{job_data_version}_{training_data_version}"

personas_info_data_version version: v14
job_data_version version: v4
training_data_version version: v7
skill_domains_version version: v3


In [40]:
# Submissions folder — presumably where submission files are written; it is not
# referenced by the cells shown in this notebook (TODO confirm it is still needed)
SUBMISSION_DIR = Path('../submissions')

In [41]:
# Load Jobs data
filename = f"final_jobs_{job_data_version}.json"
jobs_save_path = DATA_JOBS_DIR / filename

# The file maps each job id to a JSON string describing that job
jobs_data = read_json(jobs_save_path)

# Parse every raw JSON string into a JobInfo object, keyed by job id
jobs_info = dict(
    (job_id, JobInfo.model_validate_json(raw_job))
    for job_id, raw_job in jobs_data.items()
)

print(f"✅ Loaded {len(jobs_info)} jobs")
print("\n" + "=" * 50)

✅ Loaded 200 jobs



In [42]:
# Load Trainings data
filename = f"extended_trainings_{training_data_version}.json"
trainings_save_path = DATA_TRAININGS_DIR / filename

# The file maps each training id to a JSON string describing that training
trainings_data = read_json(trainings_save_path)

# Parse every raw JSON string into a TrainingInfo object, keyed by training id
trainings_info = dict(
    (training_id, TrainingInfo.model_validate_json(raw_training))
    for training_id, raw_training in trainings_data.items()
)

print(f"✅ Loaded {len(trainings_info)} trainings")
print("\n" + "=" * 50)

✅ Loaded 497 trainings



In [43]:
# Load Personas data
filename = f"final_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename

# The file maps each persona id to a JSON string; the raw strings are kept
# because later cells copy and re-serialize them
initial_personas_data = read_json(personas_save_path)

# Parse every raw JSON string into a PersonaInfo object, keyed by persona id
personas = dict(
    (pid, PersonaInfo.model_validate_json(raw_persona))
    for pid, raw_persona in initial_personas_data.items()
)

print(f"✅ Loaded {len(personas)} personas")
print("\n" + "=" * 50)

✅ Loaded 99 personas



In [44]:
# Load jobs domains map data.
# jobs_map is indexed by domain; each entry exposes a 'job_ids' list
# (see get_jobs_by_target_domains, which reads jobs_map[domain]['job_ids']).
filename = f"map_clusters_jobs_{job_data_version}.json"
save_path = DATA_JOBS_DIR / filename
jobs_map = read_json(save_path)

# print(jobs_map)

# Compute Proposed Jobs

In [45]:
def get_jobs_by_target_domains(persona_target_domains, jobs_map):
    """Collect the ids of all jobs listed under the given target domains.

    Args:
        persona_target_domains: Iterable of domain keys to look up in ``jobs_map``.
        jobs_map: Mapping from domain key to a dict holding a ``'job_ids'`` list.

    Returns:
        A flat list of job ids, in domain order and then in the order stored
        for each domain. Ids shared by several domains appear once per domain.

    Raises:
        KeyError: If a domain is missing from ``jobs_map``.
    """
    return [
        job_id
        for domain in persona_target_domains
        for job_id in jobs_map[domain]['job_ids']
    ]

In [47]:
def hard_filter_jobs(persona_info, job_ids, jobs_info, verbose=False):
    """Keep only the jobs whose hard requirements the persona satisfies.

    A job is dropped as soon as one of these checks fails (checked in this order):

    * location   - the job is ``'onsite'``, the persona is not open to relocating
                   for work, and the job location differs from the persona's;
    * education  - the job's education level value is higher than the persona's;
    * experience - the job requires more years of experience than the persona has;
    * language   - the job lists required languages and the persona has none of
                   them. A job with no required languages passes this check.

    An education level value of ``-1`` (on either side) is reported with an
    ``ERROR`` message; the comparison is then still carried out with that value.

    Args:
        persona_info: Persona object exposing ``open_to_relocate_for_work``,
            ``location``, ``education_level``, ``years_of_experience``,
            ``languages`` and ``get_education_level_value()``.
        job_ids: Ids of the candidate jobs to check.
        jobs_info: Mapping from job id to a job object exposing ``work_type``,
            ``location``, ``education_level_required``,
            ``years_of_experience_required``, ``required_languages`` and
            ``get_education_level_value()``.
        verbose: When truthy, print the decision trail for every job.

    Returns:
        The ids of the jobs that passed every check, in input order.
    """
    filtered_job_ids = []

    for job_id in job_ids:
        if verbose:
            print(job_id)
        job_info = jobs_info[job_id]

        # A filter comparing persona.work_type_preference with job.work_type used
        # to be sketched here; it is currently not applied.

        # Location only matters for onsite jobs when the persona will not relocate.
        if job_info.work_type == 'onsite':
            if persona_info.open_to_relocate_for_work is False and job_info.location != persona_info.location:
                if verbose:
                    print(f"excluded because of location : {job_info.location} - {persona_info.location}")
                continue

        if verbose:
            print("Location OK")

        job_education_level = job_info.get_education_level_value()
        if job_education_level == -1:
            print(f"ERROR : job_education_level not recognized : {job_info.education_level_required}")

        persona_education_level = persona_info.get_education_level_value()
        if persona_education_level == -1:
            print(f"ERROR : persona_education_level not recognized : {persona_info.education_level}")

        if job_education_level > persona_education_level:
            if verbose:
                print(f"excluded because of education level : {job_education_level} - {persona_education_level}")
            continue

        if job_info.years_of_experience_required > persona_info.years_of_experience:
            if verbose:
                print(f"excluded because of experience : {job_info.years_of_experience_required} - {persona_info.years_of_experience}")
            continue

        # One shared language is enough. An empty requirement list means the job
        # imposes no language constraint, so it must not be rejected here.
        required_languages = job_info.required_languages
        if required_languages and not any(
            job_language in persona_info.languages for job_language in required_languages
        ):
            if verbose:
                print("excluded because of language")
            continue

        filtered_job_ids.append(job_id)

    return filtered_job_ids

In [49]:
def review_job_matches(
    persona_info: PersonaInfo,
    jobs_text: str,  # Pre-built context to avoid rebuilding
    model: str = "mistral-medium-latest",
    print_prompt=False
) -> ListOfIds:
    """Ask the model which of the listed jobs fit the persona's goals.

    FIND_JOB_MATCHES_PROMPT is filled with the persona's ``goals`` and the
    pre-rendered job descriptions, then sent to an agent created with
    temperature 0.0, requesting a structured ``ListOfIds`` answer.

    Args:
        persona_info: Persona whose ``goals`` are used as the candidate profile.
        jobs_text: Text describing the candidate jobs, inserted into the prompt.
        model: Model id handed to ``get_agent``.
        print_prompt: When ``True``, print the final prompt before calling the model.

    Returns:
        The structured output returned by the agent.
    """
    filled_prompt = FIND_JOB_MATCHES_PROMPT.format(
        candidate_profile=persona_info.goals,
        jobs=jobs_text,
    )
    if print_prompt is True:
        print(filled_prompt)

    # Cost tracking via track_api_call is not performed for this call.
    matching_agent = get_agent(model_id=model, temperature=0.0)
    return matching_agent.structured_output(output_model=ListOfIds, prompt=filled_prompt)

In [50]:
def matching_agent_process_jobs(persona_id, verbose=False):
    """Run the full job-matching pipeline for one persona.

    Steps: (1) gather the jobs of the persona's target domains, (2) apply the
    hard filters, (3) if any job is left, let the model review the remaining
    jobs against the persona's goals.

    Reads the notebook-level ``personas``, ``jobs_map`` and ``jobs_info``.

    Args:
        persona_id: Key of the persona in ``personas``.
        verbose: When ``True``, print intermediate results and the prompt.

    Returns:
        A tuple ``(hard_filtered_jobs_ids, selected_jobs_ids, rationale)``.
        When no job survives the hard filters, ``selected_jobs_ids`` is ``[]``
        and ``rationale`` is ``''`` and the model is not called.
    """
    persona = personas[persona_id]

    # Step 1: jobs matching the persona's targeted activity domains
    filtered_jobs_ids = get_jobs_by_target_domains(persona.target_domains, jobs_map)
    if verbose is True:
        print(f"filtered_jobs_ids : {filtered_jobs_ids}")

    # Step 2: hard filters
    hard_filtered_jobs_ids = hard_filter_jobs(persona, filtered_jobs_ids, jobs_info, verbose=verbose)
    if verbose is True:
        print(f"hard_filtered_jobs_ids : {hard_filtered_jobs_ids}")

    if hard_filtered_jobs_ids:
        # Step 3: review the remaining jobs against the persona's goals
        jobs_text = "".join(
            jobs_info[job_id].get_info_for_matching(job_id) + "\n\n"
            for job_id in hard_filtered_jobs_ids
        )
        result = review_job_matches(persona, jobs_text, print_prompt=verbose)
        selected_jobs_ids = result.list_of_ids
        rationale = result.rationale
    else:
        # Nothing left to review
        selected_jobs_ids = []
        rationale = ''

    if verbose is True:
        print(f"selected_jobs_ids : {selected_jobs_ids}")

    return hard_filtered_jobs_ids, selected_jobs_ids, rationale

In [52]:
# Write the results file to disk after every `cache_period` personas
cache_period = 5

# Prepare personas info
filename = f"{output_prefix}_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
if not personas_save_path.exists():
    save_json(personas_save_path, {})
personas_data = read_json(personas_save_path)

# NOTE(review): entries read back from the results file are overwritten below with
# the initial persona data, so a re-run recomputes every persona instead of
# resuming from saved results — confirm whether resuming was intended.

new_personas_processed = 0
for person_id in tqdm(personas):
    new_personas_processed += 1
    persona = personas[person_id]

    # Start from the untouched input record; it is replaced below if matching runs
    personas_data[person_id] = initial_personas_data[person_id]
    persona_data = json.loads(initial_personas_data[person_id])

    if persona.recommendation_type == "jobs_trainings":
        # Only run the matching when the input record carries no job proposal
        if not persona_data.get('proposed_job_ids'):
            hard_filtered_jobs_ids, selected_jobs_ids, rationale = matching_agent_process_jobs(person_id, verbose=False)
            persona_data['hard_filtered_jobs_ids'] = hard_filtered_jobs_ids
            persona_data['proposed_job_ids'] = selected_jobs_ids
            # Keep the model's rationale, as the single-persona redo cell does
            persona_data['rationale'] = rationale
            personas_data[person_id] = json.dumps(persona_data)

    # Periodic checkpoint, driven by cache_period rather than a hard-coded 5
    if new_personas_processed % cache_period == 0:
        save_json(personas_save_path, personas_data)

# Final save so the last partial batch is written too
save_json(personas_save_path, personas_data)

100%|██████████| 99/99 [04:18<00:00,  2.61s/it]


---

# For Debug Only

---

In [None]:
# Disabled debug snippet: list the jobs found through one persona's target domains.
# Change the guard to True to run it manually.
if False:
    persona_id = 'persona_010'
    filtered_jobs_ids = get_jobs_by_target_domains(personas[persona_id].target_domains, jobs_map)
    print(filtered_jobs_ids)

In [None]:
# Disabled debug snippet: trace the hard filters (verbose) for one persona.
# Change the guard to True to run it manually.
if False:
    persona_id = 'persona_010'
    filtered_jobs_ids = get_jobs_by_target_domains(personas[persona_id].target_domains, jobs_map)
    print(filtered_jobs_ids)
    hard_filtered_jobs_ids = hard_filter_jobs(personas[persona_id], filtered_jobs_ids, jobs_info, verbose=True)
    print("\n" + "="*50)
    print(hard_filtered_jobs_ids)

In [None]:
# Disabled debug snippet: run the whole matching pipeline (verbose) for one persona
# and print its three results. Change the guard to True to run it manually.
if False:
    persona_id = 'persona_011'

    hard_filtered_jobs_ids, selected_jobs_ids, rationale = matching_agent_process_jobs(persona_id, verbose=True)

    print(f"hard_filtered_jobs_ids : {hard_filtered_jobs_ids}")
    print(f"selected_jobs_ids : {selected_jobs_ids}")
    print(f"rationale : {rationale}")

# Redo a Single Persona

In [18]:
# Re-run the job matching for one persona and overwrite its entry in the results file.
# NOTE(review): the other persona ids visible in this notebook are zero-padded to
# three digits (e.g. 'persona_010'); confirm that 'persona_07' is a valid key.
person_id = 'persona_07'

# Fail fast, before touching the results file, with a message that says what is wrong
if person_id not in personas:
    raise KeyError(f"Unknown persona id {person_id!r}: it is not among the loaded personas")

filename = f"{output_prefix}_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
personas_data = read_json(personas_save_path)

persona = personas[person_id]
# Reset the entry to the untouched input record before recomputing
personas_data[person_id] = initial_personas_data[person_id]
persona_data = json.loads(initial_personas_data[person_id])

if persona.recommendation_type == "jobs_trainings":
    print(f"persona {person_id} without job proposed")
    hard_filtered_jobs_ids, selected_jobs_ids, rationale = matching_agent_process_jobs(person_id, verbose=True)
    persona_data['hard_filtered_jobs_ids'] = hard_filtered_jobs_ids
    persona_data['proposed_job_ids'] = selected_jobs_ids
    persona_data['rationale'] = rationale
    personas_data[person_id] = json.dumps(persona_data)
    print(persona_data)

save_json(personas_save_path, personas_data)

persona persona_018 without job proposed
filtered_jobs_ids : ['j50', 'j51', 'j52', 'j53', 'j54', 'j55', 'j56', 'j57', 'j58', 'j59']
j50
excluded because of location : Brasília - São Paulo
j51
Location OK
j52
Location OK
j53
excluded because of location : Belo Horizonte - São Paulo
j54
excluded because of location : Salvador - São Paulo
j55
excluded because of location : Porto Alegre - São Paulo
j56
excluded because of location : Curitiba - São Paulo
j57
Location OK
j58
excluded because of location : Curitiba - São Paulo
j59
Location OK
hard_filtered_jobs_ids : ['j51', 'j52', 'j57', 'j59']

You are a job advisor expert in matching jobs to candidate profiles.

Your task is to decide which jobs directly or partially support the candidate's goals.

Candidate goal:
training if it helps with product development in paper or pulping processes

You are given a list of jobs:
Job with ID: j51
- Title: Assistant – Fiber Design
- Description: Supports fiber and paper operations by ensuring product 

# Statistics

In [50]:
# List the "jobs_trainings" personas that ended up with no proposed job
filename = f"{output_prefix}_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
personas_data = read_json(personas_save_path)

for person_id in tqdm(personas):
    persona = personas[person_id]
    persona_data = json.loads(personas_data[person_id])

    if persona.recommendation_type == "jobs_trainings":
        # A missing, null or empty 'proposed_job_ids' all count as "no job proposed"
        if not persona_data.get('proposed_job_ids'):
            print(person_id)


100%|██████████| 100/100 [00:00<00:00, 84682.09it/s]

persona_041
persona_048
persona_069
persona_077
persona_079



