In [2]:
!pip install strands-agents[mistral] python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.11.0-py3-none-any.whl.metadata (13 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.16.0-py3-none-any.whl.metadata (80 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.58b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_sdk-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mistralai>=1.8.2 (from strands-agent

In [1]:
import json
import os
import sys
import dotenv
import html

import pprint

import yaml
from pathlib import Path, PosixPath
from typing import Dict, List, Tuple, TypeVar
from tqdm import tqdm

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

from collections import Counter

sys.path.append('..')

from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
	chat_with_persona,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

from src.my_utils import (
    display_markdown_file,
    call_mistral,
    get_agent,
    batch_extract,
    compute_stat_for_multi_items
)

from src.models.activity_domain_info import ActivityDomainInfo
from src.models.training_info import TrainingInfo
from src.models.job_info import JobInfo, JobInfoRequiredSkills

from src.prompts.job_extraction_prompt import (
    FIND_TRAINING_MATCHES_PROMPT
)

from src.models.generic_models import (
    ListOfIds
)

# Pull environment variables (notably the Mistral API key) from the file at ../env
dotenv.load_dotenv("../env")

# Report whether the key is available before any API call is attempted
if os.getenv("MISTRAL_API_KEY"):
    print("‚úÖ API key found, we're ready to roll")
else:
    print("‚ùå No MISTRAL_API_KEY found!")
    print("Create an env file with your API key")

‚úÖ API key found, we're ready to roll


In [2]:
# Data folders used by the pipeline; each one sits one level above the notebook directory.
DATA_PERSONAS_INFO_DIR = Path('..', 'data_personas_info')
DATA_JOBS_DIR = Path('..', 'data_jobs')
DATA_TRAININGS_DIR = Path('..', 'data_trainings')
DATA_SKILLS_DOMAINS_DIR = Path('..', 'data_skills_domains')
DATA_MATCH_JOBS_TRAININGS_DIR = Path('..', 'data_match_jobs_trainings')

In [3]:
# Load the shared pipeline configuration. The encoding is pinned so the YAML is
# decoded the same way regardless of the platform's default locale encoding.
with open("../src/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Version of each dataset, as declared in the config (a missing key raises KeyError).
personas_info_data_version = config["personas_info_data_version"]
print(f"personas_info_data_version version: {personas_info_data_version}")

job_data_version = config["job_data_version"]
print(f"job_data_version version: {job_data_version}")

training_data_version = config["training_data_version"]
print(f"training_data_version version: {training_data_version}")

skill_domains_version = config["skill_domains_version"]
print(f"skill_domains_version version: {skill_domains_version}")

# Composite version tags for artifacts that combine two datasets.
match_skills_domains_trainings_data_version = f"{skill_domains_version}_{training_data_version}"
match_jobs_trainings_data_version = f"{job_data_version}_{training_data_version}"

personas_info_data_version version: v10
job_data_version version: v4
training_data_version version: v7
skill_domains_version version: v3


In [4]:
# Read the stored, skill-domain-classified jobs for the configured job data version
filename = f"skill_domain_classified_jobs_{job_data_version}.json"
jobs_save_path = DATA_JOBS_DIR.joinpath(filename)
jobs_data = read_json(jobs_save_path)

# Rebuild one JobInfo per job id from its stored JSON payload
jobs_info = dict(
    (job_id, JobInfo.model_validate_json(raw_job))
    for job_id, raw_job in jobs_data.items()
)

print(f"‚úÖ Loaded {len(jobs_info)} jobs")
print("\n" + "="*50)

‚úÖ Loaded 200 jobs



In [5]:
# Map each job file's stem (used later as the job id) to the text content of that file
job_paths = get_job_paths()
job_descriptions = {
    job_path.stem: load_file_content(job_path)
    for job_path in tqdm(job_paths)
}

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:00<00:00, 28400.34it/s]


In [6]:
# Load the trainings cluster map for the configured training data version.
# The matching loop reads it as trainings_map[domain] -> iterable of skill labels.
filename = f"final_map_clusters_trainings_{training_data_version}.json"
save_path = DATA_TRAININGS_DIR.joinpath(filename)
trainings_map = read_json(save_path)

print(f"‚úÖ Loaded {len(trainings_map)} skills domains")
print("\n" + "="*50)

‚úÖ Loaded 12 skills domains



In [7]:
def extend_match_jobs_training(
    job_description,
    trainings_list,
    model: str = "mistral-small-latest",
    print_prompt=False
) -> JobInfoRequiredSkills:
    """Ask the model which of the listed skills a job description requires.

    Args:
        job_description: Text of the job description. It is wrapped between two
            dashed delimiter lines before being inserted into the prompt.
        trainings_list: Pre-formatted text listing the candidate skills; it fills
            the ``formatted_skills`` placeholder of the prompt template.
        model: Model id handed to ``get_agent``.
        print_prompt: When exactly ``True``, the rendered prompt is printed.

    Returns:
        The structured output the agent produced for ``JobInfoRequiredSkills``.
    """
    delimiter = "-----------"
    rendered_prompt = FIND_TRAINING_MATCHES_PROMPT.format(
        job_description=delimiter + "\n" + job_description + "\n" + delimiter,
        formatted_skills=trainings_list,
    )

    if print_prompt is True:
        print(rendered_prompt)

    agent = get_agent(model_id=model, temperature=0.0)
    matched_skills = agent.structured_output(
        output_model=JobInfoRequiredSkills,
        prompt=rendered_prompt,
    )

    # Record the call for cost tracking when the agent exposes its last response
    if hasattr(agent, 'last_response'):
        track_api_call(agent.last_response, model)

    return matched_skills

In [8]:
# Disabled single-job check: the `if False` guard means this body never runs.
# NOTE(review): skill_domain_training_map and trainings_info are not defined in the
# visible part of this notebook — confirm they exist before re-enabling.
if False:
    job_id = 'j1'

    # Collect the training ids attached to every skills domain the job requires
    training_ids = [
        training_id
        for required_skills_domain in jobs_info[job_id].required_skills_domains
        for training_id in skill_domain_training_map[required_skills_domain]
    ]

    # One line per training, as returned by its get_skill_acquired method
    skills_str = "".join(
        trainings_info[training_id].get_skill_acquired(training_id) + "\n"
        for training_id in training_ids
    )

    selected_trainings = extend_match_jobs_training(
        jobs_info[job_id].get_required_skills(),
        skills_str,
        print_prompt=True,
    )
    print(selected_trainings)

In [None]:
MAX_LOOPS = 1      # number of passes over the jobs that still have no stored result
cache_period = 5   # write intermediate results to disk every N processed jobs

# Process all jobs
print("üìÇ Processing Jobs...")

# Results are stored per (job version, training version) pair. Jobs already present
# in this file are skipped, so an interrupted run resumes where it stopped.
filename = f"match_jobs_trainings_{match_jobs_trainings_data_version}.json"
save_path = DATA_MATCH_JOBS_TRAININGS_DIR / filename
if not save_path.exists():
    save_json(save_path, {})
job_training_map = read_json(save_path)

new_items_processed = 0

# try/finally guarantees that results obtained since the last periodic save are
# written to disk even when an API call fails or the cell is interrupted.
try:
    print("START CLASSIFICATION LOOP")
    for i in range(MAX_LOOPS):
        print("ITERATE CLASSIFICATION LOOP")

        # Only jobs without a stored result are processed.
        # NOTE: a stricter filter (re-queue jobs whose stored list is empty or
        # references unknown training ids) was sketched here but left disabled.
        filt_jobs_ids = [
            job_id for job_id in jobs_info if job_id not in job_training_map
        ]

        print(f"Number of jobs not classisfied : {len(filt_jobs_ids)}")

        if len(filt_jobs_ids) == 0:
            break

        for job_id in tqdm(filt_jobs_ids):
            new_items_processed = new_items_processed + 1

            # Show the rendered prompt once, for the first job handled in this run
            print_prompt = new_items_processed == 1

            job_info = jobs_info[job_id]

            # Candidate skills: one "- domain / skill" line for every skill of every
            # skills domain required by the job; unknown domains are reported and skipped.
            skill_lines = []
            for domain in job_info.required_skills_domains:
                if domain not in trainings_map:
                    print(f"ERROR : skill domain ({domain}) not referenced in skill domain training map for job {job_id}")
                    continue
                skill_lines.extend(
                    "- " + domain + " / " + skill + "\n"
                    for skill in trainings_map[domain]
                )
            skills_str = "".join(skill_lines)

            result = extend_match_jobs_training(
                job_descriptions[job_id],
                skills_str,
                model="mistral-medium-latest",
                print_prompt=print_prompt
            )

            job_training_map[job_id] = result.required_skills

            # Periodic checkpoint so long runs do not depend on the final save alone
            if new_items_processed % cache_period == 0:
                save_json(save_path, job_training_map)
finally:
    # Save results (also reached on error or interruption, keeping partial progress)
    save_json(save_path, job_training_map)

print(f"\n‚úÖ Generated matching for {len(job_training_map)} jobs")
print(f"üìÅ Results saved to: {save_path}")

üìÇ Processing Jobs...
START CLASSIFICATION LOOP
ITERATE CLASSIFICATION LOOP
Number of jobs not classisfied : 200


  0%|          | 0/200 [00:00<?, ?it/s]


You are an expert in skill taxonomy, training analysis, and job description interpretation.

Your task is to identify which skill ‚Äîamong a predefined list‚Äîcover the skills required for the job described below.

# Job Description:
-----------
# Job Description: Accounting Intern ‚Äì Bookkeeping & Admin

**Position Summary:**
As an **Accounting Intern ‚Äì Bookkeeping & Admin** on our **Accounting and Management** team, you'll handle day-to-day financial record keeping and administrative tasks that keep our operations running smoothly.

**Your Responsibilities:**
Your main tasks will include maintaining accurate financial records and transaction entries, managing tax-related documentation and compliance requirements, and supporting general administrative functions across the accounting department. You'll work closely with senior accounting staff and other departments that need financial data and reporting.

**What We're Looking For:**
You should have solid experience with **tax regul

  4%|‚ñç         | 9/200 [00:56<26:21,  8.28s/it]