In [1]:
!pip install strands-agents[mistral] python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.12.0-py3-none-any.whl.metadata (13 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.17.0-py3-none-any.whl.metadata (80 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.58b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_sdk-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mistralai>=1.8.2 (from strands-agent

In [1]:
import json
import os
import sys
import dotenv

import pprint
import ast

import yaml
from pathlib import Path, PosixPath
from typing import Dict, List, Tuple, TypeVar
from tqdm import tqdm

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

from collections import Counter

# Put the parent directory on the import path so the `src.*` imports below resolve
# when the notebook is run from its own folder.
sys.path.append('..')

from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
	chat_with_persona,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

from src.my_utils import (
    display_markdown_file,
    call_mistral,
    get_agent,
    batch_extract,
    compute_stat_for_multi_items
)

from src.models.job_info import (
    JobInfo,
    JobInforequired_skills_domains
)
from src.prompts.job_extraction_prompt import JOB_SUMMARY_PROMPT

# Pull environment variables (the Mistral API key among them) from the env file
dotenv.load_dotenv("../env")

# Report whether the credentials needed by the agent calls are available
if os.getenv("MISTRAL_API_KEY"):
    print("✅ API key found, we're ready to roll")
else:
    print("❌ No MISTRAL_API_KEY found!")
    print("Create an env file with your API key")

✅ API key found, we're ready to roll


In [2]:
# Directory holding the job data files read and written by this notebook
DATA_JOBS_DIR = Path('..') / 'data_jobs'

In [3]:
# Read the project configuration. The encoding is given explicitly so the YAML
# is decoded the same way regardless of the platform's default locale.
with open("../src/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Version tag used to build the job data file names in the cells below
job_data_version = config["job_data_version"]
print(f"job_data_version version: {job_data_version}")

job_data_version version: v4


In [4]:
# Read the skill-domain-classified jobs file for the configured data version
filename = f"skill_domain_classified_jobs_{job_data_version}.json"
jobs_save_path = DATA_JOBS_DIR / filename
jobs_data = read_json(jobs_save_path)

# Each stored value is a JSON string; validate it into a JobInfo model keyed by job id
jobs_info = {
    jid: JobInfo.model_validate_json(raw_json)
    for jid, raw_json in jobs_data.items()
}

print(f"✅ Loaded {len(jobs_info)} jobs")
print("\n" + "="*50)

✅ Loaded 200 jobs



In [5]:
# Map each job id (the file stem) to the text content of its description file
job_paths = get_job_paths()
job_descriptions = {
    path.stem: load_file_content(path)
    for path in tqdm(job_paths)
}

100%|██████████| 200/200 [00:00<00:00, 1893.24it/s]


In [7]:
def compute_job_summary(
    job_description: str,
    model: str = "mistral-small-latest",
    print_prompt: bool = False
) -> str:
    """Generate a summary of a job description with an LLM agent.

    The prompt sent to the agent is ``JOB_SUMMARY_PROMPT`` immediately followed
    by ``job_description``.

    Args:
        job_description: Text of the job description to summarize.
        model: Model identifier forwarded to ``get_agent`` as ``model_id``.
        print_prompt: When truthy, print the full prompt before calling the agent.

    Returns:
        The agent's result converted to ``str``.
    """
    prompt = JOB_SUMMARY_PROMPT + job_description

    # Plain truthiness check so any truthy flag (not only the literal True) enables the echo
    if print_prompt:
        print(prompt)

    # temperature=0.0 — presumably chosen to keep summaries as repeatable as the model allows
    extraction_agent = get_agent(model_id=model, temperature=0.0)
    result = extraction_agent(prompt=prompt)

    return str(result)

In [8]:
# Try the summary on one known job before launching the full batch
job_id = 'j65'
result = compute_job_summary(job_descriptions[job_id], print_prompt=False)

print("RESULT:")
print(result)

RESULT:
The Junior Food Manufacturing Coordinator supports food production operations by coordinating manufacturing processes and supply chain activities. The role involves scaling recipes to meet production demands, sourcing raw materials, coordinating production schedules, and maintaining accurate records of production activities. This remote position is ideal for recent graduates in Brazil with a technical degree and fluent Portuguese.



In [9]:
# Maximum number of passes over the jobs that still lack a summary
MAX_LOOPS = 1
# Write the results to disk every `cache_period` processed jobs
cache_period = 5

#####
# Load final jobs file (created empty on the first run so reruns can resume)
#####
filename = f"final_jobs_{job_data_version}.json"
save_path = DATA_JOBS_DIR / filename

if not save_path.exists():
    save_json(save_path, {})
extended_jobs_data = read_json(save_path)

print("START CLASSIFICATION LOOP")
try:
    for i in range(MAX_LOOPS):
        print("ITERATE CLASSIFICATION LOOP")

        #####
        # Collect the jobs that have no stored entry yet, or whose stored
        # 'job_description' summary is missing or empty
        #####
        filt_jobs_ids = []
        for job_id in jobs_info:
            if job_id not in extended_jobs_data:
                filt_jobs_ids.append(job_id)
                continue

            ext_job_data = json.loads(extended_jobs_data[job_id])

            # .get + truthiness: a missing key or a null value means "not done yet"
            # instead of raising KeyError / TypeError
            if not ext_job_data.get('job_description'):
                filt_jobs_ids.append(job_id)

        print(f"Number of jobs not classisfied : {len(filt_jobs_ids)}")

        if not filt_jobs_ids:
            break

        #####
        # Summarize each remaining job and store the result next to its job data
        #####
        print("Start jobs classification")
        print_prompt = False  # set to True to echo the prompts while debugging
        for iteration, job_id in enumerate(tqdm(filt_jobs_ids), start=1):
            result = compute_job_summary(
                job_descriptions[job_id],
                print_prompt=print_prompt
            )

            # Show the first summary of the pass as a quick visual check
            if iteration == 1:
                print(result)

            # Entries are stored as JSON strings, mirroring the input file's format
            job_data = json.loads(jobs_data[job_id])
            job_data['job_description'] = result
            extended_jobs_data[job_id] = json.dumps(job_data, ensure_ascii=False)

            # Periodic checkpoint
            if iteration % cache_period == 0:
                save_json(save_path, extended_jobs_data)
finally:
    # Always persist what has been computed so far, including when a call fails
    # part-way through a pass; the exception itself still propagates.
    save_json(save_path, extended_jobs_data)

START CLASSIFICATION LOOP
ITERATE CLASSIFICATION LOOP
Number of jobs not classisfied : 200
Start jobs classification


  0%|          | 1/200 [00:00<01:56,  1.71it/s]

Accounting Intern – Bookkeeping & Admin

Maintain accurate financial records, process transactions, and manage tax documentation and compliance. Support administrative functions and collaborate with senior accounting staff and other departments to ensure smooth financial operations.



100%|██████████| 200/200 [03:07<00:00,  1.07it/s]
