In [3]:
!pip install strands-agents[mistral] python-dotenv dictdiffer

Collecting python-dotenv
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Collecting dictdiffer
  Downloading dictdiffer-0.9.0-py2.py3-none-any.whl.metadata (4.8 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.13.0-py3-none-any.whl.metadata (13 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.19.0-py3-none-any.whl.metadata (85 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.59b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_s

In [1]:
import json
import os
import sys
import dotenv

import pprint
import copy

import yaml
from pathlib import Path, PosixPath
from typing import Dict, List, Tuple, TypeVar
from tqdm import tqdm

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

from collections import Counter

sys.path.append('..')

from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
	chat_with_persona,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

from src.my_utils import (
    display_markdown_file,
    call_mistral,
    get_agent,
    batch_extract,
    compute_stat_for_multi_items
)

from src.models.activity_domain_info import ActivityDomainLabelingReply
from src.models.job_info import (
    JobInfo
)

from src.models.skill_info import(
    SkillLabelingReply,
    SkillDomainLabelingReply
)

from src.prompts.job_extraction_prompt import(
    DOMAIN_LABELING_PROMPT
)

# Load environment variables (including the API key) from the ../env file
dotenv.load_dotenv("../env")

# Report whether the Mistral API key is available to this process
if os.getenv("MISTRAL_API_KEY"):
    print("✅ API key found, we're ready to roll")
else:
    print("❌ No MISTRAL_API_KEY found!")
    print("Create an env file with your API key")

✅ API key found, we're ready to roll


In [2]:
# Data directories, expressed relative to the notebook's working directory.
DATA_JOBS_DIR = Path('..', 'data_jobs')
DATA_ACTIVITIES_DOMAINS_DIR = Path('..', 'data_activities_domains')

In [3]:
# Read the project configuration and pull out the dataset versions used below.
with open("../src/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Version suffix of the job data files
job_data_version = config["job_data_version"]
print(f"job_data_version version: {job_data_version}")

# Version suffix of the activity-domain files
activity_domains_version = config["activity_domains_version"]
print(f"activity_domains_version version: {activity_domains_version}")

job_data_version version: v4
activity_domains_version version: v4


In [4]:
# Read the clustered jobs file for the configured data version
filename = f"clusterized_jobs_{job_data_version}.json"
jobs_save_path = DATA_JOBS_DIR.joinpath(filename)
jobs_data = read_json(jobs_save_path)

# Validate every raw JSON payload into a JobInfo object, keyed by job id
jobs_info = {}
for job_id, raw_job in jobs_data.items():
    jobs_info[job_id] = JobInfo.model_validate_json(raw_job)

print(f"✅ Loaded {len(jobs_info)} jobs")
print("\n" + "="*50)

✅ Loaded 200 jobs



# Function definitions

In [5]:
def compute_activity_domain_label(
    jobs_list,
    model: str = "mistral-medium-latest",
    print_prompt: bool = False
) -> ActivityDomainLabelingReply:
    """Ask the LLM for an activity-domain label covering a group of jobs.

    Each job is looked up in the notebook-level ``jobs_info`` mapping and
    rendered as a ``- <title>: <job_description>`` bullet; the bullets are
    inserted into ``DOMAIN_LABELING_PROMPT`` and sent to an agent obtained
    from ``get_agent``.

    Args:
        jobs_list: Iterable of job ids; every id must be a key of ``jobs_info``.
        model: Model id forwarded to ``get_agent``.
        print_prompt: When truthy, print the formatted prompt before sending it.

    Returns:
        The agent's structured output for ``ActivityDomainLabelingReply``.

    Raises:
        KeyError: If a job id is missing from ``jobs_info``.
        ValueError: If ``jobs_list`` yields no job ids.
    """
    job_lines = []
    for job_id in jobs_list:
        job = jobs_info[job_id]
        job_lines.append("- " + job.title + ': ' + job.job_description + "\n")

    # Fail fast rather than sending a prompt with no jobs to the model:
    # the reply would not be grounded in any job data.
    if not job_lines:
        raise ValueError("jobs_list is empty: nothing to label")

    jobs_str = "".join(job_lines)

    prompt = DOMAIN_LABELING_PROMPT.format(
        jobs_description=jobs_str
    )

    if print_prompt:
        print(prompt)

    extraction_agent = get_agent(model_id=model)
    result = extraction_agent.structured_output(output_model=ActivityDomainLabelingReply, prompt=prompt)

    return result

# Build structured hierarchy

In [6]:
# Group job ids by each domain listed on the job: {domain: [job_id, ...]}
jobs_map = {}
for job_id in jobs_data:
    for cluster_key in jobs_info[job_id].domains:
        jobs_map.setdefault(cluster_key, []).append(job_id)

print(jobs_map)

{'CLUSTER_7': ['j0', 'j1', 'j10', 'j12', 'j13', 'j14', 'j15', 'j16', 'j19', 'j2', 'j3', 'j4', 'j5', 'j6', 'j7', 'j8', 'j9'], 'CLUSTER_9': ['j100', 'j101', 'j102', 'j103', 'j104', 'j105', 'j106', 'j107', 'j108', 'j109'], 'CLUSTER_13': ['j11', 'j17', 'j18', 'j90', 'j91', 'j92', 'j93', 'j94', 'j95', 'j96', 'j97', 'j98', 'j99'], 'CLUSTER_4': ['j110', 'j111', 'j112', 'j113', 'j114', 'j115', 'j116', 'j117', 'j118', 'j119', 'j120', 'j121', 'j122', 'j123', 'j124', 'j125', 'j126', 'j127', 'j128', 'j129', 'j180', 'j184', 'j187'], 'CLUSTER_12': ['j130', 'j131', 'j132', 'j133', 'j134', 'j135', 'j136', 'j137', 'j138', 'j150', 'j151', 'j153', 'j154', 'j155', 'j156', 'j157', 'j158', 'j159', 'j182', 'j183', 'j185', 'j186'], 'CLUSTER_5': ['j139', 'j80', 'j81', 'j82', 'j83', 'j84', 'j85', 'j86', 'j87', 'j88', 'j89'], 'CLUSTER_8': ['j140', 'j141', 'j142', 'j143', 'j144', 'j145', 'j146', 'j147', 'j148', 'j149'], 'CLUSTER_11': ['j152', 'j181', 'j188', 'j189', 'j40', 'j41', 'j42', 'j43', 'j44', 'j45', 'j46'

# Label Activity Domain fields

In [10]:
# Ask the LLM for a label and description per cluster, then key the map by that label:
# {label: {'description': str, 'job_ids': [job_id, ...]}}
extended_jobs_map = {}

for domain in tqdm(jobs_map):
    result = compute_activity_domain_label(
        jobs_map[domain],
        model="mistral-medium-latest",
        print_prompt=False)

    print(f"LABEL : {result.activity_domain_label}")
    print(f"DESCRIPTION : {result.description}")
    print(f"RATIONALE : {result.rationale}")
    print("")

    label = result.activity_domain_label
    if label in extended_jobs_map:
        # The model may return the same label for two different clusters.
        # Keep both entries by suffixing the cluster key instead of silently
        # overwriting the earlier cluster (which would drop its jobs).
        label = f"{label} ({domain})"
        print(f"⚠️ Duplicate label, stored as: {label}")

    extended_jobs_map[label] = {
        'description': result.description,
        'job_ids': jobs_map[domain],
    }

# Every input cluster must still be represented after relabelling.
assert len(extended_jobs_map) == len(jobs_map), "a cluster was lost while relabelling"

print(extended_jobs_map)

# NOTE(review): this rebinds the version loaded from config.yaml, so every
# later cell reads and writes the 'v5' files while the jobs above were loaded
# from the configured version. Consider a dedicated output-version setting.
job_data_version = 'v5'

filename = f"map_clusters_jobs_{job_data_version}.json"
save_path = DATA_JOBS_DIR / filename
save_json(save_path, extended_jobs_map)

  7%|▋         | 1/15 [00:03<00:52,  3.74s/it]

LABEL : Financial Operations And Regulatory Compliance
DESCRIPTION : Covers accounting, banking, and insurance sectors focusing on compliance, risk management, and financial operations.
RATIONALE : The proposed domain name captures the core activities present across the listed jobs, which primarily revolve around financial operations, regulatory compliance, and risk management. The primary industry sectors identified in the job data are banking, accounting, and insurance.  

Outliers: None of the jobs appear to be significant outliers in function, as they all relate to financial operations, compliance, or risk management within the specified sectors.



 13%|█▎        | 2/15 [00:06<00:43,  3.32s/it]

LABEL : Legal Support And Compliance Analysis
DESCRIPTION : Focuses on legal research, compliance, and document drafting in sectors like law and corporate governance.
RATIONALE : The proposed domain name captures the core activities of legal research, compliance, and support present across all listed jobs. The description highlights the primary sectors identified, such as law and corporate governance, while also mentioning the focus areas of legal research and document drafting. No significant outliers were detected in the job functions, and the description remains concise and informative.



 20%|██        | 3/15 [00:12<00:51,  4.25s/it]

LABEL : Insurance Operations And Compliance Management
DESCRIPTION : Focuses on regulatory adherence, risk assessment, and operational management within the Insurance and Banking sectors.
RATIONALE : The proposed domain name captures the core functions of regulatory compliance, risk management, and operational oversight that are central to the listed job roles. The description explicitly mentions the primary industry sectors identified in the job data, which are Insurance and Banking. This domain name is specific enough to reflect the unique character of the group while being generic enough to encompass all the listed roles. No significant outliers were detected in the job functions as all roles primarily revolve around compliance, risk assessment, and operational management within the specified sectors.



 27%|██▋       | 4/15 [00:17<00:52,  4.75s/it]

LABEL : Maritime And Fleet Logistics Operations
DESCRIPTION : Covers port management, vessel coordination, logistics, and fleet maintenance in maritime and transportation sectors.
RATIONALE : The proposed domain name 'Maritime and Fleet Logistics Operations' captures the core activities present in the job listings. The jobs primarily revolve around port operations, vessel coordination, logistics, and fleet maintenance. The primary industry sectors identified are maritime and transportation. Outliers such as 'Marine Assistant – Onboard Logistics' and 'Specialist – Marine Environmental Protection' still fit within the broader context of maritime operations, while 'Fleet Diagnostics Analyst' and 'Fleet Coordination Trainee' introduce the transportation sector, which is also included in the description for completeness.



 33%|███▎      | 5/15 [00:22<00:46,  4.65s/it]

LABEL : Industrial Production And Live Performance Technical Operations
DESCRIPTION : Covers technical roles in industrial manufacturing and live performance sectors, including equipment maintenance, production optimization, and stagecraft management.
RATIONALE : The proposed activity domain name captures the essence of both the industrial production and live performance sectors, which are the primary industry sectors identified in the job data. The jobs listed primarily involve technical operations, equipment maintenance, and process optimization in industrial manufacturing, as well as stagecraft, lighting, audio, and rigging in live performances. The name is generic enough to encompass all the jobs while being specific to highlight the unique technical and operational aspects of these sectors. The description explicitly mentions the primary industry sectors and provides additional details about the types of roles included.



 40%|████      | 6/15 [00:26<00:40,  4.45s/it]

LABEL : Industrial Health, Safety, and Environmental Compliance
DESCRIPTION : Ensures workplace safety and regulatory compliance in industrial and production sectors.
RATIONALE : The proposed domain name captures the core focus of the listed jobs, which revolve around health, safety, and environmental compliance within industrial settings. The primary industry sectors identified in the job data are production and industrial operations. The description explicitly mentions these sectors and highlights the key responsibilities of ensuring workplace safety and regulatory compliance. No significant outliers were detected among the jobs listed, as all roles are closely related to health, safety, and environmental compliance in industrial contexts.



 47%|████▋     | 7/15 [00:29<00:33,  4.23s/it]

LABEL : Procurement And Supply Chain Optimization
DESCRIPTION : Focuses on managing vendor relationships, optimizing procurement processes, and mitigating supply chain risks primarily in the retail, manufacturing, and logistics sectors.
RATIONALE : The proposed domain name captures the core activities of procurement and supply chain management, which are central to all the listed jobs. The description highlights the primary industry sectors identified in the job data, namely retail, manufacturing, and logistics. The justification for this domain name is that it encompasses the key responsibilities mentioned in the job descriptions, such as vendor management, cost strategies, risk mitigation, and procurement processes. No significant outliers were detected in the job functions, but the inclusion of ERP systems and data analysis adds a layer of technological and analytical expertise to the domain.



 53%|█████▎    | 8/15 [00:33<00:27,  3.94s/it]

LABEL : Electrical And Electronics Systems Engineering And Maintenance
DESCRIPTION : Focuses on electrical and electronics systems across automotive, live events, and power sectors.
RATIONALE : The proposed domain name captures the core activities of electrical and electronics systems engineering and maintenance, which are common across all listed jobs. The primary industry sectors identified include automotive, live events, and power systems. The job titles primarily involve roles related to electrical and electronic systems, including design, analysis, maintenance, and project coordination. No significant outliers were detected as all jobs are aligned with the engineering and maintenance aspects of electrical and electronic systems.



 60%|██████    | 9/15 [00:36<00:22,  3.76s/it]

LABEL : Community Engagement And Cultural Development
DESCRIPTION : Focuses on social, educational, and cultural initiatives within community development sectors.
RATIONALE : The proposed domain name captures the essence of the jobs listed, which primarily revolve around community engagement, social initiatives, and cultural development. The description explicitly mentions the primary sectors identified: social, educational, and cultural initiatives. No jobs appear to be significant outliers in function, as all roles are centered around community and cultural development. The domain name is generic enough to encompass all listed jobs while being specific to the unique character of community-focused roles.



 67%|██████▋   | 10/15 [00:41<00:19,  3.98s/it]

LABEL : Tourism And Hospitality Guest Services
DESCRIPTION : Focuses on guest experience and operational roles within the Tourism and Hospitality sectors.
RATIONALE : The proposed domain label captures the essence of the roles provided, which predominantly revolve around enhancing guest experiences and managing operations within the tourism and hospitality sectors. The jobs listed primarily involve customer service, event coordination, guest relations, and operational management, all of which are central to the tourism and hospitality industries. No significant outliers were detected, as all roles align well with the core functions of these sectors.



 73%|███████▎  | 11/15 [00:46<00:17,  4.50s/it]

LABEL : Visual Arts And Studio Production
DESCRIPTION : This domain covers roles in visual arts creation, management, and coordination within the studio production sector.
RATIONALE : The proposed domain name captures the core activities of visual arts creation and studio production management, which are central to all listed jobs. The description explicitly mentions the primary sector identified, which is studio production. No significant outliers were detected as all jobs revolve around visual arts and studio operations.



 80%|████████  | 12/15 [00:52<00:14,  4.98s/it]

LABEL : Cultural Information And Records Management
DESCRIPTION : Managing and preserving cultural and organizational records in archives, libraries, and documentation systems.
RATIONALE : The proposed domain name captures the core activities of managing, preserving, and organizing cultural and organizational records, which are common across all listed jobs. The primary industry sectors identified in the job data include cultural archives, libraries, and general organizational records management. All jobs fit well within this domain as they involve handling cultural materials, records, and documentation systems. No significant outliers were detected in terms of function, as all roles are centered around the management and preservation of information and cultural heritage.



 87%|████████▋ | 13/15 [00:55<00:08,  4.33s/it]

LABEL : Design Research And Innovation Management
DESCRIPTION : Focuses on user research and innovation management primarily in product development and design sectors.
RATIONALE : The proposed domain name captures the core activities of design research and innovation management evident in the job listings. The description highlights the primary sectors of product development and design. No significant outliers were detected as all jobs revolve around design research, user experience studies, and innovation management.



 93%|█████████▎| 14/15 [00:59<00:04,  4.21s/it]

LABEL : Fiber And Paper Manufacturing Operations
DESCRIPTION : This domain focuses on roles within the fiber and paper manufacturing and processing sectors.
RATIONALE : The proposed domain name captures the core activities and industry sectors identified in the job data, which primarily revolve around fiber and paper manufacturing and processing. The jobs listed are mainly centered on production optimization, quality control, research and development, and sustainable practices within these sectors. No significant outliers were detected in the job functions provided.



100%|██████████| 15/15 [01:05<00:00,  4.35s/it]

LABEL : Food Production And Quality Management
DESCRIPTION : This domain covers roles in the Food Industry focusing on production, quality, safety, and innovation.
RATIONALE : The proposed domain name captures the core activities present in the job listings, which primarily revolve around food production, quality assurance, and safety management. The description explicitly mentions the primary industry sector identified, which is the Food Industry. The jobs listed are mostly aligned with these activities, ensuring the domain name is both generic enough to fit all jobs and specific enough to capture their unique character. No significant outliers were detected in terms of function, as all roles are closely related to food production and quality management.

{'Financial Operations And Regulatory Compliance': {'description': 'Covers accounting, banking, and insurance sectors focusing on compliance, risk management, and financial operations.', 'job_ids': ['j0', 'j1', 'j10', 'j12', 'j13', '




# Add job titles to the map for easier analysis

In [34]:
# Reload the labelled map saved to disk.
# Note: this rebinds `jobs_map` to whatever the saved file contains, which the
# code below indexes as {label: {..., 'job_ids': ...}}.
filename = f"map_clusters_jobs_{job_data_version}.json"
save_path = DATA_JOBS_DIR.joinpath(filename)
jobs_map = read_json(save_path)

# print(jobs_map)

In [38]:
# For every domain, replace its 'job_ids' collection with an {id: title} mapping,
# leaving the other fields of the entry untouched.
extended_jobs_map = {}
for domain_label, domain_entry in jobs_map.items():
    entry_with_titles = copy.deepcopy(domain_entry)
    entry_with_titles['job_ids'] = {
        job_id: jobs_info[job_id].title
        for job_id in domain_entry['job_ids']
    }
    extended_jobs_map[domain_label] = entry_with_titles

# Write the enriched map back to the same file it was read from
filename = f"map_clusters_jobs_{job_data_version}.json"
save_path = DATA_JOBS_DIR.joinpath(filename)
save_json(save_path, extended_jobs_map)

# Apply cluster labels to the job list

In [42]:
# Reload the labelled map from disk
filename = f"map_clusters_jobs_{job_data_version}.json"
save_path = DATA_JOBS_DIR.joinpath(filename)
jobs_map = read_json(save_path)

# Stamp each job's serialized record with the label of the domain that lists it.
# `jobs_data` values are JSON strings, so each one is decoded, updated and re-encoded.
for domain_label, domain_entry in jobs_map.items():
    for job_id in domain_entry['job_ids']:
        job_record = json.loads(jobs_data[job_id])
        job_record.update({'domain': domain_label, 'domains': [domain_label]})
        jobs_data[job_id] = json.dumps(job_record, ensure_ascii=False)

# Persist the relabelled jobs under a separate file name
filename = f"extended_jobs_{job_data_version}.json"
jobs_save_path = DATA_JOBS_DIR.joinpath(filename)
save_json(jobs_save_path, jobs_data)