In [1]:
!pip install strands-agents[mistral] python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.12.0-py3-none-any.whl.metadata (13 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.17.0-py3-none-any.whl.metadata (80 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.58b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_sdk-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mistralai>=1.8.2 (from strands-agent

In [1]:
import json
import os
import sys
import dotenv

import pprint

import yaml
from pathlib import Path, PosixPath
from typing import Dict, List, Tuple, TypeVar
from tqdm import tqdm

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

from collections import Counter

sys.path.append('..')

from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
	chat_with_persona,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

from src.my_utils import (
    display_markdown_file,
    call_mistral,
    get_agent,
    batch_extract,
    compute_stat_for_multi_items
)

from src.models.skill_domain_info import SkillDomainInfo
from src.models.training_info import (
    TrainingInfo
)

from src.models.skill_info import(
    SkillLabelingReply,
    SkillDomainLabelingReply
)

from src.prompts.training_extraction_prompt import(
    SKILL_LABELING_PROMPT,
    SKILL_DOMAIN_LABELING_PROMPT
)

# Read environment variables from the env file one directory up
dotenv.load_dotenv("../env")

# Report whether MISTRAL_API_KEY is set to a non-empty value
if os.getenv("MISTRAL_API_KEY"):
    print("✅ API key found, we're ready to roll")
else:
    print("❌ No MISTRAL_API_KEY found!")
    print("Create an env file with your API key")

✅ API key found, we're ready to roll


In [2]:
# Relative data directories (resolved against the current working directory)
DATA_TRAININGS_DIR = Path('..') / 'data_trainings'
DATA_SKILLS_DOMAINS_DIR = Path('..') / 'data_skills_domains'

In [3]:
# Read the YAML configuration that holds the dataset version tags
with open("../src/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Version tag used to build the training file names in the cells below
training_data_version = config["training_data_version"]
print(f"training_data_version version: {training_data_version}")

# Version tag of the skill-domains data
skill_domains_version = config["skill_domains_version"]
print(f"skill_domains_version: {skill_domains_version}")

training_data_version version: v7
skill_domains_version: v3


In [4]:
# Locate and read the clustered trainings file for the configured data version
filename = f"clusterized_trainings_{training_data_version}.json"
trainings_save_path = DATA_TRAININGS_DIR / filename
trainings_data = read_json(trainings_save_path)

# Parse every stored JSON payload into a TrainingInfo model, keyed by training id
trainings_info = dict(
    (tid, TrainingInfo.model_validate_json(payload))
    for tid, payload in trainings_data.items()
)

print(f"✅ Loaded {len(trainings_info)} trainings")
print("\n" + 50 * "=")

✅ Loaded 497 trainings



# Function definitions

In [5]:
def compute_skill_acquired_label(
    trainings_list,
    model: str = "mistral-small-latest",
    print_prompt: bool = False
) -> SkillLabelingReply:
    """Ask the labeling agent for a skill label covering a group of trainings.

    Args:
        trainings_list: Iterable of training ids; each id is looked up in the
            notebook-level ``trainings_info`` mapping.
        model: Model id passed to ``get_agent``.
        print_prompt: When exactly ``True``, print the formatted prompt before
            calling the agent.

    Returns:
        The agent's structured output, requested with
        ``output_model=SkillLabelingReply``.
    """
    # One summary per training, each terminated by a newline
    trainings_str = "".join(
        trainings_info[t_id].summarize_for_skill_acquired_labeling(t_id) + "\n"
        for t_id in trainings_list
    )

    prompt = SKILL_LABELING_PROMPT.format(trainings_description=trainings_str)

    if print_prompt is True:
        print(prompt)

    labeling_agent = get_agent(model_id=model)
    return labeling_agent.structured_output(
        output_model=SkillLabelingReply,
        prompt=prompt,
    )

In [16]:
def compute_skill_domain_label(
    skills_list,
    model: str = "mistral-small-latest",
    print_prompt: bool = False
) -> SkillDomainLabelingReply:
    """Ask the labeling agent for a domain label covering a group of skills.

    Args:
        skills_list: Iterable of skill names (strings). Each one becomes a
            ``- <skill>`` bullet line in the prompt.
        model: Model id passed to ``get_agent``.
        print_prompt: When exactly ``True``, print the formatted prompt before
            calling the agent.

    Returns:
        The agent's structured output, requested with
        ``output_model=SkillDomainLabelingReply``.
    """
    # Render the skills as a bullet list, one skill per line
    skills_str = "".join("- " + skill + "\n" for skill in skills_list)

    # The bullet list is passed under the `trainings_description` keyword
    prompt = SKILL_DOMAIN_LABELING_PROMPT.format(trainings_description=skills_str)

    if print_prompt is True:
        print(prompt)

    labeling_agent = get_agent(model_id=model)
    return labeling_agent.structured_output(
        output_model=SkillDomainLabelingReply,
        prompt=prompt,
    )

# Build structured hierarchy

In [7]:
# Group training ids as {skill_domain: {skill_acquired: [training_id, ...]}}
trainings_map = {}
for t_id in trainings_data:
    training_info = trainings_info[t_id]

    # setdefault creates the domain bucket the first time a domain is seen
    skill_domain = training_info.skill_domain
    domain_bucket = trainings_map.setdefault(skill_domain, {})

    # ... and likewise the per-skill list inside that domain
    skill_acquired = training_info.skill_acquired
    domain_bucket.setdefault(skill_acquired, []).append(t_id)

# Label Skill Acquired fields

In [None]:
# Rename every skill cluster with the label returned by the labeling agent,
# keeping the skill-domain level of the map unchanged.
extended_trainings_map = {}

for skill_domain in tqdm(trainings_map):
    labeled_skills = extended_trainings_map[skill_domain] = {}
    for skill_acquired in tqdm(trainings_map[skill_domain]):
        trainings_list = trainings_map[skill_domain][skill_acquired]

        result = compute_skill_acquired_label(trainings_list, print_prompt=False)
        label = result.skill_label

        # Clusters that receive the same label within a domain are merged
        labeled_skills.setdefault(label, []).extend(trainings_list)

# Persist the relabeled map next to the training data
filename = f"map_clusters_trainings_{training_data_version}.json"
save_path = DATA_TRAININGS_DIR / filename
save_json(save_path, extended_trainings_map)

# Label Skill Domain fields

In [7]:
# Reload the cluster map from the file written by the skill-labeling step above.
filename = f"map_clusters_trainings_{training_data_version}.json"
save_path = DATA_TRAININGS_DIR / filename
trainings_map = read_json(save_path)

In [21]:
# Replace every top-level skill-domain key with the label returned by the agent.
# NOTE(review): the result is saved to the same file the previous cell reads, so
# re-running this cell relabels domains that were already labeled.
extended_trainings_map = {}

for skill_domain in tqdm(trainings_map):
    skills = trainings_map[skill_domain]
    # The helper iterates the dict, i.e. it receives the skill names of this domain
    result = compute_skill_domain_label(skills, print_prompt=False)
    label = result.skill_domain_label

    if label not in extended_trainings_map:
        # Shallow copy so a later merge never mutates trainings_map
        extended_trainings_map[label] = dict(skills)
    else:
        # Two source domains received the same label: merge them instead of
        # silently overwriting the first one (which would drop its trainings).
        # Assumes each skill maps to a list of training ids, as written by the
        # skill-labeling step.
        print(f"WARNING : label '{label}' already used, merging '{skill_domain}' into it")
        merged_skills = extended_trainings_map[label]
        for skill_acquired, training_ids in skills.items():
            merged_skills[skill_acquired] = (
                merged_skills.get(skill_acquired, []) + list(training_ids)
            )

    print(f"LABEL : {label}")
    print(f"RATIONALE : {result.rationale}")
    print("")

# Persist the relabeled map
filename = f"map_clusters_trainings_{training_data_version}.json"
save_path = DATA_TRAININGS_DIR / filename
save_json(save_path, extended_trainings_map)

  8%|▊         | 1/12 [00:01<00:12,  1.15s/it]

LABEL : Financial Risk Management And Compliance
RATIONALE : The majority of the training topics listed revolve around financial risk management, compliance, and related analytical and operational skills within the financial and insurance sectors. These include financial analysis, regulatory compliance, risk assessment, fraud detection, and insurance-specific skills. The outliers, such as 'Customer Information Systems Management', 'Foundational Troubleshooting', and 'Cross-Functional Coordination And Communication', are more general and could apply to various domains, so they do not influence the skill domain name. The chosen skill domain name captures the core themes of financial risk, compliance, and related analytical skills.



 17%|█▋        | 2/12 [00:04<00:21,  2.20s/it]

LABEL : Electrical And Electronic Systems Engineering
RATIONALE : The provided training list primarily revolves around skills related to electrical and electronic systems, including programming, power distribution, safety management, circuit design, diagnostics, and troubleshooting. The proposed skill domain name captures the essence of these trainings. There are no significant outliers in the list that would warrant a different categorization.



 25%|██▌       | 3/12 [00:07<00:22,  2.55s/it]

LABEL : Food Safety And Management
RATIONALE : The majority of the training topics listed are related to food safety, quality management, and operational efficiency within the food industry. These include specific areas such as food safety management, food safety and sanitation, food safety and quality assessment, food logistics management, and food waste management. The inclusion of chemical safety and management in industrial environments, sustainable waste management, and safety compliance and regulatory interpretation, while relevant, are more broadly applicable and do not significantly alter the core focus on food safety and management. Therefore, these broader topics are considered outliers and do not influence the skill domain name.



 33%|███▎      | 4/12 [00:08<00:17,  2.17s/it]

LABEL : Fiber And Paper Industry Operations
RATIONALE : The provided training topics revolve around various aspects of the fiber and paper industry, including environmental compliance, material science, production, quality control, and regulatory compliance. These topics are closely related and specific to the operations within the fiber and paper industry. The outlier in this group is 'Operational Risk And Port Management', which does not directly relate to the core operations of the fiber and paper industry. However, it does not significantly influence the skill domain name as the majority of the topics are clearly centered around fiber and paper industry operations.



 42%|████▏     | 5/12 [00:10<00:13,  1.89s/it]

LABEL : Industrial Equipment Maintenance And Optimization
RATIONALE : The skill domain name 'Industrial Equipment Maintenance And Optimization' captures the essence of the training group, which focuses on various aspects of maintaining, operating, and optimizing industrial equipment. This label is broad enough to include all the listed trainings but specific enough to highlight the core theme of industrial equipment maintenance and optimization. The training 'Automotive Diagnostic And Troubleshooting' could be considered an outlier as it is more specific to the automotive industry, but it still fits within the broader context of equipment maintenance and troubleshooting.



 50%|█████     | 6/12 [00:11<00:09,  1.59s/it]

LABEL : Procurement And Supply Chain Management
RATIONALE : The training topics listed predominantly revolve around procurement, supply chain management, and related strategic planning and negotiation skills. The skill domain name 'Procurement And Supply Chain Management' captures the essence of these training topics. There are no significant outliers in the list that would warrant a different skill domain name.



 58%|█████▊    | 7/12 [00:11<00:06,  1.38s/it]

LABEL : Hospitality And Tourism Management
RATIONALE : The training topics listed are predominantly focused on skills and knowledge required in the hospitality and tourism industries. They cover areas such as guest services, event management, cultural competency, and tourism marketing. The skill domain name 'Hospitality And Tourism Management' encompasses all these areas effectively. There are no significant outliers that would necessitate a broader or narrower domain name.



 67%|██████▋   | 8/12 [00:12<00:04,  1.25s/it]

LABEL : Legal Practice And Advocacy
RATIONALE : The provided training list predominantly revolves around core legal skills, including client interaction, case analysis, document creation, courtroom procedures, and negotiation. These are central to the practice of law and advocacy. The term 'Legal Practice And Advocacy' encompasses all these areas effectively. No significant outliers are detected that would skew the domain name.



 75%|███████▌  | 9/12 [00:13<00:03,  1.13s/it]

LABEL : Maritime And Port Operations Management
RATIONALE : The training topics listed are primarily focused on maritime and port operations, including logistics, safety, compliance, and coordination. The skill domain name 'Maritime And Port Operations Management' captures the essence of these training topics. There are no significant outliers that would influence the skill domain name, as all topics are closely related to maritime and port management.



 83%|████████▎ | 10/12 [00:14<00:02,  1.01s/it]

LABEL : Visual And Artistic Skills
RATIONALE : The training topics listed revolve around visual arts and artistic techniques, including interpretation, creation, and presentation. All topics fit well under the broad category of visual and artistic skills. There are no significant outliers that would warrant a different categorization.



 92%|█████████▏| 11/12 [00:16<00:01,  1.18s/it]

LABEL : Information Management And Digital Security
RATIONALE : The skill domain name 'Information Management And Digital Security' captures the essence of the training group, which primarily focuses on managing, securing, and organizing information and data. The trainings listed involve various aspects of information management, data security, and digital platform management. The term 'Digital Security' is included to reflect the emphasis on data protection, compliance, and security within the digital realm. There are no significant outliers in the list that would skew the skill domain name.



100%|██████████| 12/12 [00:17<00:00,  1.42s/it]

LABEL : Live Event Technical Management
RATIONALE : The provided trainings all revolve around technical and operational aspects of live events, including sound, lighting, electrical systems, and coordination. The term 'Live Event Technical Management' captures the essence of these trainings, emphasizing the technical and managerial skills required in live entertainment and event production. There are no significant outliers in the list that would warrant a different skill domain name.

{'Financial Risk Management And Compliance': {'Financial Software Proficiency': ['tr0', 'tr1', 'tr2'], 'Cost Analysis And Financial Evaluation': ['tr10', 'tr11', 'tr9'], 'Financial Data Analysis': ['tr12', 'tr13'], 'Data Analysis And Compliance': ['tr14', 'tr370'], 'Financial And Compliance Documentation': ['tr15', 'tr54'], 'Financial Statement Preparation And Compliance': ['tr16', 'tr17', 'tr56'], 'Advanced Formula Adjustment': ['tr173'], 'Regulatory Compliance Management': ['tr19', 'tr234', 'tr235', 't




# Add training titles to the map for easier analysis

In [67]:
# Reload the cluster map (same file the labeling steps write) before attaching training titles.
filename = f"map_clusters_trainings_{training_data_version}.json"
save_path = DATA_TRAININGS_DIR / filename
trainings_map = read_json(save_path)

In [68]:
# Turn each list of training ids into an {id: title} mapping so the saved map
# can be read without looking the ids up separately.
extended_trainings_map = {}
for skill_domain in trainings_map:
    titled_domain = extended_trainings_map[skill_domain] = {}
    for skill_acquired in tqdm(trainings_map[skill_domain]):
        titled_skill = titled_domain[skill_acquired] = {}
        for tr_id in trainings_map[skill_domain][skill_acquired]:
            training_info = trainings_info[tr_id]
            titled_skill[tr_id] = training_info.title

# Overwrite the cluster map file with the title-enriched version
filename = f"map_clusters_trainings_{training_data_version}.json"
save_path = DATA_TRAININGS_DIR / filename
save_json(save_path, extended_trainings_map)

100%|██████████| 36/36 [00:00<00:00, 184590.40it/s]


{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance

100%|██████████| 15/15 [00:00<00:00, 121222.66it/s]


{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance

100%|██████████| 13/13 [00:00<00:00, 116757.93it/s]


{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance

100%|██████████| 12/12 [00:00<00:00, 143804.71it/s]


{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance

100%|██████████| 17/17 [00:00<00:00, 132042.90it/s]


{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance

100%|██████████| 15/15 [00:00<00:00, 130257.89it/s]


{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance

100%|██████████| 28/28 [00:00<00:00, 243148.06it/s]


{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance

100%|██████████| 14/14 [00:00<00:00, 181235.36it/s]


{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance

100%|██████████| 22/22 [00:00<00:00, 216607.25it/s]


{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance

100%|██████████| 13/13 [00:00<00:00, 167772.16it/s]


{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance

100%|██████████| 10/10 [00:00<00:00, 132312.43it/s]


{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance

100%|██████████| 9/9 [00:00<00:00, 132918.08it/s]

{'Financial Risk Management And Compliance': {'Financial Software Proficiency': {'tr0': 'Master Financial Software Tools - Beginner Level', 'tr1': 'Intermediate Financial Software Training', 'tr2': 'Advanced Accounting Software Proficiency Program'}, 'Cost Analysis And Financial Evaluation': {'tr10': 'Intermediate Financial Cost Analysis Training', 'tr11': 'Advanced Financial Cost Analysis Training', 'tr9': 'Basic Financial Cost Analysis Training'}, 'Financial Data Analysis': {'tr12': 'Basic Data Analysis Training', 'tr13': 'Intermediate Data Analysis and Interpretation Training'}, 'Data Analysis And Compliance': {'tr14': 'Advanced Analytics and Business Intelligence Training', 'tr370': 'Intermediate Spending Analytics Training'}, 'Financial And Compliance Documentation': {'tr15': 'Basic Financial Reporting Training', 'tr54': 'Basic Financial Compliance Reporting'}, 'Financial Statement Preparation And Compliance': {'tr16': 'Intermediate Corporate Financial Reporting', 'tr17': 'Advance




In [6]:
def extract_extended_training_info(
    training_data,
    model: str = "mistral-small-latest",
    skills_domains: SkillDomainInfo = None,
    print_prompt: bool = False
) -> str:
    """Run the extraction agent on one training description.

    Args:
        training_data: Value inserted into the prompt as ``training_description``.
        model: Model id passed to ``get_agent`` and to ``track_api_call``.
        skills_domains: Value inserted into the prompt as ``skills_domains``.
            NOTE(review): annotated as ``SkillDomainInfo``, but the callers in
            this notebook pass ``skills_domains.get_names_bullet_list()`` —
            confirm the intended type.
        print_prompt: When exactly ``True``, print the formatted prompt before
            calling the agent.

    Returns:
        Whatever calling the agent returns. NOTE(review): annotated as ``str``,
        yet the classification loop wraps the result in ``str(...)`` — confirm
        the actual return type.
    """
    # NOTE(review): EXTENDED_TRAINING_EXTRACTION_PROMPT is not among the names
    # imported in the visible import cell — verify it is in scope on a fresh kernel.
    prompt = EXTENDED_TRAINING_EXTRACTION_PROMPT.format(
        skills_domains=skills_domains,
        training_description=training_data,
    )

    if print_prompt is True:
        print(prompt)

    agent = get_agent(model_id=model)
    response = agent(prompt=prompt)

    # Cost tracking is only possible when the agent exposes `last_response`
    if hasattr(agent, 'last_response'):
        track_api_call(agent.last_response, model)

    return response

In [None]:
# Disabled smoke test (guarded by `if False`): classify one training and print
# both the formatted prompt and the agent's result.
# print(skills_domains.get_names_bullet_list())
if False:
    training_id = 'tr0'
    result = extract_extended_training_info(
        trainings_info[training_id].describe_short(),
        skills_domains=skills_domains.get_names_bullet_list(),
        print_prompt=True,
    )
    print(result)

In [34]:
# Disabled (guarded by `if False`): reload the trainings from the hard-coded v6
# extraction file instead of the configured clustered file.
if False:
    filename = "extracted_trainings_v6.json"
    trainings_save_path = DATA_TRAININGS_DIR / filename
    trainings_data = read_json(trainings_save_path)

    # Parse every stored JSON payload into a TrainingInfo model
    trainings_info = dict(
        (tid, TrainingInfo.model_validate_json(payload))
        for tid, payload in trainings_data.items()
    )

    print(f"✅ Loaded {len(trainings_info)} trainings")
    print("\n" + 50 * "=")

✅ Loaded 497 trainings



# Skill domain classification loop

In [12]:
# Assign a skill domain to every training and store the result in the
# "extended trainings" file. The loop runs at most MAX_LOOPS passes; each pass
# (re)classifies the trainings that are missing from the file or whose stored
# domain is not in the known domain list.
# NOTE(review): `skills_domains` and `skills_domains_data` are not defined in any
# visible cell — confirm they are loaded before this cell on a fresh kernel.
MAX_LOOPS = 2
cache_period = 5  # write intermediate results to disk every `cache_period` classifications

# training_data_version = 'test'

#####
# Load final training file
#####
filename = f"extended_trainings_{training_data_version}.json"
extended_trainings_save_path = DATA_TRAININGS_DIR / filename
# Create an empty results file on first run so the read below always succeeds
if not extended_trainings_save_path.exists():
    save_json(extended_trainings_save_path, {})
ext_trainings_data = read_json(extended_trainings_save_path)

print("START CLASSIFICATION LOOP")
for i in range(MAX_LOOPS):
    print("ITERATE CLASSIFICATION LOOP")

    #####
    # Collect Extended Trainings with skill domain = UNKNOWN
    #####

    # print(ext_trainings_data)
    filt_trainings_ids = []
    for training_id in trainings_data:
        # Never classified yet: queue it without inspecting stored data
        if training_id not in ext_trainings_data:
            filt_trainings_ids.append(training_id)
            continue

        # Stored entries are JSON strings; re-queue those whose domain is not a known one
        ext_training_data = json.loads(ext_trainings_data[training_id])
        if ext_training_data['skill_domain'] not in skills_domains_data['domains']:
            print(f"Skill domain unknwown : {ext_training_data['skill_domain']} for {training_id}")
            filt_trainings_ids.append(training_id)

    # Nothing left to classify: stop before using the remaining passes
    if len(filt_trainings_ids) == 0:
        print("All Skill domain knwown")
        break

    print(f"Number of trainings not classisfied : {len(filt_trainings_ids)}")

    #####
    # Build filtered training list
    #####
    #print(trainings_data)
    filt_trainings_data = {}
    for training_id in filt_trainings_ids:
        filt_trainings_data[training_id] = trainings_info[training_id]

    # trainings_info = {
    #     training_id: TrainingInfo.model_validate_json(data)
    #     for training_id, data in filt_trainings_data.items()
    # }

    #####
    # Start trainings classification
    #####
    print("Start trainings classification")
    iteration = 0
    for training_id in tqdm(filt_trainings_ids):
        #print(training_id)
        iteration = iteration + 1

        print_prompt = False
        # if iteration == 1:
        #     print_prompt = True

        result = extract_extended_training_info(
                filt_trainings_data[training_id].describe_short(),
                skills_domains=skills_domains.get_names_bullet_list(),
                print_prompt=print_prompt
            )
        # The stripped text form of the agent result is used as the domain name
        skill_domain = str(result).strip()

        # Copy the original training record and add the predicted domain to it
        training_data = json.loads(trainings_data[training_id])
        # print(training_data)
        training_data["skill_domain"] = skill_domain
        ext_trainings_data[training_id] = json.dumps(training_data)

        # Periodic checkpoint so an interrupted run keeps its progress
        if iteration % cache_period == 0:
            save_json(extended_trainings_save_path, ext_trainings_data)

        # if iteration == 10: break

# Final save (also runs when the loop exits early via `break`)
save_json(extended_trainings_save_path, ext_trainings_data)

START CLASSIFICATION LOOP
ITERATE CLASSIFICATION LOOP
All Skill domain knwown


# Compute statistics

In [10]:
def compute_trainings_data_stastistics(trainings_data, known_domains=None):
    """Print how many trainings fall into each skill domain.

    Args:
        trainings_data: mapping of training id -> JSON string; each decoded
            record must contain a ``'skill_domain'`` key.
        known_domains: optional container of accepted skill-domain names.
            When omitted, the notebook-level ``skills_domains.domains`` is
            used, which is what this function always relied on before.

    Returns:
        None. Everything is reported through ``print``: one warning line per
        training whose domain is not in ``known_domains``, followed by the
        per-domain counts (most frequent first) and the total.

    Raises:
        KeyError: if a decoded record has no ``'skill_domain'`` key.
    """
    # Only fall back to the notebook global when there is something to check,
    # so an empty input keeps working even if `skills_domains` is not defined.
    if known_domains is None and trainings_data:
        known_domains = skills_domains.domains

    skill_domain_types = []
    for training_id, raw_record in trainings_data.items():
        skill_domain = json.loads(raw_record)['skill_domain']
        skill_domain_types.append(skill_domain)
        if skill_domain not in known_domains:
            print(f"Skill domain issue with training id {training_id} : {skill_domain}")

    type_counts = Counter(skill_domain_types)
    print("\n" + "="*50)
    print("\nskill_domain_types Counts:")
    for type_name, count in type_counts.most_common():
        print(f"{type_name}: {count}")
    print(f"\nTotal count: {sum(type_counts.values())}")

In [11]:
# Show which extended-trainings file is currently targeted, then print the
# skill-domain distribution of the in-memory `ext_trainings_data`.
#print(ext_trainings_data)
print(extended_trainings_save_path)
compute_trainings_data_stastistics(ext_trainings_data)

../data_trainings/extended_trainings_v7.json


skill_domain_types Counts:
Equipment Maintenance: 37
Communication Skills: 34
Risk Assessment and Insurance: 28
Legal and Advisory: 28
Compliance and Ethics: 26
Maritime Operations: 25
Electrical Engineering: 24
Hospitality Management: 24
Accounting and Finance: 23
Logistics and Supply Chain: 21
Electronic Systems: 20
Manufacturing and Production: 19
Information Management: 19
Food Safety, Standards and Innovation: 18
Creative and Presentation Skills: 17
Data Analysis: 13
Graphic Design: 13
Information Technology: 11
Occupational Health and Safety: 10
Chemical and Material Science: 10
Emergency Response: 9
Maritime Safety and Operations: 8
Cybersecurity: 8
Environmental Management: 7
Business Administration: 6
Problem Solving and Analysis: 6
Event Management: 6
Digital Marketing: 6
Technical Writing and Documentation: 5
Leadership and Management: 4
Operations and Logistics: 4
Automotive Maintenance: 3
Organizational Skills: 2
Professional 

In [35]:
#####
# Patch training in error
#####
# Re-run the skill-domain classification for one training with the larger
# model and write the new value back into `ext_trainings_data`.

training_id = 'tr452'
result = extract_extended_training_info(
    trainings_info[training_id].summarize(),
    skills_domains=skills_domains.get_names_bullet_list(),
    model="mistral-medium-latest",
    print_prompt=True
)
skill_domain = str(result).strip()
print(f"skill_domain: {skill_domain}")

# Entries of `ext_trainings_data` are JSON strings when they come from
# read_json / the classification cells, so item assignment on them raises
# TypeError. Update the entry according to the form it actually has.
entry = ext_trainings_data[training_id]
if isinstance(entry, str):
    patched_record = json.loads(entry)
    patched_record['skill_domain'] = skill_domain
    ext_trainings_data[training_id] = json.dumps(patched_record)
elif isinstance(entry, dict):
    entry['skill_domain'] = skill_domain
else:
    # Object-style entry (e.g. a model instance exposing `skill_domain`)
    entry.skill_domain = skill_domain


You are an expert in skill taxonomy, training analysis and job description analysis.

From this training skills information:
Advanced Art History Appreciation Course / Art History Appreciation, Artistic Movements, Art Analysis, Cultural Influences, Artistic Heritage, Artistic Techniques, Historical Significance, Cultural Impact / Art Analysis

Extract the **skill domain** as much as possible coming from the following skill domains list:
- Accounting and Finance
- Agricultural Science
- Artificial Intelligence
- Automotive Maintenance
- Biotechnology
- Business Administration
- Chemical and Material Science
- Civil Engineering
- Cloud Computing
- Communication Skills
- Compliance and Ethics
- Construction Safety
- Creative and Presentation Skills
- Cybersecurity
- Data Analysis
- Digital Marketing
- Early Childhood Education
- Electrical Engineering
- Electronic Systems
- Emergency Response
- Environmental Management
- Equipment Maintenance
- Event Management
- Food Safety, Standards a

In [36]:
# Write the in-memory `ext_trainings_data` to `extended_trainings_save_path`
# (save_json is imported from src.utils).
save_json(extended_trainings_save_path, ext_trainings_data)

In [None]:
# Dump the current `skills_domains` model to a JSON file whose name carries
# `skill_domains_version`, inside DATA_SKILLS_DOMAINS_DIR.
filename = f"skill_domains_{skill_domains_version}.json"
save_path = DATA_SKILLS_DOMAINS_DIR / filename
save_json(save_path, skills_domains.model_dump())

In [28]:
# Convert every entry of `ext_trainings_data` to a JSON string and save the
# resulting id -> JSON-string mapping.
# Entries must expose `.model_dump()` here (objects, not JSON strings).
trainings_data_save = {}
for training_id in ext_trainings_data:
    trainings_data_save[training_id] = json.dumps(ext_trainings_data[training_id].model_dump())
# File name carries `training_data_version`; this also rebinds the
# notebook-level `extended_trainings_save_path`.
filename = f"extended_trainings_{training_data_version}.json"
extended_trainings_save_path = DATA_TRAININGS_DIR / filename
save_json(extended_trainings_save_path, trainings_data_save)

In [29]:
def compute_trainings_data_stastistics(trainings_data):
    """Print the number of trainings per skill domain, most frequent first.

    `trainings_data` maps training ids to JSON strings; each decoded record
    must hold a 'skill_domain' key. Nothing is returned: the banner, one
    "<domain>: <count>" line per domain and the grand total are printed.
    """
    # Tally domains in encounter order so ties keep the original ordering
    domain_tally = Counter(
        json.loads(raw_record)['skill_domain']
        for raw_record in trainings_data.values()
    )

    banner = "=" * 50
    print("\n" + banner)
    print("\nskill_domain_types Counts:")
    for domain_name, occurrences in domain_tally.most_common():
        print(f"{domain_name}: {occurrences}")
    print(f"\nTotal count: {sum(domain_tally.values())}")

In [30]:
# Load training data
# Reads the versioned extended-trainings file into `trainings_data`.
filename = f"extended_trainings_{training_data_version}.json"
trainings_save_path = DATA_TRAININGS_DIR / filename
trainings_data = read_json(trainings_save_path)

# Report the size of what was just loaded (the message previously counted the
# unrelated `trainings_info` mapping).
print(f"✅ Loaded {len(trainings_data)} trainings")
print("\n" + "="*50)

✅ Loaded 497 trainings



In [None]:
# Print the skill-domain distribution for `trainings_data`.
compute_trainings_data_stastistics(trainings_data)

In [7]:
def extend_skills_domains_info(
    trainings_data,
    skill_domains,
    model: str = "mistral-medium-latest",
    print_prompt = False
) -> SkillDomainInfo:
    """Ask the extraction agent for a structured SkillDomainInfo.

    The prompt is SKILL_DOMAIN_EXTENSION_PROMPT filled with `trainings_data`
    (as ``skills``) and `skill_domains`. It is echoed to stdout only when
    `print_prompt` is exactly ``True``. The agent is created with
    ``temperature=0.0`` for the requested `model`, and the call is passed to
    `track_api_call` when the agent exposes a ``last_response`` attribute.
    """
    rendered_prompt = SKILL_DOMAIN_EXTENSION_PROMPT.format(
        skill_domains=skill_domains,
        skills=trainings_data,
    )
    if print_prompt is True:
        print(rendered_prompt)

    agent = get_agent(model_id=model, temperature=0.0)
    domains_info = agent.structured_output(
        output_model=SkillDomainInfo,
        prompt=rendered_prompt,
    )

    # Cost tracking is best-effort: only when the agent kept its last response
    if hasattr(agent, 'last_response'):
        track_api_call(agent.last_response, model)

    return domains_info

In [8]:
# Disabled cell (guarded by `if False:`): starting from an empty
# SkillDomainInfo, it would send the summaries of all extended trainings to
# extend_skills_domains_info and print the returned domains.
if False:
    skills_domains = SkillDomainInfo(domains={})

    # One "- <summary>" bullet line per training, concatenated into one string
    training_list_str = ""
    
    for training_id in ext_trainings_data:
        # Entries must expose `.summarize()` here (objects, not JSON strings)
        training_list_str += f"- {ext_trainings_data[training_id].summarize()}\n"

    new_skills_domains = extend_skills_domains_info(
        training_list_str,
        skills_domains,
        print_prompt=True
    )

    print(new_skills_domains)

In [52]:
def extract_extended_training_info(
    training_data,
    model: str = "mistral-small-latest",
    skills_domains: SkillDomainInfo = None,
    print_prompt: bool = False
) -> str:
    """Ask the extraction agent for the skill domain of one training.

    Args:
        training_data: text describing the training; inserted into the prompt
            as ``training_description``.
        model: model id handed to ``get_agent``.
        skills_domains: the candidate skill domains. Either an object exposing
            ``describe()`` (e.g. a SkillDomainInfo) or an already rendered
            string such as the output of ``get_names_bullet_list()``; cells in
            this notebook use both forms.
        print_prompt: when truthy, print the final prompt before calling the
            agent.

    Returns:
        The value returned by the agent call. Callers in this notebook turn
        it into text with ``str(result).strip()``.

    Raises:
        ValueError: if ``skills_domains`` is not provided.
    """
    if skills_domains is None:
        raise ValueError("skills_domains is required to build the extraction prompt")

    # Accept pre-rendered text as-is; otherwise let the object render itself.
    if isinstance(skills_domains, str):
        domains_text = skills_domains
    else:
        domains_text = skills_domains.describe()

    prompt = EXTENDED_TRAINING_EXTRACTION_PROMPT.format(
        skills_domains=domains_text,
        training_description=training_data
    )

    if print_prompt:
        print(prompt)

    extraction_agent = get_agent(model_id=model)
    result = extraction_agent(prompt=prompt)

    # Cost tracking only when the agent exposes its last response
    if hasattr(extraction_agent, 'last_response'):
        track_api_call(extraction_agent.last_response, model)

    return result

In [53]:
# (Re)initialise the notebook-level skill-domain catalogue with no domains.
skills_domains = SkillDomainInfo(domains={})

In [54]:
# Disabled cell (guarded by `if False:`): reloads the saved skill domains,
# re-classifies the trainings still flagged as unclassified and saves the
# extended trainings as JSON strings.
# Entries of `ext_trainings_data` are accessed by attribute (`.skill_domain`,
# `.model_dump()`), i.e. they must be objects here, not JSON strings.
if False:
    # Load skills domains data
    filename = f"skill_domains_{skill_domains_version}.json"
    save_path = DATA_SKILLS_DOMAINS_DIR / filename
    skills_domains_data = read_json(save_path)
    skills_domains = SkillDomainInfo(**skills_domains_data)
    
    print(f"✅ Loaded {len(skills_domains.domains)} skills domains")
    print("\n" + "="*50)

    #####
    # Collect Extended Trainings with skill domain = UNKNOWN
    #####
    print("ITERATE CLASSIFICATION LOOP")
    filt_trainings_ids = []
    for training_id in ext_trainings_data:
        ext_training_data = ext_trainings_data[training_id]
        #print(ext_training_data)
        # NOTE(review): this filter matches the spelling 'UNKNWON', while the
        # check on the agent answer further down compares with 'UNKNOWN'.
        # Confirm which spelling the stored data really uses.
        if 'UNKNWON' in ext_training_data.skill_domain:
            filt_trainings_ids.append(training_id)
            #filt_trainings_data[training_id] = ext_trainings_data[training_id]

    # print(filt_trainings_ids)

    #####
    # Build filtered training list
    #####
    # The inputs for classification come from `trainings_info`, not from the
    # extended entries themselves.
    #print(trainings_data)
    filt_trainings_data = {}
    for training_id in filt_trainings_ids:
        filt_trainings_data[training_id] = trainings_info[training_id]

    
    #####
    # Start trainings classification
    #####
    print("Start trainings classification")
    iteration = 0
    for training_id in tqdm(filt_trainings_ids):
        #print(training_id)
        iteration = iteration + 1
        
        print_prompt = False
        #if iteration == 1:
        #    print_prompt = True
            
        result = extract_extended_training_info(
            filt_trainings_data[training_id].summarize(),
            skills_domains=skills_domains,
            print_prompt=print_prompt
        )
        skill_domain = str(result).strip()
        #print(skill_domain)

        # Keep the previous value when the agent answers exactly 'UNKNOWN'
        if skill_domain != 'UNKNOWN':
            ext_training_info = ext_trainings_data[training_id]
            ext_training_info.skill_domain = skill_domain
            #ext_training_info_json = json.dumps(ext_training_info.model_dump())
            ext_trainings_data[training_id] = ext_training_info

        # if iteration == 5: break
# print(ext_trainings_data)


    # Serialise each entry to a JSON string and write the versioned file
    trainings_data_save = {}
    for training_id in ext_trainings_data:
        trainings_data_save[training_id] = json.dumps(ext_trainings_data[training_id].model_dump())
    filename = f"extended_trainings_{training_data_version}.json"
    extended_trainings_save_path = DATA_TRAININGS_DIR / filename
    save_json(extended_trainings_save_path, trainings_data_save)

In [25]:
# Classify every training of `trainings_info` that is not yet present in the
# extended-trainings file, saving progress periodically so an interrupted run
# can be resumed.
# Relies on `trainings_info`, `trainings_data`, `skills_domains` and
# `training_data_version` being defined by other cells.
print("Start trainings classification")
# Save the partial results every `cache_period` newly classified trainings
cache_period = 5

filename = f"extended_trainings_{training_data_version}.json"
extended_trainings_save_path = DATA_TRAININGS_DIR / filename

# First run: create an empty file so the read_json call below succeeds
if not extended_trainings_save_path.exists():
    save_json(extended_trainings_save_path, {})

ext_trainings_data = read_json(extended_trainings_save_path)

# Counts only the trainings classified during this run (already cached ids
# are skipped without incrementing it)
iteration = 0
for training_id in tqdm(trainings_info):
    if training_id not in ext_trainings_data:
        #print(training_id)
        iteration = iteration + 1
    
        print_prompt = False
        #if iteration == 1:
        #    print_prompt = True
    
        # NOTE(review): this passes the value returned by
        # get_names_bullet_list(); make sure the active definition of
        # extract_extended_training_info accepts that form.
        result = extract_extended_training_info(
                trainings_info[training_id].summarize(),
                skills_domains = skills_domains.get_names_bullet_list(),
                print_prompt=print_prompt
            )
        skill_domain = str(result).strip()
    
        # Start from the JSON record in `trainings_data`, add the predicted
        # domain and store the result as a JSON string again
        training_data = json.loads(trainings_data[training_id])
        # print(training_data)
        training_data["skill_domain"] = skill_domain
        ext_trainings_data[training_id] = json.dumps(training_data)
    
        if iteration % cache_period == 0:
            save_json(extended_trainings_save_path, ext_trainings_data)
    
# Final save covers the trainings classified since the last periodic save
save_json(extended_trainings_save_path, ext_trainings_data)

Start trainings classification


100%|██████████| 497/497 [03:23<00:00,  2.44it/s]


In [55]:
# Iteratively (at most MAX_LOOPS passes) extend the skill-domain catalogue
# and re-classify the trainings still flagged as unclassified. The loop stops
# early once no training matches the filter.
# Entries of `ext_trainings_data` are accessed by attribute (`.skill_domain`,
# `.summarize()`), i.e. they must be objects here, not JSON strings.
# Results are only updated in memory; this cell does not save anything.
MAX_LOOPS = 6

print("START CLASSIFICATION LOOP")
for i in range(MAX_LOOPS):
    #####
    # Collect Extended Trainings with skill domain = UNKNOWN
    #####
    print("ITERATE CLASSIFICATION LOOP")
    # print(ext_trainings_data)
    filt_trainings_ids = []
    for training_id in ext_trainings_data:
        ext_training_data = ext_trainings_data[training_id]
        # NOTE(review): this filter matches the spelling 'UNKNWON', while the
        # check on the agent answer further down compares with 'UNKNOWN'.
        # Confirm which spelling the stored data really uses.
        if 'UNKNWON' in ext_training_data.skill_domain:
            filt_trainings_ids.append(training_id)

    # Nothing left to classify: stop iterating
    if len(filt_trainings_ids) == 0:
        break

    print(f"Number of trainings not classisfied : {len(filt_trainings_ids)}")

    #####
    # Build filtered training list
    #####
    # The inputs for classification come from `trainings_info`
    #print(trainings_data)
    filt_trainings_data = {}
    for training_id in filt_trainings_ids:
        filt_trainings_data[training_id] = trainings_info[training_id]

    print(len(filt_trainings_data))

    # trainings_info = {
    #     training_id: TrainingInfo.model_validate_json(data)
    #     for training_id, data in filt_trainings_data.items()
    # }

    #####
    # Generate skill domains list
    #####
    print("Generate skill domains list")
    # NOTE(review): the summary list is built from ALL extended trainings, not
    # only from the filtered ones - confirm this is intended.
    training_list_str = ""
    for training_id in ext_trainings_data:
        training_list_str += f"- {ext_trainings_data[training_id].summarize()}\n"

    new_skills_domains = extend_skills_domains_info(
        training_list_str,
        skills_domains,
        print_prompt=False
    )
    print("new_skills_domains:")
    print(new_skills_domains)

    # Merge the returned domains into the notebook-level catalogue (in place)
    skills_domains.domains.update(new_skills_domains.domains)
    #print(skills_domains)

    #####
    # Start trainings classification
    #####
    print("Start trainings classification")
    iteration = 0
    for training_id in tqdm(filt_trainings_ids):
        #print(training_id)
        iteration = iteration + 1

        # Echo the prompt once per pass, for the first training only
        print_prompt = False
        if iteration == 1:
            print_prompt = True

        result = extract_extended_training_info(
            filt_trainings_data[training_id].summarize(),
            skills_domains=skills_domains,
            print_prompt=print_prompt
        )
        skill_domain = str(result).strip()
        #print(skill_domain)

        # Keep the previous value when the agent answers exactly 'UNKNOWN'
        if skill_domain != 'UNKNOWN':
            ext_training_info = ext_trainings_data[training_id]
            ext_training_info.skill_domain = skill_domain
            #ext_training_info_json = json.dumps(ext_training_info.model_dump())
            ext_trainings_data[training_id] = ext_training_info
        #if iteration == 1: break
              
    #filt_trainings_data = {}
# print('-----ext_trainings_data-----')
# print(ext_trainings_data)
# print('-----skills_domains-----')
# print(skills_domains)

START CLASSIFICATION LOOP
ITERATE CLASSIFICATION LOOP
Number of trainings not classisfied : 497
497
Generate skill domains list


KeyboardInterrupt: 

In [19]:
# Dump the current `skills_domains` model to a JSON file whose name carries
# `skill_domains_version`, inside DATA_SKILLS_DOMAINS_DIR.
filename = f"skill_domains_{skill_domains_version}.json"
save_path = DATA_SKILLS_DOMAINS_DIR / filename
save_json(save_path, skills_domains.model_dump())

In [13]:
def compute_trainings_data_stastistics(trainings_data):
    """Report the distribution of skill domains across trainings.

    Each value of `trainings_data` is a JSON string whose decoded form has a
    'skill_domain' entry. The function prints a separator, a heading, one
    "<domain>: <count>" line per domain in decreasing frequency, and the
    total number of trainings. It returns nothing.
    """
    per_domain = Counter()
    for encoded_training in trainings_data.values():
        per_domain[json.loads(encoded_training)['skill_domain']] += 1

    report_lines = [
        f"{domain}: {how_many}" for domain, how_many in per_domain.most_common()
    ]

    print("\n" + "=" * 50)
    print("\nskill_domain_types Counts:")
    for line in report_lines:
        print(line)
    print(f"\nTotal count: {sum(per_domain.values())}")

In [14]:
# Load training data
# Reads the versioned extended-trainings file into `trainings_data`.
filename = f"extended_trainings_{training_data_version}.json"
trainings_save_path = DATA_TRAININGS_DIR / filename
trainings_data = read_json(trainings_save_path)

# Report the size of what was just loaded (the message previously counted the
# unrelated `trainings_info` mapping).
print(f"✅ Loaded {len(trainings_data)} trainings")
print("\n" + "="*50)

✅ Loaded 497 trainings



In [15]:
# Print the skill-domain distribution for `trainings_data`.
compute_trainings_data_stastistics(trainings_data)



skill_domain_types Counts:
Risk and Compliance: 47
Maritime Operations: 37
Hotel and Hospitality: 34
Procurement and Supply Chain: 29
Visual Arts: 29
Legal and Advisory: 28
Equipment Maintenance: 27
Electronics: 21
Tourism and Hospitality: 20
Manufacturing and Production: 18
Information Management: 17
Financial Management: 16
Electrical Engineering: 15
Maritime Safety and Operations: 14
Event Management: 12
Food Safety and Standards: 11
Data Analysis: 10
Environmental Management: 10
Negotiation and Contract Management: 10
Technical and Mechanical Skills: 10
Technical Documentation: 9
Professional and Interpersonal Skills: 8
Safety and Compliance: 7
Chemical and Material Science: 7
Product Development and Innovation: 7
Problem Solving and Analysis: 7
Information Technology: 7
Process and Quality Management: 6
Sales and Marketing: 4
Administrative and Management Skills: 4
Automotive Maintenance: 4
Operations and Logistics: 3
Creative and Presentation Skills: 3
Strategic and Operational