In [1]:
!pip install strands-agents[mistral] python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.12.0-py3-none-any.whl.metadata (13 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.17.0-py3-none-any.whl.metadata (80 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.58b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_sdk-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mistralai>=1.8.2 (from strands-agent

In [20]:
import json
import os
import sys
import dotenv

import pprint

import yaml
from pathlib import Path, PosixPath
from typing import Dict, List, Tuple, TypeVar
from tqdm import tqdm

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

from collections import Counter

sys.path.append('..')

from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
	chat_with_persona,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

from src.my_utils import (
    display_markdown_file,
    call_mistral,
    get_agent,
    batch_extract,
    compute_stat_for_multi_items
)

from src.models.skill_domain_info import SkillDomainInfo
from src.models.training_info import (
    TrainingInfo
)

from src.models.skill_info import(
    SkillLabelingReply,
    SkillDomainLabelingReply
)

from src.prompts.training_extraction_prompt import(
    SKILL_LABELING_PROMPT,
    SKILL_DOMAIN_LABELING_PROMPT
)

# Populate the process environment from the ../env file
dotenv.load_dotenv("../env")

# Report whether MISTRAL_API_KEY is now visible in the environment
if os.getenv("MISTRAL_API_KEY"):
    print("✅ API key found, we're ready to roll")
else:
    print("❌ No MISTRAL_API_KEY found!")
    print("Create an env file with your API key")

✅ API key found, we're ready to roll


In [21]:
# Directory holding the training data files, relative to the notebook's working directory
DATA_TRAININGS_DIR = Path("..") / "data_trainings"

In [22]:
# Read the notebook configuration from the project's YAML file.
# The encoding is passed explicitly so that parsing does not depend on the
# platform's default locale encoding.
with open("../src/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Version tag of the training data; it is interpolated into the data file names
# built in the following cells.
training_data_version = config["training_data_version"]
print(f"training_data_version version: {training_data_version}")

training_data_version version: v7


In [23]:
# Read the clusterized trainings file for the configured data version
filename = "clusterized_trainings_{}.json".format(training_data_version)
trainings_save_path = DATA_TRAININGS_DIR.joinpath(filename)
trainings_data = read_json(trainings_save_path)

# Validate every JSON payload into a TrainingInfo model, keyed by training id
trainings_info = dict(
    (tid, TrainingInfo.model_validate_json(payload))
    for tid, payload in trainings_data.items()
)

loaded_count = len(trainings_info)
print(f"✅ Loaded {loaded_count} trainings")
print(f"\n{'=' * 50}")

✅ Loaded 497 trainings



In [24]:
# Read the final clusters -> trainings map for the configured data version.
# The annotation cell below walks it as domain -> skill -> training ids.
filename = "final_map_clusters_trainings_{}.json".format(training_data_version)
save_path = DATA_TRAININGS_DIR.joinpath(filename)
trainings_map = read_json(save_path)

In [25]:
# Attach to every training record the skill domain and skill under which it is
# listed in trainings_map, then save the annotated records to a new file.
# trainings_map is walked as domain -> skill -> training ids.
for domain, skills in trainings_map.items():
    for skill, training_ids in skills.items():
        # Named `training_id` rather than `id` so the built-in id() is not
        # rebound at notebook scope for the cells that follow.
        for training_id in training_ids:
            # Records are stored as JSON strings: decode, annotate, re-encode.
            training_data_dict = json.loads(trainings_data[training_id])
            training_data_dict['skill_domain'] = domain
            training_data_dict['skill_acquired'] = skill
            # ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes
            training_data = json.dumps(training_data_dict, ensure_ascii=False)
            trainings_data[training_id] = training_data

# Write the annotated records under a separate "final_trainings" file name
filename = f"final_trainings_{training_data_version}.json"
trainings_save_path = DATA_TRAININGS_DIR / filename
save_json(trainings_save_path, trainings_data)