In [1]:
!pip install strands-agents[mistral] python-dotenv



In [112]:
import json
import os
import sys
import dotenv
import boto3
import requests
from collections import Counter
import yaml
import copy

from datetime import datetime

import pprint

import yaml
from pathlib import Path
from typing import Dict, List, Optional, Tuple, TypeVar
from tqdm import tqdm

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

# AWS authentication
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest

sys.path.append('..')

from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
    chat_with_persona,
    validate_submission_format,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

from src.my_utils import (
    display_markdown_file,
    call_mistral,
    get_agent,
    batch_extract
)

from src.models.persona_info import PersonaInfo
from src.models.job_info import JobInfo
from src.models.training_info import TrainingInfo
from src.models.generic_models import (
    BooleanModel,
    BooleanModelWithRationale,
    ListOfIds
)

from src.prompts.find_training_matches_prompt import (
    FIND_TRAINING_MATCHES_PROMPT,
    FIND_TRAINING_MATCHES_PROMPT_BY_NAME,
    CHECK_PERSONA_TRAINING_MATCH,
    FIND_TRAINING_MATCHES_FOR_JOB_PROMPT
)

from src.prompts.find_job_matches_prompt import (
    FIND_JOB_MATCHES_PROMPT
)

# Load environment variables (including the API key) from the file at ../env
dotenv.load_dotenv("../env")

# Report whether MISTRAL_API_KEY is available; this only prints a status
# message and does not stop the notebook when the key is missing.
if not os.getenv("MISTRAL_API_KEY"):
    print("❌ No MISTRAL_API_KEY found!")
    print("Create an env file with your API key")
else:
    print("✅ API key found, we're ready to roll")

✅ API key found, we're ready to roll


In [113]:
# Input/output data folders, each located one level above the working directory.
DATA_PERSONAS_INFO_DIR = Path('..', 'data_personas_info')
DATA_JOBS_DIR = Path('..', 'data_jobs')
DATA_TRAININGS_DIR = Path('..', 'data_trainings')
DATA_SKILLS_DOMAINS_DIR = Path('..', 'data_skills_domains')
DATA_MATCH_JOBS_TRAININGS_DIR = Path('..', 'data_match_jobs_trainings')

In [114]:
# Read the data-version pins from the project config.
# The encoding is passed explicitly so the YAML file is decoded the same way
# regardless of the platform's default locale encoding.
with open("../src/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

personas_info_data_version = config["personas_info_data_version"]
print(f"personas_info_data_version version: {personas_info_data_version}")

job_data_version = config["job_data_version"]
print(f"job_data_version version: {job_data_version}")

training_data_version = config["training_data_version"]
print(f"training_data_version version: {training_data_version}")

skill_domains_version = config["skill_domains_version"]
print(f"skill_domains_version version: {skill_domains_version}")

# Composite versions used in the file names of the cross-dataset match files.
match_skills_domains_trainings_data_version = f"{skill_domains_version}_{training_data_version}"
match_jobs_trainings_data_version = f"{job_data_version}_{training_data_version}"

personas_info_data_version version: v14
job_data_version version: v4
training_data_version version: v7
skill_domains_version version: v3


In [115]:
# Folder that receives the generated submission files.
SUBMISSION_DIR = Path('..') / 'submissions'

In [116]:
# Load the skill-domain-classified jobs for the configured data version
filename = f"skill_domain_classified_jobs_{job_data_version}.json"
jobs_save_path = DATA_JOBS_DIR / filename

jobs_data = read_json(jobs_save_path)

# Validate every stored entry into a JobInfo object, keyed by job id
jobs_info = {
    job_id: JobInfo.model_validate_json(data)
    for job_id, data in jobs_data.items()
}

print(f"✅ Loaded {len(jobs_info)} jobs")
print("\n" + "="*50)

✅ Loaded 200 jobs



In [117]:
# Load the final trainings catalogue for the configured data version
filename = f"final_trainings_{training_data_version}.json"
trainings_save_path = DATA_TRAININGS_DIR / filename

trainings_data = read_json(trainings_save_path)

# Validate every stored entry into a TrainingInfo object, keyed by training id
trainings_info = {
    training_id: TrainingInfo.model_validate_json(data)
    for training_id, data in trainings_data.items()
}

print(f"✅ Loaded {len(trainings_info)} trainings")
print("\n" + "="*50)

✅ Loaded 497 trainings



In [118]:
# Load the extracted persona profiles for the configured data version
filename = f"last_final_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename

# personas_data keeps the raw JSON strings; later cells re-parse them with json.loads
personas_data = read_json(personas_save_path)

# Validate every stored entry into a PersonaInfo object, keyed by persona id
personas = {
    pid: PersonaInfo.model_validate_json(data)
    for pid, data in personas_data.items()
}

print(f"✅ Loaded {len(personas)} personas")
print("\n" + "="*50)

✅ Loaded 100 personas



In [119]:
# Job -> required skill levels map: for each job id, a dict whose keys are
# "<domain> / <skill>" strings and whose values are skill-level labels
# (this is how matching_agent_process_jobs reads it).
filename = f"match_jobs_trainings_{match_jobs_trainings_data_version}.json"
save_path = DATA_MATCH_JOBS_TRAININGS_DIR.joinpath(filename)
jobs_trainings_map = read_json(save_path)

In [120]:
# Domain -> skill -> training ids map for the configured trainings version.
filename = f"final_map_clusters_trainings_{training_data_version}.json"
save_path = DATA_TRAININGS_DIR.joinpath(filename)
trainings_map = read_json(save_path)

# Copy of the map whose top-level (domain) keys are lower-cased, for
# case-insensitive domain lookups. Nested skill keys keep their original case.
trainings_map_lower = dict(
    (domain_name.lower(), skills_of_domain)
    for domain_name, skills_of_domain in trainings_map.items()
)

# Process submission file

In [121]:
def get_skill_level_number(level_str):
    """Convert a skill-level label into its numeric rank.

    "None" -> 0, "Basic" -> 1, "Intermediate" -> 2, "Advanced" -> 3.

    Raises:
        KeyError: if ``level_str`` is not one of the four labels above.
    """
    ordered_labels = ("None", "Basic", "Intermediate", "Advanced")
    rank_by_label = {label: rank for rank, label in enumerate(ordered_labels)}
    return rank_by_label[level_str]

In [122]:
def matching_agent_process_jobs(persona_id, persona_data, verbose=False):
    """Suggest, for each job selected for a persona, trainings that close its skill gaps.

    The job ids come from ``persona_data['selected_job_ids']`` when that key is
    present, otherwise from ``persona_data['proposed_job_ids']``. For every
    "<domain> / <skill>" requirement listed for a job in ``jobs_trainings_map``,
    the persona's level for the same key (0 when the persona has no such key)
    is compared with the level the job requires. When the persona is below the
    requirement, each training in ``trainings_map[domain][skill]`` whose
    ``level_acquired`` is above the persona's level and not above the job's
    level is suggested.

    Args:
        persona_id: Persona identifier; only used in error messages.
        persona_data: Persona record as a dict; it is also passed to
            ``PersonaInfo(**persona_data)`` to read the persona's skills.
        verbose: When True, print each job id as it is processed.

    Returns:
        A list with one ``{'job_id': ..., 'suggested_trainings': [...]}`` dict
        per selected job, in the order the job ids were given. A job that is
        missing from ``jobs_trainings_map`` is kept with an empty list.

    Raises:
        KeyError: if neither job-id key is present in ``persona_data``, if a
            skill-level label is unknown, or if a training id is missing from
            ``trainings_info``.
    """
    persona_skills_list = PersonaInfo(**persona_data).skills

    # Only fall back to 'proposed_job_ids' when 'selected_job_ids' is absent,
    # so a persona carrying just the selected ids does not raise a KeyError.
    if 'selected_job_ids' in persona_data:
        selected_jobs_ids = persona_data['selected_job_ids']
    else:
        selected_jobs_ids = persona_data['proposed_job_ids']

    data_jobs = []
    for job_id in selected_jobs_ids:
        if verbose is True:
            print(f"--- job_id {job_id}---")

        job_requirements = jobs_trainings_map.get(job_id)
        if job_requirements is None:
            # Report and keep going, in the same style as
            # matching_agent_process_trainings, instead of aborting the run.
            print(f"ERROR {persona_id}: job unknown {job_id}")
            job_requirements = {}

        selected_trainings_ids = []
        for domain_skill, job_skill_level_str in job_requirements.items():
            job_skill_level = get_skill_level_number(job_skill_level_str)

            # NOTE(review): job requirement keys are split on " / " below while
            # matching_agent_process_trainings splits persona skill keys on
            # " : ". Confirm both maps share one key format; if they differ
            # this lookup never matches and every persona is treated as level 0.
            if domain_skill in persona_skills_list:
                persona_skill_level = get_skill_level_number(persona_skills_list[domain_skill])
            else:
                persona_skill_level = 0

            # Persona already meets the requirement: nothing to suggest.
            if persona_skill_level >= job_skill_level:
                continue

            parts = domain_skill.split(" / ")
            if len(parts) < 2:
                print(f"ERROR {persona_id}: domain_skill bad formatted {domain_skill}")
                continue
            domain, skill = parts[0], parts[1]

            candidate_ids = trainings_map.get(domain, {}).get(skill)
            if candidate_ids is None:
                print(f"ERROR {persona_id}: no trainings mapped for {domain_skill}")
                continue

            for training_id in candidate_ids:
                training_level = get_skill_level_number(trainings_info[training_id].level_acquired)
                # Keep trainings that lift the persona without overshooting the job.
                if persona_skill_level < training_level <= job_skill_level:
                    selected_trainings_ids.append(training_id)

        data_jobs.append({
            'job_id': job_id,
            'suggested_trainings': selected_trainings_ids
        })

    return data_jobs

In [124]:
def matching_agent_process_trainings(persona_id, persona_data, verbose=False):
    """Pick the trainings that take each persona skill exactly one level up.

    Every key of ``persona_data['skills']`` is expected to look like
    "<domain> : <skill>". The domain is looked up case-insensitively in
    ``trainings_map_lower``; the skill is looked up as written. Keys that are
    malformed or unknown are reported with an ERROR line and skipped.

    Args:
        persona_id: Persona identifier; only used in error messages.
        persona_data: Persona record as a dict with a ``'skills'`` mapping of
            skill key -> level label.
        verbose: When True, print the levels compared for each skill/training.

    Returns:
        The ids of all trainings whose ``level_acquired`` is exactly one level
        above the persona's current level for the matching skill.
    """
    persona_skills = persona_data['skills']
    next_level_training_ids = []

    for skill_key in persona_skills:
        pieces = skill_key.split(" : ")
        if len(pieces) < 2:
            print(f"ERROR {persona_id}: domain_skill bad formated")
            continue

        domain, skill = pieces[0], pieces[1]
        domain_key = domain.lower()

        if domain_key not in trainings_map_lower:
            print(f"ERROR {persona_id}: domain unkown {domain}")
            continue

        skills_of_domain = trainings_map_lower[domain_key]
        if skill not in skills_of_domain:
            print(f"ERROR {persona_id}: skill unkown {skill}")
            continue

        persona_level_str = persona_skills[skill_key]
        level = get_skill_level_number(persona_level_str)

        if verbose is True:
            print(f"domain_skill : {skill_key}")
            print(f"persona level : {persona_level_str} - {level}")

        for training_id in skills_of_domain[skill]:
            training_level_str = trainings_info[training_id].level_acquired
            training_level = get_skill_level_number(training_level_str)
            if verbose is True:
                print("---")
                print(f"training_id : {training_id}")
                print(f"training_level_str : {training_level_str}")
                print(f"training_level : {training_level}")
            # Keep only trainings sitting one step above the persona's level.
            if training_level - level == 1:
                next_level_training_ids.append(training_id)

    return next_level_training_ids
    

In [126]:
def matching_agent_process(
    persona_id,
    persona_info,
    verbose=False
):
    """Build the submission record for one persona.

    The record always holds ``'persona_id'`` and ``'predicted_type'`` plus one
    type-specific field:

    * age below 16 -> ``'awareness'`` with ``predicted_items = 'too_young'``
    * ``recommendation_type == "trainings_only"`` -> ``'trainings_only'`` with
      ``trainings`` from ``matching_agent_process_trainings``
    * ``recommendation_type == "awareness"`` -> ``'awareness'`` with
      ``predicted_items = 'info'``
    * anything else -> ``'jobs+trainings'`` with ``jobs`` from
      ``matching_agent_process_jobs``

    Args:
        persona_id: Key of the persona in the module-level ``personas_data``.
        persona_info: Object exposing ``age`` and ``recommendation_type``.
        verbose: When True, print intermediate values.

    Returns:
        ``(matched, data)`` where ``data`` is the record described above.
        ``matched`` is True on every code path.
    """
    # Raw dict form of the persona, needed by the two matching helpers.
    persona_data = json.loads(personas_data[persona_id])

    if verbose:
        print(persona_info)

    data = {'persona_id': persona_id}

    age = persona_info.age
    if verbose:
        print(f"age : {age}")

    # Personas under 16 only receive an awareness message.
    if age < 16:
        data['predicted_type'] = 'awareness'
        data['predicted_items'] = 'too_young'
        if verbose:
            print("persona is minor => awareness too_young")
        return True, data

    recommendation_type = persona_info.recommendation_type
    if recommendation_type == "trainings_only":
        data['predicted_type'] = 'trainings_only'
        data['trainings'] = matching_agent_process_trainings(
            persona_id, persona_data, verbose=verbose
        )
    elif recommendation_type == "awareness":
        data['predicted_type'] = 'awareness'
        data['predicted_items'] = 'info'
    else:
        # Default case: jobs are always reported, even when the list is empty.
        data['predicted_type'] = 'jobs+trainings'
        data['jobs'] = matching_agent_process_jobs(
            persona_id, persona_data, verbose=verbose
        )

    return True, data

In [None]:
submission_version = "v29"
# Results are flushed to disk every `cache_period` newly matched personas.
cache_period = 5

filename = f"results_{submission_version}.json"
save_path = SUBMISSION_DIR / filename

# Start from an empty result list on the first run so the cell can resume
# from a partially written file on later runs.
if not save_path.exists():
    save_json(save_path, [])

results = read_json(save_path)

# Personas already present in the results file are skipped below.
list_of_persona_done = [result['persona_id'] for result in results]
# Set copy for constant-time membership checks inside the loop.
personas_done = set(list_of_persona_done)

personas_matched = 0
new_items_processed = 0

for persona_id, persona_info in tqdm(personas.items(), desc="Generating recommendations"):
    if persona_id in personas_done:
        continue

    new_items_processed += 1
    personas_matched_status, data = matching_agent_process(
        persona_id,
        persona_info,
        verbose=False
    )

    if personas_matched_status is True:
        personas_matched += 1
        results.append(data)

        # Periodic checkpoint so an interrupted run loses little work.
        if personas_matched % cache_period == 0:
            save_json(save_path, results)

# Save results
save_json(save_path, results)
print(f"\n✅ Generated recommendations for {len(results)} personas")
print(f"📁 Results saved to: {save_path}")

# Count types for debugging (plain dict so the printed form stays a dict literal)
type_counts = dict(Counter(r.get('predicted_type', 'unknown') for r in results))
print(f"\n📊 Type distribution: {type_counts}")

# Final cost summary for matching
print("\n📊 Final matching costs:")
print_cost_summary()
print("\n" + "="*50)

---

# For Debug Only

---

In [123]:
# Disabled debug probe: flip the condition to run the jobs matcher verbosely
# on a single persona.
if False:
    persona_id = 'persona_010'
    persona_data = json.loads(personas_data[persona_id])
    print(matching_agent_process_jobs(persona_id, persona_data, verbose=True))

In [125]:
# Disabled debug probe: flip the condition to run the trainings matcher
# verbosely on a single persona.
if False:
    persona_id = 'persona_022'
    persona_data = json.loads(personas_data[persona_id])
    print(matching_agent_process_trainings(persona_id, persona_data, verbose=True))

In [127]:
# Disabled debug probe: flip the condition to build the full record of a
# single persona with verbose output.
if False:
    persona_id = 'persona_088'
    debug_outcome = matching_agent_process(persona_id, personas[persona_id], verbose=True)
    personas_matched_status, data = debug_outcome
    print(personas_matched_status)
    print(data)

In [130]:
# Write a variant of the saved submission ("results_<version>_1.json") in
# which the predicted types are kept but the recommended items are emptied:
# 'jobs' for jobs+trainings records and 'trainings' for trainings_only records.
filename = f"results_{submission_version}.json"
save_path = SUBMISSION_DIR / filename
initial_submission = read_json(save_path)

# Deep copy so the records loaded above stay untouched.
new_submission = copy.deepcopy(initial_submission)
for r in new_submission:
    record_type = r['predicted_type']
    if record_type == 'jobs+trainings':
        r['jobs'] = []
    elif record_type == 'trainings_only':
        r['trainings'] = []

filename = f"results_{submission_version}_1.json"
save_path = SUBMISSION_DIR / filename
save_json(save_path, new_submission)

In [113]:
# Re-read the saved submission and check it against the expected format.
filename = f"results_{submission_version}.json"
save_path = SUBMISSION_DIR / filename
results = read_json(save_path)

try:
    validate_submission_format(results)
except ValueError as e:
    print(f"❌ Format error: {e}")
    print("Fix this before submitting!")
else:
    # Only the validator call is guarded; the success message runs when it did not raise.
    print("✅ Format is valid! Ready to submit")

✅ Validated 100 results - format is correct!
✅ Format is valid! Ready to submit


In [113]:
# Summarise the submission: how many records per predicted type, and for
# awareness records, how many per awareness item.
predicted_types = [result["predicted_type"] for result in results]
awareness_types = [
    result["predicted_items"]
    for result in results
    if result["predicted_type"] == "awareness"
]


def _print_counts(header, values):
    """Print ``header``, one "name: count" line per value (most common first), then the total."""
    counts = Counter(values)
    print(header)
    for type_name, count in counts.most_common():
        print(f"{type_name}: {count}")
    print(f"\nTotal count: {sum(counts.values())}")
    return counts


type_counts = _print_counts("\npredicted_types Counts:", predicted_types)
type_counts = _print_counts("\nawareness_types Counts:", awareness_types)



predicted_types Counts:
trainings_only: 46
jobs+trainings: 34
awareness: 20

Total count: 100

awareness_types Counts:
too_young: 11
info: 9

Total count: 20
