In [1]:
!pip install strands-agents[mistral] python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.13.0-py3-none-any.whl.metadata (13 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.18.0-py3-none-any.whl.metadata (80 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.59b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_sdk-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mistralai>=1.8.2 (from strands-agent

In [4]:
import json
import os
import sys
import dotenv
import boto3
import requests
from collections import Counter

import pprint

import yaml
from pathlib import Path
from typing import Dict, List, Optional, Tuple, TypeVar
from tqdm import tqdm

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

# AWS authentication
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest

sys.path.append('..')

from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
	chat_with_persona,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

from src.my_utils import (
    display_markdown_file,
    call_mistral,
    get_agent,
    batch_extract,
    send_message_to_chat
)

from src.models.interview_info import(
    InterviewInfo,
    InverviewQualityInfo
)
from src.prompts.interview_prompt import(
    RECOMMANDATION_CONSOLIDATION_PROMPT
)

from src.prompts.persona_extraction_prompt import(
    PERSONA_INTEREST_EXTRACTION_PROMPT
)

from src.models.persona_info import PersonaInfo, InitialPersonaInfo
from src.models.job_info import JobInfo

# Pull environment variables from the "../env" file (expected to define
# MISTRAL_API_KEY)
dotenv.load_dotenv("../env")

# Report whether the key is available before any model call is attempted
if os.getenv("MISTRAL_API_KEY"):
    print("✅ API key found, we're ready to roll")
else:
    print("❌ No MISTRAL_API_KEY found!")
    print("Create an env file with your API key")

✅ API key found, we're ready to roll


In [5]:
# NOTE(review): not referenced by any other cell visible in this notebook;
# presumably an upper bound on interview turns — confirm before relying on it.
MAX_TURNS_IN_INTERVIEW = 10

In [6]:
# Data folders, each a sibling of the notebook directory (one level up).
DATA_PERSONAS_INFO_DIR = Path('..', 'data_personas_info')
DATA_JOBS_DIR = Path('..', 'data_jobs')
DATA_TRAININGS_DIR = Path('..', 'data_trainings')
DATA_INTERVIEWS_DIR = Path('..', 'data_interviews')
DATA_MATCH_JOBS_TRAININGS_DIR = Path('..', 'data_match_jobs_trainings')

In [7]:
# Read the version tags that select which data files the cells below load.
# The encoding is pinned to UTF-8 so the config is decoded the same way on
# every platform instead of depending on the locale default (e.g. cp1252 on
# Windows).
with open("../src/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

personas_info_data_version = config["personas_info_data_version"]
print(f"personas_info_data_version version: {personas_info_data_version}")

job_data_version = config["job_data_version"]
print(f"job_data_version version: {job_data_version}")

training_data_version = config["training_data_version"]
print(f"training_data_version version: {training_data_version}")

interview_data_version = config["interview_data_version"]
print(f"interview_data_version version: {interview_data_version}")

# Composite tag built from the job and training versions
match_jobs_trainings_data_version = f"{job_data_version}_{training_data_version}"

personas_info_data_version version: v14
job_data_version version: v4
training_data_version version: v7
interview_data_version version: v8


In [8]:
# Interviews for the configured interview data version, indexed by persona id
filename = f"job_extension_interviews_{interview_data_version}.json"
interviews_save_path = DATA_INTERVIEWS_DIR.joinpath(filename)
interviews = read_json(interviews_save_path)

In [9]:
# Personas file for the configured personas-info version
filename = f"aging_filter_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR.joinpath(filename)

# Raw mapping: persona id -> JSON-encoded persona record (reused by later cells)
initial_personas_data = read_json(personas_save_path)

# Validate every JSON record into a PersonaInfo instance
personas = dict(
    (pid, PersonaInfo.model_validate_json(raw_json))
    for pid, raw_json in initial_personas_data.items()
)

print(f"✅ Loaded {len(personas)} personas")
print("\n" + "="*50)

✅ Loaded 100 personas



# Parse interviews

In [10]:
def extract_recommendation_type(
    conversation: List[str],
    model: str = "mistral-small-latest",
    print_prompt: bool = False
) -> "PersonaInfo":
    """Derive a persona's recommendation type from an interview conversation.

    The conversation lines are joined with newlines, inserted into
    PERSONA_INTEREST_EXTRACTION_PROMPT and sent to an agent obtained from
    ``get_agent`` (temperature 0.0), which is asked for an
    ``InitialPersonaInfo`` structured output.

    Args:
        conversation: Interview lines, in order.
        model: Model id forwarded to ``get_agent``.
        print_prompt: When truthy, print the rendered prompt and the raw
            structured result (debug aid).

    Returns:
        A new ``PersonaInfo`` (not the raw ``InitialPersonaInfo``) whose
        ``name``, ``location``, ``goals`` and ``education_level`` are copied
        from the extraction result and whose ``recommendation_type`` is:

        - ``'jobs_trainings'`` if ``interested_by_job`` is True (this wins
          even when ``interested_by_training`` is also True),
        - ``'trainings_only'`` if only ``interested_by_training`` is True,
        - ``'awareness'`` otherwise.
    """
    text = '\n'.join(conversation)
    prompt = PERSONA_INTEREST_EXTRACTION_PROMPT.format(conversation=text)

    if print_prompt:
        print(prompt)

    extraction_agent = get_agent(model_id=model, temperature=0.0)
    result = extraction_agent.structured_output(
        output_model=InitialPersonaInfo,
        prompt=prompt,
    )

    if print_prompt:
        print(result)

    persona_info = PersonaInfo()
    persona_info.name = result.name
    persona_info.location = result.location
    persona_info.goals = result.goals
    persona_info.education_level = result.education_level

    # Job interest has priority over training interest. The strict
    # `is True` checks make a missing (None) flag count as "not interested".
    if result.interested_by_job is True:
        persona_info.recommendation_type = 'jobs_trainings'
    elif result.interested_by_training is True:
        persona_info.recommendation_type = 'trainings_only'
    else:
        persona_info.recommendation_type = 'awareness'

    return persona_info

In [16]:
# Consolidated output file. It is created empty on first run so that a rerun
# resumes from whatever has already been written.
filename = f"recommandation_consolidated_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
if not personas_save_path.exists():
    save_json(personas_save_path, {})
personas_data = read_json(personas_save_path)

# Number of personas sent to the model in this run (not read elsewhere in this cell)
new_personas_processed = 0
for person_id, raw_persona_json in tqdm(initial_personas_data.items()):
    # Already consolidated in a previous run: nothing to do
    if person_id in personas_data:
        continue

    # No interview available: carry the original record over unchanged
    if person_id not in interviews:
        personas_data[person_id] = raw_persona_json
        continue

    extracted_info = extract_recommendation_type(
        interviews[person_id]['interview'],
        print_prompt=False
    )
    new_personas_processed += 1

    persona_data_dict = json.loads(raw_persona_json)
    recommendation_type = extracted_info.recommendation_type

    # Anything outside the three known types is ignored (persona not stored)
    if recommendation_type not in ('jobs_trainings', 'trainings_only', 'awareness'):
        continue

    # Report personas whose type differs from the one in the input file
    if recommendation_type != persona_data_dict['recommendation_type']:
        print(f"{person_id} : {persona_data_dict['recommendation_type']} - {recommendation_type}")

    persona_data_dict['recommendation_type'] = recommendation_type
    personas_data[person_id] = json.dumps(persona_data_dict, ensure_ascii=False)

    # Checkpoint after every processed persona
    save_json(personas_save_path, personas_data)

# Final write also persists the personas copied over without an interview
save_json(personas_save_path, personas_data)


  1%|          | 1/100 [00:01<02:06,  1.27s/it]

persona_001 : jobs_trainings - trainings_only


  7%|▋         | 7/100 [00:07<01:34,  1.01s/it]

persona_007 : jobs_trainings - awareness


 22%|██▏       | 22/100 [00:26<02:20,  1.80s/it]

persona_022 : jobs_trainings - trainings_only


 30%|███       | 30/100 [00:34<01:18,  1.13s/it]

persona_030 : jobs_trainings - trainings_only


 44%|████▍     | 44/100 [00:45<00:48,  1.17it/s]

persona_044 : jobs_trainings - trainings_only


 45%|████▌     | 45/100 [00:47<01:00,  1.10s/it]

persona_045 : jobs_trainings - trainings_only


 49%|████▉     | 49/100 [00:50<00:50,  1.00it/s]

persona_049 : jobs_trainings - trainings_only


 86%|████████▌ | 86/100 [01:36<00:20,  1.45s/it]

persona_086 : jobs_trainings - trainings_only


100%|██████████| 100/100 [01:56<00:00,  1.17s/it]


---

# For Debug Only

---

# Redo Persona

In [12]:
# Re-run the consolidation for one persona with print_prompt=True, then
# overwrite that persona's entry in the consolidated file.
person_id = 'persona_007'

filename = f"recommandation_consolidated_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR / filename
personas_data = read_json(personas_save_path)

if person_id in interviews:
    result = extract_recommendation_type(
        interviews[person_id]['interview'],
        print_prompt=True
    )
    recommendation_type = result.recommendation_type
    persona_data_dict = json.loads(initial_personas_data[person_id])

    # Only the three known recommendation types are written back
    if recommendation_type in ('jobs_trainings', 'trainings_only', 'awareness'):
        # Report a change relative to the input file
        if recommendation_type != persona_data_dict['recommendation_type']:
            print(f"{person_id} : {persona_data_dict['recommendation_type']} - {recommendation_type}")

        persona_data_dict['recommendation_type'] = recommendation_type
        personas_data[person_id] = json.dumps(persona_data_dict, ensure_ascii=False)
        print(persona_data_dict)
        save_json(personas_save_path, personas_data)
else:
    # No interview for this persona: restore the original record as-is
    personas_data[person_id] = initial_personas_data[person_id]
    save_json(personas_save_path, personas_data)


You are given :
- an interview conversation with a persona.

From the interview conversation, extract following information:
- Persona interest based on the following precise definitions:
    - Interested by job: 
        - False if user clearly mention is not interested by job
        - Do not set to False if user show interest but lack confidence
    - interested_by_training:
        - False if user clearly mention is not interested by training
        - Do not set to False if user show interest but lack confidence

SPECIAL CASES:
- If user reply in poetic way to answer, try to understand 'behind the line'

Return following fields :
- interested_by_job
- interested_by_training
- rationale = justification of your choice

Conversation:
Assistant: In a single sentence, which domains you are interested in to find a job from follwing list:
- Financial Operations And Compliance Management : Managing financial activities, ensuring regulatory compliance, and supporting risk management in fi

# Statistics

In [17]:
# Reload the consolidated personas file written by the cells above
filename = f"recommandation_consolidated_personas_info_{personas_info_data_version}.json"
personas_save_path = DATA_PERSONAS_INFO_DIR.joinpath(filename)
persona_infos = read_json(personas_save_path)

In [18]:

# Tally how many personas carry each recommendation type. Every value in
# persona_infos is a JSON string that is decoded before reading the field.
recommendation_types = [
    json.loads(persona_json)['recommendation_type']
    for persona_json in persona_infos.values()
]
type_counts = Counter(recommendation_types)

print("\nRecommendation Type Counts:")
for type_name, count in type_counts.most_common():
    print(f"{type_name}: {count}")
print(f"\nTotal count: {sum(type_counts.values())}")



Recommendation Type Counts:
jobs_trainings: 55
trainings_only: 25
awareness: 20

Total count: 100
