In [None]:
# Test the new embedding tool

from embedding_tool import KnowledgeBaseEmbedder

# Build the embedder and run the embedding pipeline once; `result` is the
# object returned by create_embeddings(), read below by string key.
embedder = KnowledgeBaseEmbedder()
result = embedder.create_embeddings()

# Report the scalar statistics in a fixed order, one "Label: value" line each.
summary_fields = (
    ("Status", "status"),
    ("Message", "message"),
    ("Documents loaded", "documents_loaded"),
    ("Chunks created", "chunks_created"),
    ("Vectors stored", "vectors_stored"),
    ("Vector dimensions", "vector_dimensions"),
)
for label, key in summary_fields:
    print(f"{label}: {result[key]}")

# The document types are shown as a single comma-separated line.
doc_types_text = ", ".join(result["doc_types"])
print(f"Document types found: {doc_types_text}")

In [None]:
# Smoke-test retrieval: fetch the vectorstore from the embedder and run one
# sample similarity search against it.

vectorstore = embedder.get_vectorstore()

if not vectorstore:
    print("No vectorstore found")
else:
    # Sample query, limited to the three closest matches
    query = "software development experience"
    results = vectorstore.similarity_search(query, k=3)

    print(f"Query: '{query}'")
    print(f"Found {len(results)} relevant documents:")
    print()

    for rank, doc in enumerate(results, start=1):
        print(f"Result {rank}:")
        # Missing metadata keys are reported as 'unknown'
        meta = doc.metadata
        print(f"  Doc type: {meta.get('doc_type', 'unknown')}")
        print(f"  Source: {meta.get('source', 'unknown')}")
        # Only the first 200 characters of the content are shown
        preview = doc.page_content[:200]
        print(f"  Content preview: {preview}...")
        print()

In [None]:
# Test the web scraper tool

import json
from datetime import datetime
from pathlib import Path

from cv_agents.tools.web_scraper import job_posting_scraper

# URL of the job posting used for this scraper test (from the implementation plan)
test_url = "https://app.welcometothejungle.com/dashboard/jobs/oA1SArxV"

print(f"Testing web scraper with URL: {test_url}")
print("=" * 60)

try:
    # Run the scraper tool on the test URL
    job_posting = job_posting_scraper._run(test_url)

    print("✅ Successfully extracted job posting!")
    print()

    # Headline fields, printed one per line as "Label: value"
    headline_fields = (
        ("Title", "title"),
        ("Company", "company"),
        ("Experience Level", "experience_level"),
        ("Industry", "industry"),
    )
    for label, attr in headline_fields:
        print(f"{label}: {getattr(job_posting, attr)}")
    print()

    skills = job_posting.skills
    print(f"Skills ({len(skills)}): {', '.join(skills)}")
    print()

    requirements = job_posting.requirements
    print(f"Requirements ({len(requirements)}):")
    for position, requirement in enumerate(requirements, start=1):
        print(f"  {position}. {requirement}")
    print()

    print(f"Description (first 300 chars): {job_posting.description[:300]}...")

except Exception as exc:
    # Any failure above (scraping or display) is reported and leaves
    # job_posting set to None so later cells can test for it.
    print(f"❌ Error: {exc}")
    job_posting = None

In [None]:
# Save job posting to persistence directory
#
# Uses `job_posting` and `test_url` from the web scraper cell, together with
# that cell's `json`, `datetime` and `Path` imports. `job_posting` is None
# when scraping failed, in which case nothing is written.


def _filename_part(text, fallback="unknown"):
    """Reduce `text` to characters that are safe to use in a filename.

    Keeps alphanumerics, spaces, hyphens and underscores, and trims
    whitespace from both ends so the later space-to-underscore replacement
    cannot produce stray leading or trailing underscores.

    Args:
        text: Raw value (e.g. company name or job title); may be None.
        fallback: Returned when no usable characters remain.

    Returns:
        The sanitized string, or `fallback` if it would be empty.
    """
    kept = "".join(
        ch for ch in str(text or "") if ch.isalnum() or ch in (' ', '-', '_')
    )
    return kept.strip() or fallback


if job_posting:
    # Take one timestamp so the filename and the `scraped_at` field agree.
    scraped_at = datetime.now()
    timestamp = scraped_at.strftime("%Y%m%d_%H%M%S")
    safe_company = _filename_part(job_posting.company)
    safe_title = _filename_part(job_posting.title)

    filename = f"{timestamp}_{safe_company}_{safe_title}.json".replace(" ", "_")
    output_dir = "job_postings"
    filepath = f"{output_dir}/{filename}"

    # open(..., 'w') does not create missing directories, so make sure the
    # target directory exists before writing.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Convert to dict and add provenance fields
    job_data = job_posting.model_dump()
    job_data["scraped_at"] = scraped_at.isoformat()
    job_data["source_url"] = test_url

    # Serialize before opening the file: a serialization error then cannot
    # leave a truncated file behind.
    serialized = json.dumps(job_data, indent=2, ensure_ascii=False)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

    print(f"💾 Job posting saved to: {filepath}")
    # Report the real on-disk size rather than a character count.
    print(f"📄 File size: {Path(filepath).stat().st_size} bytes")

    # Show the JSON structure: lists as item counts, long strings truncated
    print()
    print("📋 Saved data structure:")
    for key, value in job_data.items():
        if isinstance(value, list):
            print(f"  {key}: [{len(value)} items]")
        elif isinstance(value, str) and len(value) > 50:
            print(f"  {key}: '{value[:50]}...'")
        else:
            print(f"  {key}: {value}")
else:
    print("❌ No job posting data to save")