# Schema Testing - Resume & Job Data Extraction

This notebook demonstrates the improved schema functionality for extracting structured data from:
1. CV/Resume documents
2. Job postings from LinkedIn

Both parsing methods ensure that missing information is returned as the string "None" rather than as a null value.


In [None]:
import os
import sys
from dotenv import load_dotenv, find_dotenv
from pprint import pprint
import json

# Load environment variables from the .env file located by find_dotenv();
# the return value of load_dotenv is deliberately discarded.
_ = load_dotenv(find_dotenv())

# Import our schema and parsing functions
from schema import (
    ResumeSchema, JobSchema, CVParser,
    parse_resume_text, parse_resume_pdf
)
from google_cse_linkedin_search import search_linkedin_jobs_google


## 1. Testing Resume/CV Parsing

In [None]:
# Sample resume text for testing: a fictional, fully populated CV covering
# contact details, summary, work experience, education, skills, certifications,
# projects, languages and salary/availability preferences. It is the input
# passed to parse_resume_text later in this cell.
sample_resume_text = """
John Doe
Senior Python Developer

Contact Information:
Email: john.doe@email.com
Phone: +1-555-0123
Location: San Francisco, CA
LinkedIn: https://linkedin.com/in/johndoe
GitHub: https://github.com/johndoe

Professional Summary:
Experienced Python developer with 8+ years of experience in web development, data science, and machine learning. 
Skilled in Django, Flask, PostgreSQL, and cloud technologies. Strong background in building scalable applications.

Work Experience:

Senior Python Developer | Tech Corp | Jan 2020 - Present
San Francisco, CA | Full-time
• Led development of microservices architecture using Django and PostgreSQL
• Improved application performance by 40% through code optimization
• Mentored junior developers and conducted code reviews
• Technologies: Python, Django, PostgreSQL, Redis, AWS, Docker

Python Developer | StartupXYZ | Jun 2018 - Dec 2019
Remote | Full-time
• Developed RESTful APIs using Flask and SQLAlchemy
• Implemented automated testing with pytest and CI/CD pipelines
• Collaborated with cross-functional teams in Agile environment
• Technologies: Python, Flask, SQLAlchemy, MySQL, Jenkins

Education:
Bachelor of Science in Computer Science
University of California, Berkeley | 2014-2018
GPA: 3.8/4.0
Relevant Coursework: Data Structures, Algorithms, Database Systems, Machine Learning

Skills:
Programming Languages: Python (Expert), JavaScript (Advanced), SQL (Advanced), Java (Intermediate)
Frameworks: Django, Flask, React, Node.js
Databases: PostgreSQL, MySQL, MongoDB, Redis
Cloud: AWS (EC2, S3, RDS), Docker, Kubernetes
Tools: Git, Jenkins, Jira, VS Code

Certifications:
• AWS Certified Solutions Architect - Associate (2021)
• Python Institute PCAP Certification (2019)

Projects:
E-commerce Platform (2020-2021)
• Built scalable e-commerce platform using Django and PostgreSQL
• Implemented payment processing with Stripe API
• Deployed on AWS with auto-scaling capabilities
• GitHub: https://github.com/johndoe/ecommerce-platform

Languages:
• English (Native)
• Spanish (Intermediate)

Expected Salary: $120,000 - $150,000
Availability: 2 weeks notice
Preferred Work Type: Remote or Hybrid
"""

# Run the sample resume through the text parser. The result is read as a
# dict exposing 'success', plus 'parsing_method' and 'data' when parsing
# worked, or 'error' when it did not.
print("Testing resume parsing...")
resume_result = parse_resume_text(sample_resume_text)

print(f"\nParsing Success: {resume_result['success']}")
if not resume_result['success']:
    # Failure path: show the message reported by the parser.
    print(f"Error: {resume_result['error']}")
else:
    # Success path: report how the text was parsed, then dump the payload.
    print(f"Parsing Method: {resume_result['parsing_method']}")
    print("\n=== PARSED RESUME DATA ===")
    pprint(resume_result['data'], width=120)


## 2. Testing 'None' String Handling

In [None]:
# Deliberately sparse resume: most schema fields have no source text, so the
# parser's handling of missing information can be inspected.
minimal_resume = """
Jane Smith
Email: jane@email.com
Python Developer with 3 years experience.
"""

print("Testing with minimal resume data...")
minimal_result = parse_resume_text(minimal_resume)

if minimal_result['success']:
    print("\n=== CHECKING 'None' STRING HANDLING ===")
    data = minimal_result['data']

    # Top-level fields to inspect. Anything the minimal resume does not state
    # is expected to come back as the string 'None' (not a null); a key that
    # is absent altogether is reported as 'MISSING'.
    fields_to_check = [
        'current_salary', 'availability', 'total_years_experience',
        'summary', 'professional_title'
    ]

    for field in fields_to_check:
        value = data.get(field, 'MISSING')
        value_type = type(value).__name__
        print(f"{field}: {repr(value)} (type: {value_type})")

    # Check nested objects
    print("\nContact info fields:")
    contact = data.get('contact_info', {})
    if isinstance(contact, dict):
        for key, value in contact.items():
            # Only the scalar contact fields are of interest; empty lists are skipped.
            if value != [] and key in ['phone', 'address', 'linkedin', 'github']:
                print(f"  {key}: {repr(value)} (type: {type(value).__name__})")
    else:
        # The whole section may itself be a placeholder (e.g. the string
        # 'None'); report it instead of failing on .items().
        print(f"  contact_info: {repr(contact)} (type: {type(contact).__name__})")

else:
    # .get() keeps the failure report from raising if no message was supplied.
    print(f"Minimal parsing failed: {minimal_result.get('error')}")


## 3. Testing Job Parsing with Improved Schema

In [None]:
# Credentials are read from the environment (never hard-coded).
google_api_key = os.environ.get('CUSTOM_SEARCH_API_KEY')
search_engine_id = os.environ.get('GOOGLE_SEARCH_ENGINE_ID')
groq_api_key = os.environ.get('GROQ_API_KEY')


def _key_status(secret):
    """Return a set/missing marker for a credential without revealing its value."""
    return '✓ Set' if secret else '✗ Missing'


print(f"Google API Key: {_key_status(google_api_key)}")
print(f"Search Engine ID: {_key_status(search_engine_id)}")
# NOTE(review): the Groq key is only reported here, not passed on; presumably
# the LLM parsing path reads it from the environment itself - confirm.
print(f"Groq API Key: {_key_status(groq_api_key)}")

if not (google_api_key and search_engine_id):
    # Without both Google credentials the search cannot be issued.
    print("\nSkipping job parsing test - API keys not configured")
else:
    print("\nTesting job parsing with LLM...")
    job_results = search_linkedin_jobs_google(
        api_key=google_api_key,
        search_engine_id=search_engine_id,
        keyword="python developer",
        location="remote",
        num_results=2,
        parsing_method="llm"
    )

    print(f"\nJob Search Success: {job_results['success']}")
    print(f"Parsing Method: {job_results.get('parsing_method')}")
    print(f"Total Found: {job_results['total_found']}")

    if not job_results['jobs']:
        print("No jobs found or parsing failed")
        # Only a failed search carries an error worth showing.
        if not job_results['success']:
            print(f"Error: {job_results.get('error')}")
    else:
        # Show just the first parsed posting to keep the output short.
        print("\n=== FIRST JOB PARSED DATA ===")
        pprint(job_results['jobs'][0], width=120)


## 4. Schema Field Analysis

In [None]:
def _schema_field_names(schema_cls):
    """Return the declared field names of a schema class as a list.

    Prefers the ``model_fields`` mapping (the Pydantic v2 accessor) and falls
    back to the legacy ``__fields__`` mapping, which Pydantic v2 still exposes
    but flags as deprecated. Assumes the schemas are Pydantic models, as the
    use of ``__fields__`` suggests - TODO confirm.
    """
    fields = getattr(schema_cls, "model_fields", None)
    if fields is None:
        fields = schema_cls.__fields__
    return list(fields.keys())


def _print_numbered_fields(field_names):
    """Print the field count followed by a 1-based numbered list of names."""
    print(f"Total fields: {len(field_names)}")
    for i, field in enumerate(field_names, 1):
        print(f"{i:2d}. {field}")


# Analyze the schema structures
print("=== RESUME SCHEMA FIELDS ===")
resume_fields = _schema_field_names(ResumeSchema)
_print_numbered_fields(resume_fields)

print("\n=== JOB SCHEMA FIELDS ===")
job_fields = _schema_field_names(JobSchema)
_print_numbered_fields(job_fields)

# Test schema instantiation
print("\n=== SCHEMA VALIDATION TEST ===")
try:
    # Create empty instances to see default values; this only succeeds if
    # every field of both schemas has a default.
    empty_resume = ResumeSchema()
    empty_job = JobSchema()
    print("✓ Schemas can be instantiated with default values")

    # Check a few default values, including one on a nested object.
    print(f"Resume default name: {repr(empty_resume.full_name)}")
    print(f"Job default title: {repr(empty_job.title)}")
    print(f"Job default company name: {repr(empty_job.company_info.name)}")

except Exception as e:
    # Any failure (validation or attribute access) is shown rather than
    # aborting the notebook run.
    print(f"✗ Schema validation error: {e}")

# Static summary text; it is printed regardless of the results above.
print("\n=== SUMMARY ===")
print("• Enhanced ResumeSchema with comprehensive fields for CV parsing")
print("• Enhanced JobSchema with detailed job information structure")  
print("• All missing fields return 'None' string instead of null")
print("• LLM prompts updated to enforce proper 'None' handling")
print("• Ready for CV vs Job matching analysis")
