# Resume and Job Description Data Generator

This notebook generates synthetic resume and job-description pairs for training a model that helps job applicants tailor their resumes to an advertised role. It uses open-source models from the Hugging Face Hub and provides a Gradio UI for configuring and running the generation.

In [None]:
!pip install -q transformers accelerate bitsandbytes torch gradio

In [None]:
# imports

import os
import requests
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
import json
import pandas as pd
import gradio as gr

In [None]:
# Authenticate against the Hugging Face Hub.
# The access token is read from the Colab secret named 'HF_TOKEN'.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Models selectable in the UI: display label -> Hugging Face Hub repository id.
# The order of the pairs is the order shown in the model dropdown.
# NOTE(review): some of these repositories may be gated on the Hub and require
# accepting a licence with the logged-in account -- confirm before sharing.
AVAILABLE_MODELS = dict([
    ("Mistral-7B", "mistralai/Mistral-7B-Instruct-v0.2"),
    ("Phi-2", "microsoft/phi-2"),
    ("Gemma-2B", "google/gemma-2b-it"),
    ("TinyLlama", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
])

In [None]:
# Industries offered in the UI, each mapped to the job roles that can be
# generated for it. The first role of an industry is the one pre-selected
# when that industry is chosen.
AVAILABLE_INDUSTRIES = {
    "Technology": [
        "Software Engineer",
        "Data Scientist",
        "Product Manager",
        "DevOps Engineer",
        "Frontend Developer",
    ],
    "Healthcare": [
        "Registered Nurse",
        "Medical Assistant",
        "Healthcare Administrator",
        "Pharmacist",
        "Physical Therapist",
    ],
    "Finance": [
        "Financial Analyst",
        "Accountant",
        "Investment Banker",
        "Risk Manager",
        "Portfolio Manager",
    ],
    "Marketing": [
        "Digital Marketing Manager",
        "Content Strategist",
        "SEO Specialist",
        "Brand Manager",
        "Social Media Manager",
    ],
    "Sales": [
        "Account Executive",
        "Sales Manager",
        "Business Development Rep",
        "Sales Engineer",
        "Customer Success Manager",
    ],
    "Education": [
        "Teacher",
        "Curriculum Developer",
        "Academic Advisor",
        "Education Consultant",
        "Training Coordinator",
    ],
}

## Load Model

In [None]:

# Module-level cache for the currently loaded model.
# All three names start out as None and are rebound by load_model():
#   current_model      - the loaded causal-LM object
#   current_pipeline   - the text-generation pipeline built on that model
#   current_model_name - the AVAILABLE_MODELS key the two objects belong to
current_model = current_pipeline = current_model_name = None


In [None]:
def load_model(model_name):
    """Load the selected model in 4-bit precision and build its generation pipeline.

    The loaded objects are cached in the module-level globals
    ``current_model``, ``current_pipeline`` and ``current_model_name`` so that
    repeated calls with the same name are cheap.

    Args:
        model_name: A key of ``AVAILABLE_MODELS``.

    Returns:
        str: A status message. It starts with "✅" on success and with "❌"
        when the name is unknown or loading fails. Errors are reported through
        the status string (not raised) because the caller inspects the string
        for "❌" to decide whether to continue.
    """
    global current_model, current_pipeline, current_model_name
    import gc  # local import: only needed here to release the previous model

    # If model is already loaded, return
    if current_model_name == model_name and current_pipeline is not None:
        return f"✅ {model_name} already loaded!"

    model_path = AVAILABLE_MODELS.get(model_name)
    if model_path is None:
        return f"❌ Unknown model: {model_name}"

    # Release the previous model. The globals are rebound to None instead of
    # being `del`-eted: deleting a global unbinds the name, so a failed load
    # afterwards would make every later call crash with NameError.
    if current_model is not None or current_pipeline is not None:
        current_model = None
        current_pipeline = None
        current_model_name = None
        gc.collect()  # drop lingering references before asking CUDA to free memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print(f"🤖 Loading {model_name}...")

    try:
        # Quantization config
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        # Load model
        # NOTE(review): trust_remote_code=True executes Python shipped inside the
        # model repository; keep AVAILABLE_MODELS limited to trusted repos.
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )

        # Create pipeline
        text_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=800,
            temperature=0.8,
            top_p=0.95,
            do_sample=True
        )
    except Exception as exc:
        # Globals are still None here, so the next call starts from a clean state.
        return f"❌ Failed to load {model_name}: {exc}"

    # Publish the new objects only after everything succeeded, so the cache
    # never points at a half-initialised model.
    current_model = model
    current_pipeline = text_pipeline
    current_model_name = model_name
    return f"✅ {model_name} loaded successfully!"


## Data Generation Functions

In [None]:
def generate_job_description(role, industry):
    """Generate a realistic job description with the currently loaded model.

    Args:
        role: Job title to describe, e.g. "Data Scientist".
        industry: Industry the position belongs to, e.g. "Technology".

    Returns:
        str: The generated text (prompt excluded), stripped of surrounding
        whitespace.

    Raises:
        RuntimeError: If no generation pipeline has been loaded yet.
    """
    if current_pipeline is None:
        raise RuntimeError("No model loaded. Call load_model() before generating text.")

    # The prompt is assembled line by line so that the indentation of this
    # source file is not sent to the model as part of the prompt.
    prompt = "\n".join([
        f"Create a detailed job description for a {role} position in the {industry} industry.",
        "",
        "Include:",
        "- Job title and company type",
        "- Job overview (2-3 sentences)",
        "- Key responsibilities (4-5 bullet points)",
        "- Required qualifications (3-4 items)",
        "- Preferred skills (2-3 items)",
        "",
        "Job Description:",
        "",
    ])

    result = current_pipeline(prompt, return_full_text=False)[0]['generated_text']
    return result.strip()


In [None]:
def generate_matching_resume(role, industry, job_description, max_description_chars=400):
    """Generate a resume that matches the given job description.

    Args:
        role: Job title the candidate applies for.
        industry: Industry of the position.
        job_description: Job description text; only an excerpt is placed in
            the prompt. ``None`` is treated as an empty description.
        max_description_chars: Maximum number of characters of the job
            description to include in the prompt (default 400).

    Returns:
        str: The generated resume text (prompt excluded), stripped of
        surrounding whitespace.

    Raises:
        RuntimeError: If no generation pipeline has been loaded yet.
    """
    if current_pipeline is None:
        raise RuntimeError("No model loaded. Call load_model() before generating text.")

    # Only mark the excerpt with an ellipsis when something was actually cut,
    # so a short description is not presented to the model as incomplete.
    description = (job_description or "").strip()
    if len(description) > max_description_chars:
        excerpt = description[:max_description_chars].rstrip() + "..."
    else:
        excerpt = description

    # The prompt is assembled line by line so that the indentation of this
    # source file is not sent to the model as part of the prompt.
    prompt = "\n".join([
        f"Create a professional resume for a qualified {role} candidate applying to this position in {industry}.",
        "",
        "Job Requirements Summary:",
        excerpt,
        "",
        "Generate a resume with:",
        "- Name and contact info",
        "- Professional summary (2-3 sentences)",
        "- Work experience (2-3 relevant positions with bullet points)",
        "- Skills section (matching job requirements)",
        "- Education",
        "",
        "Resume:",
        "",
    ])

    result = current_pipeline(prompt, return_full_text=False)[0]['generated_text']
    return result.strip()


In [None]:
# main generation function
def generate_synthetic_data(model_name, industry, selected_roles, num_samples):
    """Generate job-description/resume pairs and save them as JSON and CSV.

    Args:
        model_name: Key of AVAILABLE_MODELS identifying the model to use.
        industry: Industry name passed to the prompt builders.
        selected_roles: Roles to generate samples for; must not be empty.
        num_samples: Number of pairs to generate per role. Values that can be
            converted with ``int()`` (e.g. a float coming from a UI slider)
            are accepted.

    Returns:
        tuple: ``(progress_text, preview, files)`` where ``files`` is
        ``['synthetic_resume_data.json', 'synthetic_resume_data.csv']``.
        On invalid input, a model-loading failure, or when no sample could be
        generated, the result is ``(error_message, None, None)``.
        If generation fails part-way, the pairs produced so far are still
        saved and the failure is reported in ``progress_text``.
    """
    # Validate inputs
    if not selected_roles:
        return "❌ Please select at least one role!", None, None

    # range() needs an int, but UI widgets may hand the value over as a float.
    try:
        num_samples = int(num_samples)
    except (TypeError, ValueError):
        return "❌ Samples per role must be a whole number!", None, None
    if num_samples < 1:
        return "❌ Samples per role must be at least 1!", None, None

    # Load model if needed. Failures are turned into a "❌" status whether
    # load_model reports them through its return value or by raising.
    try:
        status = load_model(model_name)
    except Exception as exc:
        status = f"❌ Failed to load {model_name}: {exc}"
    if "❌" in status:
        return status, None, None

    synthetic_data = []
    progress_text = f"🚀 Generating data with {model_name}...\n\n"
    aborted = False

    for role in selected_roles:
        progress_text += f"🔹 Generating {num_samples} samples for: {role}\n"

        for i in range(num_samples):
            try:
                # Generate job description, then a resume matching it
                job_desc = generate_job_description(role, industry)
                resume = generate_matching_resume(role, industry, job_desc)
            except Exception as exc:
                # Stop at the first failure but keep everything generated so far.
                progress_text += f"   ❌ Sample {i+1}/{num_samples} failed: {exc}\n"
                aborted = True
                break

            # Store the pair
            synthetic_data.append({
                'id': len(synthetic_data) + 1,
                'industry': industry,
                'role': role,
                'job_description': job_desc,
                'resume': resume
            })

            progress_text += f"   ✅ Sample {i+1}/{num_samples}\n"

        if aborted:
            break

    if not synthetic_data:
        return progress_text + "\n❌ No samples were generated.\n", None, None

    progress_text += f"\n✨ Generated {len(synthetic_data)} job-resume pairs!\n"

    # Save as JSON
    json_file = 'synthetic_resume_data.json'
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(synthetic_data, f, indent=2)

    # Save as CSV
    csv_file = 'synthetic_resume_data.csv'
    df = pd.DataFrame(synthetic_data)
    df.to_csv(csv_file, index=False)

    preview = _format_preview(synthetic_data, industry, selected_roles)

    return progress_text, preview, [json_file, csv_file]


def _format_preview(synthetic_data, industry, selected_roles):
    """Build the text summary of a non-empty run plus an excerpt of its first sample."""
    rule = '=' * 80
    dashes = '-' * 80
    sample = synthetic_data[0]

    preview = f"{rule}\n"
    preview += f"📊 GENERATED DATA SUMMARY\n"
    preview += f"{rule}\n\n"
    preview += f"Total Samples: {len(synthetic_data)}\n"
    preview += f"Industry: {industry}\n"
    preview += f"Roles: {', '.join(selected_roles)}\n\n"

    # Show first sample
    preview += f"{rule}\n"
    preview += f"📄 SAMPLE #1\n"
    preview += f"{rule}\n\n"
    preview += f"Role: {sample['role']}\n\n"
    preview += f"JOB DESCRIPTION:\n{dashes}\n{sample['job_description'][:400]}...\n\n"
    preview += f"MATCHING RESUME:\n{dashes}\n{sample['resume'][:400]}...\n\n"
    return preview

## Gradio UI

In [None]:
# Build the Gradio app. Components are rendered in the order they are created
# inside the nested Row/Column contexts: a configuration column on the left
# (scale=1) and an output column on the right (scale=2).
with gr.Blocks(title="Resume Data Generator", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 🎯 Synthetic Resume & Job Description Generator
    Generate realistic job descriptions and matching resumes using AI
    """)

    with gr.Row():
        # Left column: all user-configurable inputs plus the trigger button.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Configuration")

            # Choices are the display labels (keys) of AVAILABLE_MODELS.
            # NOTE(review): the hint calls Phi-2 the fastest, but the list also
            # offers TinyLlama (1.1B by its repo name) -- confirm the wording.
            model_dropdown = gr.Dropdown(
                choices=list(AVAILABLE_MODELS.keys()),
                value="Phi-2",
                label="Select Model",
                info="Phi-2 is fastest, Mistral-7B is highest quality"
            )

            # Changing this dropdown refreshes the role checkboxes (see below).
            industry_dropdown = gr.Dropdown(
                choices=list(AVAILABLE_INDUSTRIES.keys()),
                value="Technology",
                label="Select Industry"
            )

            # Initial choices match the default industry ("Technology").
            roles_checkbox = gr.CheckboxGroup(
                choices=AVAILABLE_INDUSTRIES["Technology"],
                value=["Software Engineer"],
                label="Select Roles",
                info="Choose one or more roles"
            )

            # Each sample costs two model generations (job description + resume),
            # so the range is kept small.
            num_samples_slider = gr.Slider(
                minimum=1,
                maximum=5,
                value=2,
                step=1,
                label="Samples per Role",
                info="Number of job-resume pairs per role"
            )

            generate_btn = gr.Button("🚀 Generate Data", variant="primary", size="lg")

        # Right column: read-only outputs filled by generate_synthetic_data.
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Generation Progress")
            progress_output = gr.Textbox(
                label="Status",
                lines=10,
                interactive=False
            )

            gr.Markdown("### 👀 Data Preview")
            preview_output = gr.Textbox(
                label="Sample Output",
                lines=15,
                interactive=False
            )

            download_files = gr.Files(
                label="📥 Download Generated Data",
                interactive=False
            )

    # Keep the role choices in sync with the selected industry
    def update_roles(industry):
        """Return a CheckboxGroup update listing the roles of `industry`.

        The first role of the industry is pre-selected.
        """
        return gr.CheckboxGroup(
            choices=AVAILABLE_INDUSTRIES[industry],
            value=[AVAILABLE_INDUSTRIES[industry][0]]
        )

    industry_dropdown.change(
        fn=update_roles,
        inputs=[industry_dropdown],
        outputs=[roles_checkbox]
    )

    # Run the generation when the button is clicked. The four inputs map
    # positionally onto generate_synthetic_data's parameters and its three
    # return values onto the status box, the preview box and the file list.
    generate_btn.click(
        fn=generate_synthetic_data,
        inputs=[model_dropdown, industry_dropdown, roles_checkbox, num_samples_slider],
        outputs=[progress_output, preview_output, download_files]
    )

    gr.Markdown("""
    ---
    ### 📝 Instructions:
    1. Select your preferred AI model (Phi-2 recommended for speed)
    2. Choose an industry
    3. Select one or more job roles
    4. Set number of samples per role
    5. Click "Generate Data"
    6. Download the JSON and CSV files

    **Note:** First generation will take longer as the model loads (~1-2 min)
    """)

In [None]:
# Launch the Gradio app built above.
# NOTE(review): share=True requests a public share link, so anyone holding the
# URL can trigger generation on this runtime -- confirm that is intended.
# debug=True is presumably used to surface errors in the cell output; verify
# against the Gradio launch() documentation.
demo.launch(share=True, debug=True)