In [None]:
import json
import anthropic
from typing import Dict, List
import time
from pathlib import Path
import random

In [None]:
# The API key lookup below needs `os`, which the import cell above does not provide.
import os

# Initialize Claude client (shared by every request made from this notebook/script).
# A missing ANTHROPIC_API_KEY raises KeyError here, before any request is sent.
client = anthropic.Anthropic(
    api_key=os.environ["ANTHROPIC_API_KEY"]  # Read API key from environment variable
)

def _parse_context_json(raw_text: str) -> Dict:
    """
    Parse the model's reply into the context dictionary.

    The reply is first parsed as-is. If that fails, the text between the
    first '{' and the last '}' is parsed instead, so a reply wrapped in a
    Markdown code fence or surrounded by commentary is still accepted.

    Args:
        raw_text: Text of the model's reply

    Returns:
        The decoded JSON object

    Raises:
        json.JSONDecodeError: If no parseable JSON is found in the reply
        ValueError: If the decoded JSON is not an object
    """
    try:
        parsed = json.loads(raw_text)
    except json.JSONDecodeError:
        start = raw_text.find('{')
        end = raw_text.rfind('}')
        if start == -1 or end < start:
            # Nothing that looks like an object: surface the original error.
            raise
        parsed = json.loads(raw_text[start:end + 1])

    if not isinstance(parsed, dict):
        raise ValueError("Model response is not a JSON object")
    return parsed


def generate_sprint_context(sprint_data: Dict, style_index: int, api_client=None) -> Dict:
    """
    Generate enhanced context for a single sprint using Claude API

    Args:
        sprint_data: Original sprint data; must contain the keys
            'sprint_name', 'sprint_goal' and 'formatted_issues'
        style_index: Which meeting note style to use (0-3)
        api_client: Optional object exposing ``messages.create(...)``.
            When omitted, the module-level ``client`` is used.

    Returns:
        A shallow copy of sprint_data with the parsed model output stored
        under 'team_context'. If the request or the parsing fails, the error
        is printed and the original sprint_data is returned unchanged.

    Raises:
        KeyError: If sprint_data lacks one of the required keys
        IndexError: If style_index is outside the list of styles
    """

    # Define meeting note style instructions
    style_instructions = [
        "Write the sprint goal as a planning meeting summary with team member mentions and discussion points",
        "Write the sprint goal as structured sprint planning decisions with priorities and blockers",
        "Write the sprint goal as conversational team agreement notes",
        "Write the sprint goal as brief action-oriented notes"
    ]

    prompt = f"""
Based on these sprint issues, generate realistic team context and rewrite the sprint goal as meeting notes.

Sprint Name: {sprint_data['sprint_name']}
Original Sprint Goal: {sprint_data['sprint_goal']}
Issues (separated by |||||):
{sprint_data['formatted_issues']}

Generate a realistic but diverse context including:
1. tech_stack: Choose appropriate technologies based on the issues (keep it high-level)
2. application_domain: Infer a plausible domain (e.g., "B2B SaaS platform", "E-commerce site")
3. team_composition: Simple team structure (e.g., "6-person team: 3 backend, 2 frontend, 1 QA")
4. sprint_focus: Main theme based on issues (e.g., "Security hardening", "Feature development")
5. product_stage: General phase (e.g., "Growing user base", "Mature product")

For the sprint goal notes:
{style_instructions[style_index]}

Make the context realistic and ensure it aligns with the actual issues.

Return ONLY a JSON object with this exact structure:
{{
    "tech_stack": "...",
    "application_domain": "...",
    "team_composition": "...",
    "sprint_focus": "...",
    "product_stage": "...",
    "sprint_goal_notes": "..."
}}
"""

    try:
        # Resolved inside the try so a missing/misconfigured client is reported
        # like any other per-sprint failure instead of aborting the caller.
        active_client = client if api_client is None else api_client

        response = active_client.messages.create(
            model="claude-3-5-haiku-20241022",  # Using Haiku for cost efficiency
            max_tokens=400,
            temperature=0.8,  # Higher temperature for more diversity
            messages=[
                {"role": "user", "content": prompt}
            ]
        )

        # Parse the JSON response, tolerating code fences or surrounding prose
        context = _parse_context_json(response.content[0].text)

        # Add the context to a copy so the caller's dict is left untouched
        enhanced_sprint = sprint_data.copy()
        enhanced_sprint['team_context'] = context

        return enhanced_sprint

    except Exception as e:
        # Deliberate best-effort boundary: one bad sprint must not stop a batch.
        print(f"Error processing sprint {sprint_data['sprint_name']}: {str(e)}")
        return sprint_data  # Return original if error

def process_dataset(input_file: str, output_file: str, start_from: int = 0,
                    *, request_delay: float = 0.35):
    """
    Process the entire dataset file

    Every input line yields exactly one output line, so line numbers in the
    output match the input and can be used to pick start_from when resuming.
    Lines that cannot be parsed or enhanced are copied through unchanged.

    Args:
        input_file: Path to input JSONL file
        output_file: Path to output JSONL file (overwritten when start_from
            is 0, appended to otherwise)
        start_from: Zero-based line index to start from (for resuming)
        request_delay: Seconds to pause after each successfully written
            sprint, used as simple client-side rate limiting

    Raises:
        ValueError: If start_from is negative
        OSError: If the input cannot be read or the output cannot be written
    """
    if start_from < 0:
        # A negative index would silently slice from the end of the file.
        raise ValueError(f"start_from must be >= 0, got {start_from}")

    # Read all lines first to know the total
    with open(input_file, 'r', encoding='utf-8') as f:
        all_lines = f.readlines()

    total_sprints = len(all_lines)
    print(f"Total sprints to process: {total_sprints}")

    # Open output file in append mode if resuming
    mode = 'a' if start_from > 0 else 'w'

    with open(output_file, mode, encoding='utf-8') as out_f:
        for i, line in enumerate(all_lines[start_from:], start=start_from):
            try:
                # Parse sprint data
                sprint_data = json.loads(line.strip())

                # Rotate through the four meeting note styles
                style_index = i % 4

                # Generate enhanced context
                enhanced_sprint = generate_sprint_context(sprint_data, style_index)
                record = json.dumps(enhanced_sprint) + '\n'
            except Exception as e:
                print(f"Error on line {i}: {str(e)}")
                # Write original data if enhancement fails. The last line of a
                # file may lack its newline; add it so the next record (for
                # example after a resume in append mode) starts on its own line.
                out_f.write(line if line.endswith('\n') else line + '\n')
                out_f.flush()
                continue

            # Writing happens outside the try so an I/O failure is raised to
            # the caller instead of triggering a second write of the same line.
            out_f.write(record)
            out_f.flush()  # Ensure data is written

            # Progress update
            if (i + 1) % 10 == 0:
                print(f"Processed {i + 1}/{total_sprints} sprints")

            # Rate limiting between API requests
            if request_delay > 0:
                time.sleep(request_delay)

def estimate_cost(num_sprints: int, *, avg_input_tokens: int = 250,
                  avg_output_tokens: int = 300, input_price: float = 0.25,
                  output_price: float = 1.25) -> Dict[str, float]:
    """
    Estimate the cost for processing sprints

    Args:
        num_sprints: Number of sprints to process (must not be negative)
        avg_input_tokens: Rough input tokens per request (sprint data + prompt)
        avg_output_tokens: Rough output tokens per request (context + notes)
        input_price: Price in USD per million input tokens
        output_price: Price in USD per million output tokens

    Returns:
        Cost breakdown with the keys 'input_cost', 'output_cost',
        'total_cost' and 'per_sprint_cost' (0.0 when num_sprints is 0)

    Raises:
        ValueError: If num_sprints is negative
    """
    # NOTE(review): the default prices are the figures this script has always
    # used; confirm they match the published per-million-token rates for the
    # model that is actually being called before trusting the estimate.
    if num_sprints < 0:
        raise ValueError(f"num_sprints must be >= 0, got {num_sprints}")

    total_input_tokens = num_sprints * avg_input_tokens
    total_output_tokens = num_sprints * avg_output_tokens

    input_cost = (total_input_tokens / 1_000_000) * input_price
    output_cost = (total_output_tokens / 1_000_000) * output_price
    total_cost = input_cost + output_cost

    # Guard the average so an empty dataset does not divide by zero.
    per_sprint_cost = total_cost / num_sprints if num_sprints else 0.0

    return {
        "input_cost": input_cost,
        "output_cost": output_cost,
        "total_cost": total_cost,
        "per_sprint_cost": per_sprint_cost
    }

# Example usage: print a cost estimate, ask for confirmation, then run the batch
if __name__ == "__main__":
    # Configuration
    INPUT_FILE = "sprint_goals_training_data-qwen-3B.jsonl"
    OUTPUT_FILE = "enhanced_sprint_training_data.jsonl"
    NUM_SPRINTS = 3000  # Dataset size assumed for the cost estimate

    # Estimate cost (the count is defined once so the figure and the message agree)
    cost_estimate = estimate_cost(NUM_SPRINTS)
    print(f"Estimated costs for {NUM_SPRINTS} sprints:")
    print(f"  Input cost: ${cost_estimate['input_cost']:.2f}")
    print(f"  Output cost: ${cost_estimate['output_cost']:.2f}")
    print(f"  Total cost: ${cost_estimate['total_cost']:.2f}")
    print(f"  Per sprint: ${cost_estimate['per_sprint_cost']:.4f}")

    # Confirm before proceeding
    proceed = input("\nProceed with processing? (yes/no): ")

    # Strip stray whitespace so an answer such as "yes " is not treated as "no"
    if proceed.strip().lower() == 'yes':
        # Process the dataset
        process_dataset(INPUT_FILE, OUTPUT_FILE)
        print("\nProcessing complete!")
    else:
        print("Processing cancelled.")

    # Example: Resume from line 500 if interrupted
    # process_dataset(INPUT_FILE, OUTPUT_FILE, start_from=500)