# LLM Coding Capability Evaluation

This notebook evaluates the coding capabilities of LLMs using two metrics:
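1. **Exact Match (EM)**: Whether the generated code is string-identical to the reference implementation after trimming leading/trailing whitespace
2. **Pass@k**: Whether at least one of the first k generations passes; unit-test execution is not implemented yet, so exact match against the reference is used as a stand-in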

In [None]:
import json
import re
from typing import List, Dict, Any, Optional

import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

## 1. Load Dataset

In [None]:
def load_dataset(file_path: str) -> List[Dict[str, Any]]:
    """Load a JSONL dataset (one JSON document per line).

    Blank and whitespace-only lines are skipped, so a trailing newline or an
    empty separator line does not abort the load.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        The parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads('') raises, so tolerate empty lines explicitly.
                continue
            data.append(json.loads(line))
    return data

# Read the evaluation set, then report its size and echo the first record
# as a quick sanity check of the schema.
dataset_path = '../data/eval/code_samples_full.jsonl'
dataset = load_dataset(dataset_path)
print(f"Loaded {len(dataset)} samples")
print("\nExample sample:")
print(json.dumps(dataset[0], indent=2))

## 2. Load LLM Model

In [None]:
# Configure your model path here
# MODEL_PATH is passed unchanged to both from_pretrained() calls below and is
# also recorded in the saved results file.
MODEL_PATH = "Qwen/Qwen2.5-Coder-3B-Instruct"  # Update this with your actual model path

# Load model and tokenizer
# `tokenizer` and `model` are module-level globals that generate_code() reads.
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    # Request half-precision weights.
    torch_dtype=torch.float16,
    # Let the library choose device placement; generate_code() moves its
    # inputs to model.device accordingly.
    device_map="auto"
)
print("Model loaded successfully!")

## 3. Code Generation Function

In [None]:
def create_prompt(sample: Dict[str, Any]) -> str:
    """Build the code-generation prompt for one dataset sample.

    The prompt is Java-specific: it asks for a single ``public static``
    method with no surrounding prose. Only ``func_name`` and ``docstring``
    are read from the sample; any other keys (such as ``language``) are
    ignored, so their absence is not an error.

    Args:
        sample: Dataset record containing at least ``func_name`` and
            ``docstring``.

    Returns:
        The prompt text to feed to the model.

    Raises:
        KeyError: If ``func_name`` or ``docstring`` is missing.
    """
    func_name = sample['func_name']
    docstring = sample['docstring']

    # Strict prompt to get clean Java code
    prompt = f"""You are a Java code generator.

Write ONLY the Java method for this task. Follow these rules:
- Output MUST start with 'public static'
- Output MUST be a complete method
- NO explanations, NO comments, NO markdown

Task: {docstring}
Function name: {func_name}

Write the complete Java method:"""
    return prompt

def _skip_java_literal(code: str, start: int) -> int:
    """Return the index just past the string/char literal opening at ``start``.

    ``code[start]`` must be a single or double quote. Text blocks
    (triple double quotes) are skipped up to their closing delimiter. An
    ordinary literal that is not closed on its own line is treated as ending
    at the newline, so a stray quote cannot swallow the rest of the text.
    """
    quote = code[start]
    if quote == '"' and code.startswith('"""', start):
        end = code.find('"""', start + 3)
        return len(code) if end == -1 else end + 3

    i = start + 1
    n = len(code)
    while i < n:
        ch = code[i]
        if ch == '\\':
            i += 2  # skip the escaped character as well
            continue
        if ch == quote:
            return i + 1
        if ch == '\n':
            return i
        i += 1
    return n


def _find_matching_brace(code: str, open_idx: int) -> int:
    """Return the index of the ``}`` closing the ``{`` at ``open_idx``, or -1.

    Braces inside string/char literals and ``//`` or ``/* */`` comments are
    not counted.
    """
    depth = 0
    i = open_idx
    n = len(code)
    while i < n:
        if code.startswith('//', i):
            newline = code.find('\n', i)
            if newline == -1:
                return -1
            i = newline + 1
        elif code.startswith('/*', i):
            end = code.find('*/', i + 2)
            if end == -1:
                return -1
            i = end + 2
        elif code[i] in '"\'':
            i = _skip_java_literal(code, i)
        else:
            if code[i] == '{':
                depth += 1
            elif code[i] == '}':
                depth -= 1
                if depth == 0:
                    return i
            i += 1
    return -1


def clean_generated_code(raw_output: str) -> str:
    """Extract the first ``public static`` Java method from raw model output.

    Steps:
      1. Strip markdown code fences.
      2. Drop everything before the first ``public static``.
      3. Keep the signature plus the brace-balanced body that follows it.
         The signature is whitespace-trimmed and joined to the body with a
         single space.
      4. If the body is never closed (e.g. generation was cut off), keep the
         text up to the last ``}`` that is present.

    Text without ``public static`` is returned with only fences removed and
    surrounding whitespace trimmed.

    Args:
        raw_output: Decoded model output (without the prompt).

    Returns:
        The cleaned method source, or the fence-stripped text when no method
        is found.
    """
    code = raw_output.strip()

    # Remove markdown code blocks
    code = re.sub(r'```java\s*', '', code)
    code = re.sub(r'```\s*', '', code)

    start = code.find('public static')
    if start == -1:
        return code.strip()
    code = code[start:]

    open_idx = code.find('{')
    if open_idx == -1:
        # No body at all; fall through to the last-brace trim below.
        method = code
    else:
        close_idx = _find_matching_brace(code, open_idx)
        if close_idx == -1:
            body = code[open_idx:]
        else:
            body = code[open_idx:close_idx + 1]
        signature = code[:open_idx].strip()
        method = f"{signature} {body}"

    # Discard anything after the final closing brace (no-op when the body
    # was matched exactly).
    last_brace = method.rfind('}')
    if last_brace != -1:
        method = method[:last_brace + 1]

    return method.strip()

def generate_code(prompt: str, num_samples: int = 1, max_length: int = 256, temperature: float = 0.2, top_p: float = 0.95) -> List[str]:
    """Generate code completions from the model.

    Uses the module-level ``tokenizer`` and ``model``. A single sample is
    produced with greedy decoding; multiple samples are drawn with
    temperature / nucleus sampling.

    Args:
        prompt: The input prompt
        num_samples: Number of completions to generate (for Pass@k)
        max_length: Maximum number of newly generated tokens (the prompt is
            not counted)
        temperature: Sampling temperature; only used when num_samples > 1
        top_p: Nucleus sampling parameter; only used when num_samples > 1

    Returns:
        List of cleaned generated code strings, one per returned sequence
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    sampling = num_samples > 1
    generation_kwargs = {
        "max_new_tokens": max_length,
        "num_return_sequences": num_samples,
        "do_sample": sampling,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if sampling:
        # Greedy decoding ignores these, so only forward them when sampling.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    outputs = model.generate(**inputs, **generation_kwargs)

    generated_texts = []
    for output in outputs:
        # Each returned sequence begins with the prompt tokens. Slicing by
        # token count is reliable even when decoding does not reproduce the
        # prompt string character-for-character.
        new_token_ids = output[prompt_token_count:]
        generated_text = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

        # Clean and extract the method
        generated_texts.append(clean_generated_code(generated_text))

    return generated_texts

# Test code generation
# Smoke test on the first dataset record: build its prompt, generate a single
# completion (num_samples=1), and print it next to the reference for a visual check.
test_sample = dataset[0]
test_prompt = create_prompt(test_sample)
print("Test prompt:")
print(test_prompt)
print("\nGenerating code...")
test_output = generate_code(test_prompt, num_samples=1)
print("\nGenerated code:")
print(test_output[0])
print("\nReference code:")
# `test_sample` is reused by the metric smoke tests in later cells.
print(test_sample['code'])

## 4. Evaluation Metrics

### 4.1 Exact Match (EM)

In [None]:
def normalize_code(code: str) -> str:
    """Return ``code`` with leading and trailing whitespace removed.

    Whitespace inside the code is left untouched, so differences in
    indentation or line breaks still count as differences.
    """
    return code.strip()

def exact_match(generated_code: str, reference_code: str) -> int:
    """Score whether two code strings are identical after normalization.

    Both inputs are passed through ``normalize_code`` before comparison.

    Args:
        generated_code: Code generated by the model
        reference_code: Ground truth reference code

    Returns:
        1 if the normalized strings are equal, 0 otherwise
    """
    is_same = normalize_code(generated_code) == normalize_code(reference_code)
    return int(is_same)

# Test exact match
# Sanity check: the reference compared with itself should score 1, and an
# unrelated string should score 0.
print("Testing Exact Match metric:")
print(f"Match: {exact_match(test_sample['code'], test_sample['code'])}")
print(f"No match: {exact_match('different code', test_sample['code'])}")

### 4.2 Pass@k Metric

In [None]:
def extract_function_code(generated_text: str, func_name: str) -> str:
    """Extract the definition of ``func_name`` from generated text.

    Scanning starts at the first line that mentions ``func_name`` together
    with ``public``, ``private`` or ``static``. Lines are collected until the
    braces opened since that line are balanced again, so closing braces of
    inner blocks (``if``, loops, ...) do not end the extraction early.

    Brace counting is per line and purely textual: braces inside string
    literals or comments are counted too.

    Args:
        generated_text: Raw text that may contain the function.
        func_name: Name of the function to look for.

    Returns:
        The extracted function source, or ``generated_text`` unchanged when
        no matching definition line is found. If the text ends before the
        braces balance, everything from the definition line onward is
        returned.
    """
    code_lines = []
    in_function = False
    depth = 0
    seen_open_brace = False

    for line in generated_text.strip().split('\n'):
        if not in_function:
            has_modifier = any(kw in line for kw in ('public', 'private', 'static'))
            if not (func_name in line and has_modifier):
                continue
            in_function = True

        code_lines.append(line)
        opened = line.count('{')
        depth += opened - line.count('}')
        seen_open_brace = seen_open_brace or opened > 0
        # Only stop once a body has actually been opened; the signature line
        # may precede the opening brace.
        if seen_open_brace and depth <= 0:
            break

    return '\n'.join(code_lines) if code_lines else generated_text

def run_tests(code: str, test_cases: List[Dict]) -> bool:
    """Placeholder for running test cases against generated code.

    This stub does NOT execute anything: it ignores both arguments and
    always returns True. A real implementation for Java code would need to:
    1. Write the code to a .java file
    2. Compile it
    3. Run test cases
    4. Check if all tests pass

    Args:
        code: Generated source code (currently unused).
        test_cases: Test case descriptions (currently unused).

    Returns:
        Always True.
    """
    # Placeholder implementation
    # In a real scenario, you would compile and execute the code
    return True  # Placeholder

def calculate_pass_at_k(completions: List[str], reference_code: str, k: int = 1) -> float:
    """Score Pass@k over the first ``k`` completions.

    "Passing" is approximated by an exact match against the reference code;
    no unit tests are executed.

    Args:
        completions: List of generated code completions
        reference_code: Ground truth reference code
        k: Number of leading completions to consider

    Returns:
        1.0 if any of the first k completions matches the reference,
        0.0 otherwise
    """
    candidates = completions[:k]
    any_hit = any(exact_match(candidate, reference_code) for candidate in candidates)
    return 1.0 if any_hit else 0.0

# Test Pass@k
# The reference sits at index 1 of the hand-built list, so it is outside the
# first-1 window (Pass@1 should be 0.0) but inside the first-3 window
# (Pass@3 should be 1.0).
print("Testing Pass@k metric:")
test_completions = ["wrong code", test_sample['code'], "another wrong code"]
print(f"Pass@1: {calculate_pass_at_k(test_completions, test_sample['code'], k=1)}")
print(f"Pass@3: {calculate_pass_at_k(test_completions, test_sample['code'], k=3)}")

## 5. Run Full Evaluation

In [None]:
def evaluate_model(dataset: List[Dict], k_values: Optional[List[int]] = None, num_samples: int = 10):
    """Run complete evaluation on the dataset.

    For every sample, ``num_samples`` completions are generated. Exact Match
    is scored on the first completion; Pass@k is scored on the first k
    completions for each requested k.

    Args:
        dataset: List of evaluation samples; each needs the keys read by
            ``create_prompt`` plus ``code`` and ``func_name``
        k_values: List of k values for Pass@k metric (defaults to [1, 5, 10])
        num_samples: Number of completions to generate per problem (should be >= max(k_values))

    Returns:
        Dictionary with per-sample score lists (``exact_match``,
        ``pass_at_k``), per-sample details (``samples``) and their means
        (``aggregate``) as plain Python floats
    """
    import warnings

    # Avoid a shared mutable default; copy so the caller's list is never aliased.
    k_values = [1, 5, 10] if k_values is None else list(k_values)

    if k_values and num_samples < max(k_values):
        # Pass@k only inspects the completions that exist, so a larger k
        # would silently be scored on fewer than k samples.
        warnings.warn(
            f"num_samples={num_samples} is smaller than max(k_values)={max(k_values)}; "
            f"Pass@k for k > {num_samples} will only consider {num_samples} completions.",
            stacklevel=2,
        )

    results = {
        'exact_match': [],
        'pass_at_k': {k: [] for k in k_values},
        'samples': []
    }

    for idx, sample in enumerate(tqdm(dataset, desc="Evaluating")):
        prompt = create_prompt(sample)
        completions = generate_code(prompt, num_samples=num_samples)

        # Exact Match is judged on the first generation only
        em_score = exact_match(completions[0], sample['code'])
        results['exact_match'].append(em_score)

        for k in k_values:
            pass_k_score = calculate_pass_at_k(completions, sample['code'], k=k)
            results['pass_at_k'][k].append(pass_k_score)

        results['samples'].append({
            'idx': idx,
            'func_name': sample['func_name'],
            'reference': sample['code'],
            'generated': completions[0],
            'all_completions': completions,
            'exact_match': em_score
        })

    # Aggregate as built-in floats so the result is trivially serializable.
    results['aggregate'] = {
        'exact_match': float(np.mean(results['exact_match'])),
        'pass_at_k': {k: float(np.mean(results['pass_at_k'][k])) for k in k_values}
    }

    return results

In [None]:
# Run evaluation
# Note: Adjust num_samples based on your computational resources
# (num_samples completions are generated for every dataset sample).
# For quick testing, use a subset of the dataset
# The result is stored in `eval_results`, which the reporting cells below read.

# Option 1: Evaluate on full dataset
eval_results = evaluate_model(dataset, k_values=[1, 5, 10], num_samples=10)

# Option 2: Evaluate on a small subset for testing
# eval_results = evaluate_model(dataset[:5], k_values=[1, 5, 10], num_samples=10)

## 6. Display Results

In [None]:
# Print aggregate results
# The metrics are averaged over the samples that were actually evaluated,
# which can be fewer than len(dataset) when a subset was passed to
# evaluate_model, so both counts are reported.
print("="*50)
print("EVALUATION RESULTS")
print("="*50)
print(f"\nDataset size: {len(dataset)} samples")
print(f"Evaluated samples: {len(eval_results['samples'])}")
print(f"\nExact Match (EM): {eval_results['aggregate']['exact_match']:.2%}")
print(f"\nPass@k Scores:")
for k, score in eval_results['aggregate']['pass_at_k'].items():
    print(f"  Pass@{k}: {score:.2%}")

In [None]:
# Print reference vs. first generation for up to three evaluated problems.
banner = "="*50
print("\n" + banner)
print("SAMPLE COMPARISONS")
print(banner)

for position, sample in enumerate(eval_results['samples'][:3], start=1):
    mark = '✓' if sample['exact_match'] else '✗'
    print(f"\nExample {position}: {sample['func_name']}")
    print(f"Exact Match: {mark}")
    print(f"\nReference Code:\n{sample['reference']}")
    print(f"\nGenerated Code:\n{sample['generated']}")
    print("-"*50)

In [None]:
# Visualize results
# Draws two side-by-side bar charts: Pass@k per k (left) and Exact Match (right).
import matplotlib.pyplot as plt

# Plot Pass@k scores
# Keys of the aggregate dict are the k values passed to evaluate_model.
k_values = list(eval_results['aggregate']['pass_at_k'].keys())
pass_k_scores = [eval_results['aggregate']['pass_at_k'][k] for k in k_values]

plt.figure(figsize=(10, 5))

# Pass@k plot
plt.subplot(1, 2, 1)
plt.bar([f"Pass@{k}" for k in k_values], pass_k_scores, color='skyblue')
plt.ylabel('Score')
plt.title('Pass@k Performance')
# Scores are fractions, so fix the axis to the full 0..1 range.
plt.ylim(0, 1)
# Label each bar with its percentage, slightly above the bar top.
for i, v in enumerate(pass_k_scores):
    plt.text(i, v + 0.02, f"{v:.2%}", ha='center')

# EM score
plt.subplot(1, 2, 2)
plt.bar(['Exact Match'], [eval_results['aggregate']['exact_match']], color='lightcoral')
plt.ylabel('Score')
plt.title('Exact Match Performance')
plt.ylim(0, 1)
plt.text(0, eval_results['aggregate']['exact_match'] + 0.02, 
         f"{eval_results['aggregate']['exact_match']:.2%}", ha='center')

plt.tight_layout()
plt.show()

## 7. Save Results

In [None]:
# Save detailed results to JSON
output_path = '../data/eval/evaluation_results.json'

# Prepare results for JSON serialization
# 'dataset_size' is the size of the loaded dataset; 'num_evaluated' is the
# number of samples the aggregate metrics were computed over (smaller when
# only a subset was evaluated).
json_results = {
    'model_path': MODEL_PATH,
    'dataset_size': len(dataset),
    'num_evaluated': len(eval_results['samples']),
    'aggregate_metrics': eval_results['aggregate'],
    'detailed_samples': eval_results['samples'][:10]  # Save first 10 samples
}

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(json_results, f, indent=2)

print(f"Results saved to {output_path}")

## 8. Analysis by Function Type

In [None]:
# Analyze performance by function name
from collections import defaultdict

# Group exact-match scores by the function name stored in each result, so
# the grouping stays correct even if the evaluated samples are not the
# leading entries of `dataset`.
performance_by_func = defaultdict(list)
for sample_result in eval_results['samples']:
    performance_by_func[sample_result['func_name']].append(sample_result['exact_match'])

print("Performance by Function Type:")
print("="*50)
for func_name, scores in sorted(performance_by_func.items()):
    avg_score = np.mean(scores)
    count = len(scores)
    print(f"{func_name:20s}: {avg_score:.2%} ({count} samples)")