In [2]:
# Shell escape: query the NVIDIA driver tool to see whether this runtime has a GPU attached.
!nvidia-smi


/bin/bash: line 1: nvidia-smi: command not found


In [3]:
# Mount Google Drive so the dataset and result paths under /content/drive used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import json
import re
from collections import defaultdict
from typing import List, Dict, Any

import datasets
import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
def load_dataset(file_path: str) -> List[Dict[str, Any]]:
    """Load a JSON Lines file into a list of parsed records.

    NOTE(review): this definition reuses the name ``load_dataset`` that the
    import cell binds to ``datasets.load_dataset``. Once this cell has run,
    the bare name refers to this JSONL reader, not the Hugging Face loader.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        One parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (typically a trailing newline at end of file) are
            # not records; json.loads("") would raise on them.
            if not line:
                continue
            records.append(json.loads(line))
    return records

# Load the evaluation dataset
# (a JSONL file on the mounted Drive), then report how many records were read
# and pretty-print the first one as a schema preview.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/data/eval/code_samples_full.jsonl"
dataset = load_dataset(dataset_path)
print(f"Loaded {len(dataset)} samples")
print(f"\nExample sample:")
print(json.dumps(dataset[0], indent=2))

Loaded 60 samples

Example sample:
{
  "repo": "ExampleRepo0",
  "path": "src/handler/Class0.java",
  "func_name": "factorial",
  "language": "java",
  "code": "public static int factorial(int n) { if (n <= 1) return 1; return n * factorial(n - 1); }",
  "docstring": "Calculates factorial recursively.",
  "sha": "sha0000",
  "url": "https://example.com/repo/Class0.java#L0",
  "partition": "train"
}


In [None]:
# Shell escape: list the Hugging Face cache directory to see whether anything has been downloaded yet.
!ls /root/.cache/huggingface

ls: cannot access '/root/.cache/huggingface': No such file or directory


In [None]:
MODEL_PATH = "Qwen/Qwen2.5-Coder-3B-Instruct"

# Load model and tokenizer
# Both are fetched by model id; `device_map="auto"` lets the library decide
# where to place the weights.
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    # The installed transformers release warns that `torch_dtype` is
    # deprecated and asks for `dtype` instead (see this cell's output).
    # NOTE(review): older transformers releases only accept `torch_dtype`;
    # confirm the pinned version before running elsewhere.
    dtype=torch.float16,
    device_map="auto"
)
print("Model loaded successfully!")

Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Model loaded successfully!


In [None]:
def create_prompt(sample: Dict[str, Any]) -> str:
    """Build the code-generation prompt for one dataset sample.

    The prompt text is hard-coded for Java; only the sample's ``docstring``
    and ``func_name`` are interpolated. The sample's ``language`` field is
    not consulted, so samples without it are accepted.

    Args:
        sample: Dataset record providing at least ``'func_name'`` and
            ``'docstring'``.

    Returns:
        The prompt string, ending with the cue line for the method.

    Raises:
        KeyError: If ``'func_name'`` or ``'docstring'`` is missing.
    """
    func_name = sample['func_name']
    docstring = sample['docstring']

    # Strict prompt to get clean Java code
    prompt = f"""You are a Java code generator.

Write ONLY the Java method for this task. Follow these rules:
- Output MUST start with 'public static'
- Output MUST be a complete method
- NO explanations, NO comments, NO markdown
- JUST ONE METHOD, JUST THE CODE, NO HALLUCINATE

Task: {docstring}
Function name: {func_name}

Write the complete Java method:"""
    return prompt

def _find_block_end(code: str, open_idx: int) -> int:
    """Return the index of the `}` closing the `{` at ``open_idx``, or -1 if it is never closed.

    Braces inside string literals, char literals, ``//`` comments and
    ``/* */`` comments are ignored so they cannot end the method early.
    """
    depth = 0
    i = open_idx
    n = len(code)
    while i < n:
        ch = code[i]
        if ch == '"' or ch == "'":
            # Skip to the closing quote. Stop at a newline as well so one
            # stray quote cannot swallow the rest of the text.
            i += 1
            while i < n and code[i] != ch and code[i] != '\n':
                if code[i] == '\\':
                    i += 1  # skip the escaped character
                i += 1
        elif code.startswith('//', i):
            newline = code.find('\n', i)
            if newline == -1:
                return -1
            i = newline
        elif code.startswith('/*', i):
            comment_end = code.find('*/', i + 2)
            if comment_end == -1:
                return -1
            i = comment_end + 1
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def clean_generated_code(raw_output: str) -> str:
    """Extract the first ``public static`` method from raw model output.

    Steps:
      1. Strip surrounding whitespace and markdown code fences.
      2. Drop everything before the first ``public static``.
      3. Keep the signature plus the brace-balanced body that follows it,
         joined by a single space.
      4. If the body is never closed (e.g. generation was cut off), keep the
         text up to the last ``}`` instead.

    Args:
        raw_output: Text produced by the model after the prompt.

    Returns:
        The cleaned method text. When no ``public static`` is present, the
        fence-stripped text is returned unchanged apart from trimming.
    """
    code = raw_output.strip()

    # Remove markdown code blocks
    code = re.sub(r'```java\s*', '', code)
    code = re.sub(r'```\s*', '', code)

    # Remove any explanatory text before the code
    start = code.find('public static')
    if start == -1:
        return code.strip()
    code = code[start:]

    open_idx = code.find('{')
    if open_idx != -1:
        # Take the signature from the same text the body comes from, so the
        # two can never belong to different methods.
        signature = code[:open_idx].strip()
        close_idx = _find_block_end(code, open_idx)
        if close_idx != -1:
            return signature + ' ' + code[open_idx:close_idx + 1]
        code = signature + ' ' + code[open_idx:]

    # Unbalanced or missing body: fall back to cutting after the last brace
    last_brace = code.rfind('}')
    if last_brace != -1:
        code = code[:last_brace + 1]

    return code.strip()

def generate_code(prompt: str, num_samples: int = 1, max_length: int = 256, temperature: float = 0.2, top_p: float = 0.95) -> List[str]:
    """Generate code completions from the model.

    A single sample is decoded greedily; several samples are drawn with
    temperature / nucleus sampling.

    Args:
        prompt: The input prompt
        num_samples: Number of completions to generate (for Pass@k)
        max_length: Maximum number of newly generated tokens
            (passed to ``generate`` as ``max_new_tokens``)
        temperature: Sampling temperature, used only when ``num_samples > 1``
        top_p: Nucleus sampling parameter, used only when ``num_samples > 1``

    Returns:
        List of cleaned generated code strings, one per returned sequence
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    sampling = num_samples > 1
    generation_kwargs = dict(
        max_new_tokens=max_length,
        num_return_sequences=num_samples,
        do_sample=sampling,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if sampling:
        # Greedy decoding ignores these, so only pass them when sampling.
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    generated_texts = []
    for output in outputs:
        # Each returned sequence starts with the prompt tokens. Slicing by
        # token count drops the prompt reliably; comparing decoded text
        # breaks when the prompt is not an exact prefix of it.
        gen_ids = output[prompt_token_count:]
        generated_text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

        # Clean and extract the method
        generated_texts.append(clean_generated_code(generated_text))

    return generated_texts


# Test code generation
# Smoke test on one sample: build its prompt, generate a single completion,
# and print it next to the reference implementation for a manual comparison.
test_sample = dataset[1]
test_prompt = create_prompt(test_sample)
print("Test prompt:")
print(test_prompt)
print("\nGenerating code...")
test_output = generate_code(test_prompt, num_samples=1)
print("\nGenerated code:")
# generate_code returns a list, so this prints a one-element list rather than the bare string
print(test_output)
print("\nReference code:")
print(test_sample['code'])

In [None]:
# After you generate code:
# Generate one completion for samples 1..5 and append each one, together with
# its reference implementation, to a plain-text report on Drive.
output_path = "/content/drive/MyDrive/Colab Notebooks/Qwen-Origin-Eval/generation_results.txt"

# Open the report once for the whole run instead of once per sample.
# Append mode is kept, so results from earlier runs stay in the file.
with open(output_path, "a", encoding="utf-8") as f:
    for i in range(1, 6):
        test_sample = dataset[i]
        test_prompt = create_prompt(test_sample)
        test_output = generate_code(test_prompt, num_samples=1)

        f.write(f"Problem Name: {test_sample['func_name']}\n")
        f.write("=" * 80 + "\n\n")
        f.write("Generated Code:\n")
        f.write(test_output[0] + "\n\n")
        # Newline after the label so the reference starts on its own line,
        # matching the "Generated Code:" section above.
        f.write("Reference Code:\n")
        f.write(test_sample['code'] + "\n\n\n")

print(f"✅ Results written to {output_path}")


NameError: name 'dataset' is not defined

In [6]:
# Load the instruction-following benchmark from the Hugging Face Hub.
# The loader is called through its module because the bare name
# `load_dataset` is rebound by the JSONL helper defined earlier in this
# notebook, which would try to open the dataset id as a local file.
ds = datasets.load_dataset("wis-k/instruction-following-eval")

# Work on the first 30 prompts of the train split.
train_ds = ds["train"]
data = train_ds.select(range(30))
print(data)
print(len(data))

for i in range(len(data)):
    print(data[i])



Dataset({
    features: ['key', 'prompt', 'instruction_id_list', 'kwargs'],
    num_rows: 30
})
30
{'key': 1000, 'prompt': 'Write a 300+ word summary of the wikipedia page "https://en.wikipedia.org/wiki/Raymond_III,_Count_of_Tripoli". Do not use any commas and highlight at least 3 sections that has titles in markdown format, for example *highlighted section part 1*, *highlighted section part 2*, *highlighted section part 3*.', 'instruction_id_list': ['punctuation:no_comma', 'detectable_format:number_highlighted_sections', 'length_constraints:number_words'], 'kwargs': [{'num_highlights': None, 'relation': None, 'num_words': None, 'num_placeholders': None, 'prompt_to_repeat': None, 'num_bullets': None, 'section_spliter': None, 'num_sections': None, 'capital_relation': None, 'capital_frequency': None, 'keywords': None, 'num_paragraphs': None, 'language': None, 'let_relation': None, 'letter': None, 'let_frequency': None, 'end_phrase': None, 'forbidden_words': None, 'keyword': None, 'freque

In [None]:
def generate_response(prompt: str, max_tokens: int = 1024, temperature: float = 0.2) -> str:
    """Generate a single chat response from the model.

    The prompt is wrapped as one user turn with the tokenizer's chat
    template, and the reply is sampled with nucleus sampling (``top_p=0.9``).

    Args:
        prompt: User message content.
        max_tokens: Maximum number of newly generated tokens.
        temperature: Sampling temperature.

    Returns:
        The decoded assistant reply, stripped of surrounding whitespace.
    """
    # Use Qwen's chat template
    messages = [
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Keep only the tokens produced after the prompt. Splitting the decoded
    # text on the word "assistant" would cut off any reply that itself
    # contains that word.
    prompt_token_count = inputs["input_ids"].shape[1]
    gen_ids = outputs[0][prompt_token_count:]
    response = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    return response

# ========================================
# CHANGE THIS to test different samples
# ========================================
TEST_SAMPLE_INDEX = 2  # Change this number to test a different sample (0-29)

# Generate one response up front; the judge test cell further down prints and scores `test_response`.
print(f"Testing with sample {TEST_SAMPLE_INDEX}...")
test_response = generate_response(data[TEST_SAMPLE_INDEX]['prompt'])
print("Test response generated!")

Testing with sample 2...
Test response generated!


In [None]:
# Load Sentence Transformer for judging
# NOTE(review): neither `embedding_model` nor `cosine_similarity` is referenced by the
# cells visible in this notebook, which judge with the LLM instead — confirm this cell is still needed.
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

print("Loading embedding model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("✓ Embedding model loaded!")

Loading embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Embedding model loaded!


In [None]:
def llm_judge_simple(instruction, response, constraint_type, params):
    """Ask the loaded model whether a response satisfies one constraint.

    The instruction, response, constraint id and the full parameter mapping
    are placed into a judging prompt. The model is asked to answer YES or NO.

    Args:
        instruction: The original prompt given to the model under test.
        response: The response being judged.
        constraint_type: Constraint identifier, e.g. ``'punctuation:no_comma'``.
        params: Parameters for the constraint; interpolated into the prompt as-is.

    Returns:
        Tuple ``(passed, explanation)``. ``passed`` is True only when the
        first whole-word YES/NO in the judge's answer is YES. An answer with
        neither word counts as a violation.
    """

    # Build evaluation prompt with ALL params
    prompt = f"""You are a judge. Check if this response follows the constraint.

INSTRUCTION: {instruction}

RESPONSE: {response}

CONSTRAINT: {constraint_type}
PARAMETERS: {params}

Does the response satisfy the constraint?
GUIDE:
First, read the CONSTRAINT to get the citeria of evaluation and the parameter name.
Second, in the PARAMETERS, find the keys that you see in the CONSTRAINT, this will be the threshold if the output is pass or not
For example, number_placeholders = 12, so if the output have 12 placeholders, it pass
RULE:
1. Read the constraint carefully
2. Check the response step-by-step
3. Be STRICT - if constraint says "no comma",if the response has a comma, this means it wrong
4. Answer ONLY "YES" if constraint is SATISFIED, "NO" if VIOLATED
Answer ONLY "YES" or "NO":"""

    # Use model to judge
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,  # Just need "YES" or "NO"
            temperature=0.1,  # Low temp for consistency
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "assistant" depends on that word not appearing in the answer.
    gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
    judge_response = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    # Parse answer: take the first standalone YES/NO. A plain substring test
    # would count "No ... yes ..." as a pass.
    verdict = re.search(r'\b(YES|NO)\b', judge_response.upper())
    if verdict is not None and verdict.group(1) == "YES":
        passed = True
        explanation = "LLM Judge: Constraint satisfied"
    else:
        passed = False
        explanation = "LLM Judge: Constraint violated"

    return passed, explanation

print("✓ LLM Judge function ready!")

✓ LLM Judge function ready!


In [None]:
def evaluate_single(instruction, response, instruction_id_list, kwargs_list):
    """Judge one response against each of its constraints and summarise the outcome.

    Args:
        instruction: The prompt the response was generated for.
        response: The generated response to judge.
        instruction_id_list: Constraint identifiers to check, in order.
        kwargs_list: Parameter mappings aligned by position with
            ``instruction_id_list``; a missing or ``None`` entry is treated
            as an empty mapping.

    Returns:
        Dict with ``'results'`` (one entry per constraint), ``'passed'``,
        ``'total'`` and ``'pass_rate'`` (0 when there are no constraints).
    """
    def _params_at(position):
        # Fall back to an empty mapping when the entry is absent or None
        if position >= len(kwargs_list):
            return {}
        entry = kwargs_list[position]
        return {} if entry is None else entry

    results = []
    for position, constraint_type in enumerate(instruction_id_list):
        # The judge receives the full parameter mapping for this constraint
        verdict, reason = llm_judge_simple(
            instruction, response, constraint_type, _params_at(position)
        )
        results.append({
            'constraint': constraint_type,
            'passed': verdict,
            'explanation': reason
        })

    # Pass rate is the fraction of satisfied constraints (e.g. 2/3 = 0.67)
    total_constraints = len(results)
    passed_constraints = sum(entry['passed'] for entry in results)
    if total_constraints > 0:
        pass_rate = passed_constraints / total_constraints
    else:
        pass_rate = 0

    summary = {
        'results': results,
        'passed': passed_constraints,
        'total': total_constraints,
        'pass_rate': pass_rate  # Score between 0.0 and 1.0
    }
    return summary

# ========================================
# Test evaluation (uses TEST_SAMPLE_INDEX from Cell 5)
# ========================================
# Prints the chosen sample's prompt, the response generated for it, a few
# quick surface statistics, and then the LLM judge's verdict per constraint.
print("Testing LLM judge with ORIGINAL MODEL OUTPUT...")
print("\n" + "=" * 80)
print(f"SAMPLE INDEX: {TEST_SAMPLE_INDEX}")
print("=" * 80)

print("\n" + "=" * 80)
print("ORIGINAL INSTRUCTION:")
print("=" * 80)
print(data[TEST_SAMPLE_INDEX]['prompt'])

print("\n" + "=" * 80)
print("ORIGINAL MODEL OUTPUT (Generated Response):")
print("=" * 80)
print(test_response)
print(f"\nWord count: {len(test_response.split())}")
print(f"Has commas: {',' in test_response}")
# Count *highlighted* spans. The pattern is built outside the f-string
# because backslashes inside f-string expressions are a syntax error before
# Python 3.12. It uses single backslashes so that `\*` matches a literal
# asterisk; the doubled form matched optional backslashes instead.
highlight_count = len(re.findall(r'\*[^*]+\*', test_response))
print(f"Highlighted sections: {highlight_count}")

print("\n" + "=" * 80)
print("CONSTRAINTS TO CHECK:")
print("=" * 80)
for i, constraint in enumerate(data[TEST_SAMPLE_INDEX]['instruction_id_list']):
    print(f"{i+1}. {constraint}")
    print(f"   Parameters: {data[TEST_SAMPLE_INDEX]['kwargs'][i]}")

test_eval = evaluate_single(
    data[TEST_SAMPLE_INDEX]['prompt'],
    test_response,
    data[TEST_SAMPLE_INDEX]['instruction_id_list'],
    data[TEST_SAMPLE_INDEX]['kwargs']
)

print("\n" + "=" * 80)
print("EVALUATION RESULTS:")
print("=" * 80)
print(f"Score: {test_eval['passed']}/{test_eval['total']} = {test_eval['pass_rate']:.1%}\n")

for r in test_eval['results']:
    status = '✓' if r['passed'] else '✗'
    print(f"{status} {r['constraint']}")
    print(f"  → {r['explanation']}")

print("=" * 80)

Testing LLM judge with ORIGINAL MODEL OUTPUT...

SAMPLE INDEX: 2

ORIGINAL INSTRUCTION:
Write a resume for a fresh high school graduate who is seeking their first job. Make sure to include at least 12 placeholder represented by square brackets, such as [address], [name].

ORIGINAL MODEL OUTPUT (Generated Response):
[Name]  
[Address]  
[City, State, ZIP Code]  
[Email Address]  
[Phone Number]  

Objective: To secure a position that allows me to apply my newly acquired knowledge and skills in a dynamic work environment.

Education:
High School - [School Name], [City, State]  
Graduated with Honors in [Major]

Skills:
- Strong analytical and problem-solving abilities
- Excellent communication and interpersonal skills
- Proficient in Microsoft Office Suite (Word, Excel, PowerPoint)
- Basic knowledge of computer programming languages (Python, Java)

Work Experience:
[Company Name], [City, State]  
Internship - [Position Title]  
[Start Date] - [End Date]
- Assisted in developing and maint

In [None]:
# Run evaluation on all samples
# For every prompt in `data`: generate a response, judge it against each of
# its constraints, and write a per-sample section plus a final summary to a
# text report on Drive.
print("Running full evaluation...")
output_path = "/content/drive/MyDrive/Colab Notebooks/Qwen-Origin-Eval/instruction-eval-llm.txt"

all_results = []

with open(output_path, "w", encoding="utf-8") as f:
    f.write("INSTRUCTION FOLLOWING EVALUATION (LLM Judge)\n")
    f.write("=" * 80 + "\n\n")

    # Evaluate each sample
    for idx in tqdm(range(len(data)), desc="Evaluating"):
        sample = data[idx]

        # Generate response
        response = generate_response(sample['prompt'])

        # Evaluate with LLM judge
        eval_result = evaluate_single(
            sample['prompt'],
            response,
            sample['instruction_id_list'],
            sample['kwargs']
        )

        # Store with pass_rate
        all_results.append({
            'idx': idx,
            'pass_rate': eval_result['pass_rate']  # Store pass rate (0.0 to 1.0)
        })

        # Write to file
        f.write(f"\nSAMPLE {idx}\n")
        f.write("=" * 80 + "\n")
        f.write(f"Prompt: {sample['prompt'][:150]}...\n\n")
        f.write(f"Response: {response}\n\n")
        f.write(f"Score: {eval_result['passed']}/{eval_result['total']} = {eval_result['pass_rate']:.1%}\n\n")

        for r in eval_result['results']:
            status = '✓' if r['passed'] else '✗'
            f.write(f"{status} {r['constraint']}: {r['explanation']}\n")

        f.write("\n")

    # Calculate average pass rate across all samples
    total_samples = len(all_results)
    # Guard the division so an empty `data` yields a 0% summary instead of
    # raising ZeroDivisionError after the report header was written.
    if total_samples > 0:
        average_pass_rate = sum(r['pass_rate'] for r in all_results) / total_samples
    else:
        average_pass_rate = 0.0

    # Write summary
    f.write("\n" + "=" * 80 + "\n")
    f.write("FINAL SUMMARY\n")
    f.write("=" * 80 + "\n")
    f.write(f"Total Samples: {total_samples}\n")
    f.write(f"Average Pass Rate: {average_pass_rate:.1%}\n")
    f.write(f"\nBreakdown:\n")

    # Show distribution
    perfect = sum(1 for r in all_results if r['pass_rate'] == 1.0)
    partial = sum(1 for r in all_results if 0 < r['pass_rate'] < 1.0)
    failed = sum(1 for r in all_results if r['pass_rate'] == 0.0)

    f.write(f"  Perfect (100%): {perfect} samples\n")
    f.write(f"  Partial (>0% <100%): {partial} samples\n")
    f.write(f"  Failed (0%): {failed} samples\n")

print(f"\n✓ Evaluation complete!")
print(f"✓ Results saved to {output_path}")
print(f"\nAverage Pass Rate: {average_pass_rate:.1%}")

Running full evaluation...


Evaluating: 100%|██████████| 30/30 [05:08<00:00, 10.30s/it]


✓ Evaluation complete!
✓ Results saved to /content/drive/MyDrive/Colab Notebooks/Qwen-Origin-Eval/instruction-eval-llm.txt

Average Pass Rate: 71.7%



