In [1]:
!pip install -q datasets groq together transformers pandas numpy

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/111.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.7/46.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

Part 1: Prompt Design & Code Generation

In [2]:
import json
import time
import pandas as pd
import numpy as np
from typing import List, Dict, Any
from datasets import load_dataset
import re
from datetime import datetime
from groq import Groq
import os

In [None]:
GROQ_API_KEY = "your-api-key"
groq_client = Groq(api_key=GROQ_API_KEY)


In [4]:
ds = load_dataset("openai/openai_humaneval")
humaneval = ds["test"].to_pandas()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

openai_humaneval/test-00000-of-00001.par(…):   0%|          | 0.00/83.9k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/164 [00:00<?, ? examples/s]

In [5]:
selected_indices = [0, 1, 2, 3, 4, 9, 11, 14, 25, 39]
selected_problems = humaneval.iloc[selected_indices].reset_index(drop=True)


In [6]:
print("Problem IDs:", selected_problems['task_id'].tolist())


Problem IDs: ['HumanEval/0', 'HumanEval/1', 'HumanEval/2', 'HumanEval/3', 'HumanEval/4', 'HumanEval/9', 'HumanEval/11', 'HumanEval/14', 'HumanEval/25', 'HumanEval/39']


In [7]:
class PromptStrategies:
    """Six different prompting strategies as required"""

    @staticmethod
    def chain_of_thought(prompt):
        return f"""Let's think about this step by step.

{prompt}

First, I'll understand the requirements, then write the solution.
Let me work through this methodically:"""

    @staticmethod
    def stepwise_cot(prompt):
        return f"""I'll solve this problem following these explicit steps:
Step 1: Parse the function signature and understand inputs/outputs
Step 2: Analyze the provided examples
Step 3: Identify edge cases to handle
Step 4: Write the implementation

{prompt}

Following each step:"""

    @staticmethod
    def self_planning(prompt):
        return f"""I need to create a plan before implementing this.

{prompt}

My implementation plan:
1. Data structures needed:
2. Algorithm approach:
3. Edge cases to handle:

Now implementing based on this plan:"""

    @staticmethod
    def self_debugging(prompt):
        return f"""I'll write this solution and then check it for bugs.

{prompt}

Initial implementation (then I'll review for bugs):"""

    @staticmethod
    def self_edit(prompt):
        return f"""I'll implement this focusing on clean, efficient code.

{prompt}

Here's my optimized solution:"""

    @staticmethod
    def self_repair(prompt):
        return f"""I'll write a robust solution with proper error handling.

{prompt}

Robust implementation with error handling:"""


In [8]:
def call_openai(prompt):
    """Call openai model (Model Family 1: Mixture of Experts)"""
    try:
        completion = groq_client.chat.completions.create(
            model="openai/gpt-oss-20b",
            messages=[
                {"role": "system", "content": "You are an expert Python programmer. Write clean, correct code."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
            max_tokens=800,
            top_p=1,
            stream=False
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"openai error: {e}")
        time.sleep(5)
        return ""

def call_llama(prompt):
    """Call Llama model (Model Family 2: Meta's LLaMA)"""
    try:
        completion = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": "You are an expert Python programmer. Write clean, correct code."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
            max_tokens=800,
            top_p=1,
            stream=False
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"Llama error: {e}")
        time.sleep(5)
        return ""

In [9]:
def extract_code(text):
    """Extract Python code from model response"""

    if "```python" in text:
        pattern = r'```python\n(.*?)```'
        matches = re.findall(pattern, text, re.DOTALL)
        if matches:
            return matches[-1].strip()

    if "```" in text:
        pattern = r'```\n(.*?)```'
        matches = re.findall(pattern, text, re.DOTALL)
        if matches:
            return matches[-1].strip()

    lines = text.split('\n')
    code_lines = []
    in_func = False
    base_indent = 0

    for line in lines:
        if line.strip().startswith('def '):
            in_func = True
            base_indent = len(line) - len(line.lstrip())
            code_lines = [line]
        elif in_func:
            if line.strip() == '':
                code_lines.append(line)
            elif len(line) - len(line.lstrip()) > base_indent:
                code_lines.append(line)
            elif line.strip() and not line[0].isspace():
                break
            else:
                code_lines.append(line)

    if code_lines:
        return '\n'.join(code_lines)

    return text.strip()

def test_solution(code, test_code, entry_point):
    """Test generated code against test cases"""
    try:
        namespace = {}

        exec(code, namespace)

        exec(test_code, namespace)

        if 'check' in namespace and entry_point in namespace:
            try:
                namespace['check'](namespace[entry_point])
                return True, "All tests passed"
            except AssertionError as e:
                return False, f"Test failed: {str(e)}"
            except Exception as e:
                return False, f"Runtime error: {str(e)}"
        else:
            return False, "Function not found in generated code"

    except SyntaxError as e:
        return False, f"Syntax error: {str(e)}"
    except Exception as e:
        return False, f"Execution error: {str(e)}"

In [10]:
def run_part1_experiment():
    """Part 1: Test prompting strategies (8 points)"""

    print("\n" + "="*60)
    print("PART 1: Prompt Design & Code Generation")
    print("Testing 6 strategies × 2 model families × 10 problems")
    print("="*60)

    strategies = {
        "CoT": PromptStrategies.chain_of_thought,
        "SCoT": PromptStrategies.stepwise_cot,
        "Self-Plan": PromptStrategies.self_planning,
        "Self-Debug": PromptStrategies.self_debugging,
        "Self-Edit": PromptStrategies.self_edit,
        "Self-Repair": PromptStrategies.self_repair
    }

    models = {
        "openai/gpt-oss-20b": call_openai,
        "Llama3-70B": call_llama
    }

    results = []
    total_tests = len(selected_problems) * len(strategies) * len(models)
    current_test = 0

    for prob_idx, problem in selected_problems.iterrows():
        print(f"\nProblem {prob_idx+1}/10: {problem['task_id']}")

        for strat_name, strat_func in strategies.items():
            for model_name, model_func in models.items():
                current_test += 1
                print(f"  [{current_test}/{total_tests}] {strat_name} + {model_name}: ", end="", flush=True)

                prompt = strat_func(problem['prompt'])

                response = ""
                for attempt in range(3):
                    response = model_func(prompt)
                    if response:
                        break
                    print("↻", end="", flush=True)
                    time.sleep(3)

                code = extract_code(response)
                success, error = test_solution(code, problem['test'], problem['entry_point'])

                results.append({
                    'problem_id': problem['task_id'],
                    'problem_idx': prob_idx,
                    'strategy': strat_name,
                    'model': model_name,
                    'success': success,
                    'error': error if not success else None,
                    'prompt_used': prompt[:300] + "..." if len(prompt) > 300 else prompt,
                    'response': response[:500] + "..." if len(response) > 500 else response,
                    'generated_code': code
                })

                print("Pass" if success else "Fail")

                time.sleep(2.5)

    return pd.DataFrame(results)

print("Starting Part 1 Experiment...")
part1_results = run_part1_experiment()


Starting Part 1 Experiment...

PART 1: Prompt Design & Code Generation
Testing 6 strategies × 2 model families × 10 problems

Problem 1/10: HumanEval/0
  [1/120] CoT + openai/gpt-oss-20b: Fail
  [2/120] CoT + Llama3-70B: Pass
  [3/120] SCoT + openai/gpt-oss-20b: Fail
  [4/120] SCoT + Llama3-70B: Fail
  [5/120] Self-Plan + openai/gpt-oss-20b: Fail
  [6/120] Self-Plan + Llama3-70B: False
True
Pass
  [7/120] Self-Debug + openai/gpt-oss-20b: Pass
  [8/120] Self-Debug + Llama3-70B: False
True
Pass
  [9/120] Self-Edit + openai/gpt-oss-20b: Pass
  [10/120] Self-Edit + Llama3-70B: Pass
  [11/120] Self-Repair + openai/gpt-oss-20b: ↻Fail
  [12/120] Self-Repair + Llama3-70B: Pass

Problem 2/10: HumanEval/1
  [13/120] CoT + openai/gpt-oss-20b: Pass
  [14/120] CoT + Llama3-70B: Fail
  [15/120] SCoT + openai/gpt-oss-20b: Pass
  [16/120] SCoT + Llama3-70B: Pass
  [17/120] Self-Plan + openai/gpt-oss-20b: ↻Fail
  [18/120] Self-Plan + Llama3-70B: ['()', '(())', '(()())']
Pass
  [19/120] Self-Debug + ope

In [11]:
print("\n" + "="*60)
print("PART 1: RESULTS ANALYSIS")
print("="*60)

overall_success = part1_results['success'].mean()
print(f"\nOverall Success Rate: {overall_success*100:.1f}%")
print(f"   Successful: {part1_results['success'].sum()}/{len(part1_results)}")

print("\nSuccess Rate by Strategy:")
strategy_stats = part1_results.groupby('strategy')['success'].agg(['mean', 'sum', 'count'])
strategy_stats['percentage'] = strategy_stats['mean'] * 100
strategy_stats = strategy_stats.sort_values('percentage', ascending=False)
for idx, row in strategy_stats.iterrows():
    print(f"   {idx}: {row['percentage']:.1f}% ({int(row['sum'])}/{int(row['count'])} passed)")

print("\nSuccess Rate by Model Family:")
model_stats = part1_results.groupby('model')['success'].agg(['mean', 'sum', 'count'])
model_stats['percentage'] = model_stats['mean'] * 100
for idx, row in model_stats.iterrows():
    print(f"   {idx}: {row['percentage']:.1f}% ({int(row['sum'])}/{int(row['count'])} passed)")

print("\nStrategy × Model Success Rate (%):")
cross_tab = part1_results.pivot_table(
    values='success',
    index='strategy',
    columns='model',
    aggfunc='mean'
) * 100
cross_tab['Average'] = cross_tab.mean(axis=1)
print(cross_tab.round(1))

print("\nTop 3 Strategy-Model Combinations:")
combo_stats = part1_results.groupby(['strategy', 'model'])['success'].mean().sort_values(ascending=False).head(3)
for (strat, model), rate in combo_stats.items():
    print(f"   {strat} + {model}: {rate*100:.1f}%")

part1_results.to_csv('part1_results.csv', index=False)
print("\nResults saved to part1_results.csv")


PART 1: RESULTS ANALYSIS

Overall Success Rate: 72.5%
   Successful: 87/120

Success Rate by Strategy:
   Self-Edit: 90.0% (18/20 passed)
   CoT: 85.0% (17/20 passed)
   Self-Debug: 85.0% (17/20 passed)
   SCoT: 65.0% (13/20 passed)
   Self-Repair: 60.0% (12/20 passed)
   Self-Plan: 50.0% (10/20 passed)

Success Rate by Model Family:
   Llama3-70B: 86.7% (52/60 passed)
   openai/gpt-oss-20b: 58.3% (35/60 passed)

Strategy × Model Success Rate (%):
model        Llama3-70B  openai/gpt-oss-20b  Average
strategy                                            
CoT                90.0                80.0     85.0
SCoT               60.0                70.0     65.0
Self-Debug         90.0                80.0     85.0
Self-Edit         100.0                80.0     90.0
Self-Plan         100.0                 0.0     50.0
Self-Repair        80.0                40.0     60.0

Top 3 Strategy-Model Combinations:
   Self-Plan + Llama3-70B: 100.0%
   Self-Edit + Llama3-70B: 100.0%
   CoT + Llama3-70B

Part 2: Debugging & Iterative Improvement


In [12]:
def run_part2_debugging():
    """Part 2: Debug failed cases"""

    print("\n" + "="*60)
    print("PART 2: Debugging & Iterative Improvement")
    print("="*60)

    failed = part1_results[~part1_results['success']]

    if len(failed) < 2:
        print("Less than 2 failed cases available!")
        failed = part1_results.head(2)
    else:
        failed = failed.groupby(['strategy', 'model']).first().head(2).reset_index()

    debug_results = []

    for idx in range(min(2, len(failed))):
        fail = failed.iloc[idx]
        print(f"\nDebugging Case {idx+1}:")
        print(f"   Problem: {fail['problem_id']}")
        print(f"   Original: {fail['strategy']} + {fail['model']}")
        print(f"   Error: {fail['error'][:100]}...")

        prob = selected_problems[selected_problems['task_id'] == fail['problem_id']].iloc[0]

        debug_prompt = f"""You are debugging Python code. Fix the following error.

PROBLEM DESCRIPTION:
{prob['prompt']}

FAILED CODE:
```python
{fail['generated_code']}
```

ERROR MESSAGE:
{fail['error']}

Please provide a CORRECTED version that will pass all test cases.
Think about what went wrong and fix it.

CORRECTED CODE:"""

        print("   Attempting fix with openai...", end="", flush=True)

        fixed_response = call_openai(debug_prompt)
        fixed_code = extract_code(fixed_response)

        success, error = test_solution(fixed_code, prob['test'], prob['entry_point'])

        debug_results.append({
            'problem_id': fail['problem_id'],
            'original_strategy': fail['strategy'],
            'original_model': fail['model'],
            'original_error': fail['error'],
            'debug_prompt': debug_prompt[:500] + "...",
            'fixed_code': fixed_code,
            'debug_success': success,
            'debug_error': error if not success else None,
            'improvement': "Fixed" if success else "Still failing"
        })

        print(f" {'Fixed!' if success else 'Still has issues'}")

        if success:
            print(f"   Analysis: The model successfully identified and fixed the issue.")
        else:
            print(f"   Analysis: Debugging attempt improved but didn't fully resolve the issue.")
            print(f"   Remaining error: {error[:100]}...")

        time.sleep(3)

    return debug_results

part2_results = run_part2_debugging()


PART 2: Debugging & Iterative Improvement

Debugging Case 1:
   Problem: HumanEval/1
   Original: CoT + Llama3-70B
   Error: Execution error: name 'List' is not defined...
   Attempting fix with openai... Fixed!
   Analysis: The model successfully identified and fixed the issue.

Debugging Case 2:
   Problem: HumanEval/0
   Original: CoT + openai/gpt-oss-20b
   Error: Syntax error: invalid syntax (<string>, line 42)...
   Attempting fix with openai... Fixed!
   Analysis: The model successfully identified and fixed the issue.


In [13]:
print("\nDebugging Summary:")
if part2_results:
    debug_success_rate = sum(r['debug_success'] for r in part2_results) / len(part2_results)
    print(f"   Success rate: {debug_success_rate*100:.0f}% ({sum(r['debug_success'] for r in part2_results)}/{len(part2_results)})")

    for r in part2_results:
        print(f"\n   {r['problem_id']}:")
        print(f"   - Original: {r['original_strategy']} + {r['original_model']}")
        print(f"   - Result: {r['improvement']}")



Debugging Summary:
   Success rate: 100% (2/2)

   HumanEval/1:
   - Original: CoT + Llama3-70B
   - Result: Fixed

   HumanEval/0:
   - Original: CoT + openai/gpt-oss-20b
   - Result: Fixed


Part 3: Innovation - Test-Driven Incremental Generation

In [14]:
def run_part3_innovation():

    print("\n" + "="*60)
    print("PART 3: Innovation - Test-Driven Incremental Generation")
    print("="*60)

    print("\nInnovation Strategy: Test-Driven Incremental Generation")
    print("   - Shows concrete test examples upfront")
    print("   - Iteratively refines based on test failures")
    print("   - Maximum 3 iterations per problem")

    def test_driven_generation(problem):
        test_lines = []
        for line in problem['test'].split('\n'):
            if 'assert candidate' in line:
                test_lines.append(line.strip())
                if len(test_lines) >= 2:
                    break

        prompt = f"""Look at these specific test cases that your function must pass:

{chr(10).join(test_lines)}

Now implement this function to pass these tests:

{problem['prompt']}

Focus on making sure your implementation handles these exact test cases correctly.
Write the complete function:"""

        return prompt

    def iterative_refinement(problem, model_func, model_name, max_iterations=3):

        iteration_results = []
        current_prompt = test_driven_generation(problem)

        for iteration in range(max_iterations):
            print(f"      Iteration {iteration+1}: ", end="", flush=True)

            response = model_func(current_prompt)
            code = extract_code(response)

            success, error = test_solution(code, problem['test'], problem['entry_point'])

            iteration_results.append({
                'iteration': iteration + 1,
                'success': success,
                'error': error if not success else None
            })

            print("Pass" if success else "Fail", end="", flush=True)

            if success:
                print(f" Success!")
                return {
                    'success': True,
                    'iterations': iteration + 1,
                    'final_code': code,
                    'iteration_details': iteration_results
                }

            current_prompt = f"""The previous code failed with this error:
{error}

Failed code:
```python
{code}
```

Fix this specific error and make the code pass all tests.
Remember the tests that must pass:
{chr(10).join([line.strip() for line in problem['test'].split(chr(10)) if 'assert candidate' in line][:2])}

Corrected code:"""

            time.sleep(3)

        print(f" Max iterations reached")
        return {
            'success': False,
            'iterations': max_iterations,
            'final_code': code,
            'iteration_details': iteration_results
        }

    test_problems = selected_problems.head(2)
    innovation_results = []

    print("\nTesting innovative strategy on selected problems:\n")

    for idx, prob in test_problems.iterrows():
        print(f"Problem: {prob['task_id']}")

        print(f"   openai/gpt-oss-20b (openai family):")
        mixtral_result = iterative_refinement(prob, call_openai, "openai")

        print(f"   Llama3-70B (LLaMA family):")
        llama_result = iterative_refinement(prob, call_llama, "Llama")

        innovation_results.append({
            'problem': prob['task_id'],
            'mixtral_success': mixtral_result['success'],
            'mixtral_iterations': mixtral_result['iterations'],
            'llama_success': llama_result['success'],
            'llama_iterations': llama_result['iterations']
        })

        time.sleep(3)

    print("\nComparative Analysis:")

    baseline_subset = part1_results[part1_results['problem_id'].isin(test_problems['task_id'])]
    baseline_rate = baseline_subset['success'].mean()

    total_innovation_tests = len(innovation_results) * 2
    innovation_successes = sum(r['mixtral_success'] + r['llama_success'] for r in innovation_results)
    innovation_rate = innovation_successes / total_innovation_tests if total_innovation_tests > 0 else 0

    print(f"\nPerformance Comparison:")
    print(f"      Baseline strategies (avg): {baseline_rate*100:.1f}%")
    print(f"      TDIG Innovation strategy: {innovation_rate*100:.1f}%")
    improvement = (innovation_rate - baseline_rate) * 100
    print(f"      Relative improvement: {'+' if improvement >= 0 else ''}{improvement:.1f}%")

    print(f"\nEfficiency Analysis:")
    avg_iterations = sum(r['mixtral_iterations'] + r['llama_iterations'] for r in innovation_results) / (len(innovation_results) * 2)
    print(f"      Average iterations needed: {avg_iterations:.1f}")
    print(f"      Success on first try: {sum(1 for r in innovation_results if r['mixtral_iterations'] == 1 or r['llama_iterations'] == 1)}/{total_innovation_tests}")

    return innovation_results

part3_results = run_part3_innovation()


PART 3: Innovation - Test-Driven Incremental Generation

Innovation Strategy: Test-Driven Incremental Generation
   - Shows concrete test examples upfront
   - Iteratively refines based on test failures
   - Maximum 3 iterations per problem

Testing innovative strategy on selected problems:

Problem: HumanEval/0
   openai/gpt-oss-20b (openai family):
      Iteration 1: Pass Success!
   Llama3-70B (LLaMA family):
      Iteration 1: Pass Success!
Problem: HumanEval/1
   openai/gpt-oss-20b (openai family):
      Iteration 1: Fail      Iteration 2: Fail      Iteration 3: Fail Max iterations reached
   Llama3-70B (LLaMA family):
      Iteration 1: Pass Success!

Comparative Analysis:

Performance Comparison:
      Baseline strategies (avg): 62.5%
      TDIG Innovation strategy: 75.0%
      Relative improvement: +12.5%

Efficiency Analysis:
      Average iterations needed: 1.5
      Success on first try: 2/4


In [15]:
print("\n" + "="*60)
print("Final Summary")
print("="*60)


print("\nKey Results:")
best_strategy = part1_results.groupby('strategy')['success'].mean().idxmax()
best_model = part1_results.groupby('model')['success'].mean().idxmax()
overall_rate = part1_results['success'].mean()

print(f"   • Best Strategy: {best_strategy} ({part1_results[part1_results['strategy']==best_strategy]['success'].mean()*100:.1f}%)")
print(f"   • Best Model: {best_model} ({part1_results[part1_results['model']==best_model]['success'].mean()*100:.1f}%)")
print(f"   • Overall Success Rate: {overall_rate*100:.1f}%")

if part2_results:
    debug_rate = sum(r['debug_success'] for r in part2_results) / len(part2_results)
    print(f"   • Debugging Success: {debug_rate*100:.0f}%")

if part3_results:
    innovation_success = sum(r['mixtral_success'] + r['llama_success'] for r in part3_results) / (len(part3_results) * 2)
    print(f"   • Innovation Strategy: {innovation_success*100:.0f}%")




Final Summary

Key Results:
   • Best Strategy: Self-Edit (90.0%)
   • Best Model: Llama3-70B (86.7%)
   • Overall Success Rate: 72.5%
   • Debugging Success: 100%
   • Innovation Strategy: 75%
