In [None]:
import subprocess
import sys

# Third-party packages this notebook needs.
# NOTE: difflib is deliberately absent -- it ships with Python's standard
# library, so it is not something pip should be asked to install.
requirements = [
    "transformers",
    "datasets",
    "torch",
    "openai",
    "numpy",
]

# Install everything with a single pip invocation, using the interpreter that
# is running this notebook so the packages land in the active environment.
# check_call raises CalledProcessError if pip exits with a non-zero status.
subprocess.check_call([sys.executable, "-m", "pip", "install", *requirements])

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
import torch
from openai import AzureOpenAI
import numpy as np
from difflib import SequenceMatcher

import os

# Initialize Azure OpenAI client.
# The API key is a secret and must not live in source control, so it is read
# from the environment. Any key that was previously committed to this file
# should be treated as leaked and rotated.
_azure_api_key = os.environ.get("AZURE_OPENAI_API_KEY")
if not _azure_api_key:
    raise RuntimeError(
        "Set the AZURE_OPENAI_API_KEY environment variable before running this notebook."
    )

client = AzureOpenAI(
    api_key=_azure_api_key,
    api_version="2024-06-01",
    # The endpoint can be overridden via the environment; the default is the
    # endpoint this notebook has always used.
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "https://hkust.azure-api.net")
)

# Set device: use the GPU when one is visible to torch, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load code-specific model and its tokenizer.
model_name = "Salesforce/codet5-small"
try:
    # No device_map here: it needs the `accelerate` package (which this
    # notebook does not install) and a model placed that way should not be
    # moved again with .to(). The explicit .to(device) below does the placement.
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype="auto"
    ).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except Exception as e:
    print(f"Error loading model: {str(e)}")
    # Nothing below can run without the model. Raise SystemExit directly
    # rather than calling exit(), which is an interactive-shell helper.
    raise SystemExit(1) from e

# Load ClassEval dataset (only the first 20 records of the "test" split).
try:
    dataset = load_dataset("FudanSELab/ClassEval", split="test[:20]")
except Exception as e:
    print(f"Error loading dataset: {str(e)}")
    # The evaluation cannot proceed without data. Raise SystemExit directly
    # rather than calling exit(), which is an interactive-shell helper.
    raise SystemExit(1) from e

def prepare_prompt(example):
    """Build the text prompt for one dataset record.

    Args:
        example: Mapping providing the keys 'class_description',
            'class_name' and 'methods_info'.

    Returns:
        A multi-line string that starts with a newline; every following line
        (including the otherwise empty ones and the last one) is prefixed
        with four spaces.
    """
    indent = " " * 4
    body_lines = [
        f"Class Description: {example['class_description']}",
        "",
        f"Class Name: {example['class_name']}",
        "",
        "Methods to implement:",
        f"{example['methods_info']}",
        "",
        "Generate the implementation:",
        "",  # the prompt ends with a line that holds only the indent
    ]
    return "\n" + "\n".join(indent + line for line in body_lines)

def compute_code_similarity(code1, code2):
    """Return a character-level similarity ratio between two code strings.

    Both inputs are normalized first: every run of whitespace collapses to a
    single space and the text is lower-cased, so layout and letter case do not
    affect the score.

    Args:
        code1: First code snippet (``None`` is treated as an empty string).
        code2: Second code snippet (``None`` is treated as an empty string).

    Returns:
        ``SequenceMatcher.ratio()`` of the normalized strings, a float in
        [0.0, 1.0]; 1.0 means the normalized strings are identical.
    """
    def normalize_code(code):
        return ' '.join((code or '').split()).lower()

    code1_normalized = normalize_code(code1)
    code2_normalized = normalize_code(code2)

    # autojunk must be off: with the default heuristic, once the second string
    # reaches 200 characters any character occurring in more than ~1% of it is
    # ignored when seeding matches. In source code that is nearly every
    # character, so near-identical snippets could score close to 0.
    # The exact comparison is slower on long inputs, but the score is meaningful.
    matcher = SequenceMatcher(None, code1_normalized, code2_normalized, autojunk=False)
    return matcher.ratio()

def evaluate_lightweight_model(model, tokenizer, dataset, similarity_threshold=0.8):
    """Generate code with a local seq2seq model and score it against references.

    For every record a prompt is built with ``prepare_prompt``, one sequence is
    sampled from ``model.generate`` and the decoded text is compared with the
    record's 'solution_code' via ``compute_code_similarity``.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Sized iterable of records with the keys 'class_name',
            'solution_code' and those read by ``prepare_prompt``.
        similarity_threshold: A generation counts as correct when its
            similarity is strictly greater than this value.

    Returns:
        ``(accuracy, results)``. ``accuracy`` is the fraction of records
        counted as correct (0.0 for an empty dataset). ``results`` holds one
        dict per record with 'class_name', 'similarity' and 'generated_code';
        records that failed also carry 'error' and a similarity of 0.0.
    """
    total = len(dataset)
    if total == 0:
        # Nothing to evaluate; returning here avoids dividing by zero below.
        return 0.0, []

    correct_implementations = 0
    results = []
    # Inputs must sit on the same device as the model's weights; asking the
    # model keeps this function independent of module-level state.
    model_device = model.device

    for example in dataset:
        try:
            prompt = prepare_prompt(example)
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(model_device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=1024,
                    num_return_sequences=1,
                    temperature=0.7,
                    top_p=0.95,
                    do_sample=True,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )

            generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
            similarity = compute_code_similarity(generated_code, example['solution_code'])
        except Exception as e:
            # Best effort: a failing record is logged and recorded with a zero
            # score so the remaining records are still evaluated.
            print(f"Error processing {example['class_name']}: {str(e)}")
            results.append({
                'class_name': example['class_name'],
                'similarity': 0.0,
                'generated_code': '',
                'error': str(e)
            })
            continue

        if similarity > similarity_threshold:
            correct_implementations += 1

        results.append({
            'class_name': example['class_name'],
            'similarity': similarity,
            'generated_code': generated_code
        })

    return correct_implementations / total, results

def _strip_code_fences(text):
    """Return *text* stripped, without a surrounding Markdown ``` fence if present."""
    stripped = text.strip()
    lines = stripped.splitlines()
    if len(lines) >= 2 and lines[0].startswith("```") and lines[-1].strip() == "```":
        return "\n".join(lines[1:-1]).strip()
    return stripped


def evaluate_gpt4_model(client, dataset, similarity_threshold=0.8):
    """Generate code through the chat-completions API and score it against references.

    For every record one chat completion is requested from the "gpt-4o-mini"
    model and the reply is compared with the record's 'solution_code' via
    ``compute_code_similarity``.

    Args:
        client: Client object exposing ``chat.completions.create``.
        dataset: Sized iterable of records with the keys 'class_description',
            'class_name', 'methods_info' and 'solution_code'.
        similarity_threshold: A generation counts as correct when its
            similarity is strictly greater than this value.

    Returns:
        ``(accuracy, results)``. ``accuracy`` is the fraction of records
        counted as correct (0.0 for an empty dataset). ``results`` holds one
        dict per record with 'class_name', 'similarity' and 'generated_code';
        records that failed also carry 'error' and a similarity of 0.0.
    """
    total = len(dataset)
    if total == 0:
        # Nothing to evaluate; returning here avoids dividing by zero below.
        return 0.0, []

    correct_implementations = 0
    results = []
    # The 12-space indent on continuation lines is part of the prompt text as
    # it is sent to the model, so it is spelled out explicitly here.
    pad = " " * 12

    for example in dataset:
        try:
            prompt = (
                "Given the following class description and requirements, "
                "generate the Python implementation:\n"
                f"{pad}\n"
                f"{pad}Class Description: {example['class_description']}\n"
                f"{pad}Class Name: {example['class_name']}\n"
                f"{pad}Methods to implement: {example['methods_info']}\n"
                f"{pad}\n"
                f"{pad}Provide only the code implementation without any explanations."
            )

            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a Python code generation assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7
            )

            content = response.choices[0].message.content
            if content is None:
                # Report a missing reply explicitly instead of failing with an
                # opaque AttributeError on None.
                raise ValueError("model returned no content")

            # Chat models often wrap code in a Markdown fence even when asked
            # not to; the fence is not code and would only lower the score.
            generated_code = _strip_code_fences(content)
            similarity = compute_code_similarity(generated_code, example['solution_code'])
        except Exception as e:
            # Best effort: a failing record is logged and recorded with a zero
            # score so the remaining records are still evaluated.
            print(f"Error processing {example['class_name']}: {str(e)}")
            results.append({
                'class_name': example['class_name'],
                'similarity': 0.0,
                'generated_code': '',
                'error': str(e)
            })
            continue

        if similarity > similarity_threshold:
            correct_implementations += 1

        results.append({
            'class_name': example['class_name'],
            'similarity': similarity,
            'generated_code': generated_code
        })

    return correct_implementations / total, results

# Evaluate both models on the same dataset.
print("Evaluating lightweight model...")
accuracy_lightweight, results_lightweight = evaluate_lightweight_model(model, tokenizer, dataset)

print("Evaluating GPT-4-mini...")
accuracy_gpt4, results_gpt4 = evaluate_gpt4_model(client, dataset)

# Summary: one accuracy figure per model.
print(f"\nLightweight Model (CodeT5) Accuracy: {accuracy_lightweight:.2f}")
print(f"GPT-4-mini Accuracy: {accuracy_gpt4:.2f}")

# Side-by-side details for at most the first three records; both result lists
# hold one entry per dataset record, in dataset order.
print("\nDetailed Results (First 3 examples):")
for light_entry, gpt_entry in zip(results_lightweight[:3], results_gpt4):
    print(f"\nClass: {light_entry['class_name']}")
    print(f"Lightweight Model Similarity: {light_entry['similarity']:.2f}")
    print(f"GPT-4-mini Similarity: {gpt_entry['similarity']:.2f}")
    print("\nLightweight Model Generated Code:")
    print(light_entry['generated_code'])
    print("\nGPT-4-mini Generated Code:")
    print(gpt_entry['generated_code'])