# Step 1 - Get and Prepare Input Datasets



In [87]:
# Constants
# Number of records we will process
N = 3

## Get the Test Dataset for constraint generation from github

In [88]:
!wget https://raw.githubusercontent.com/gunnusravani/CIF_Benchmark/refs/heads/main/data/outputs/constraint_category_data.csv -O "constraint_category_initial_data.csv"

zsh:1: command not found: wget


In [89]:
import pandas as pd
full_df = pd.read_csv("constraint_category_initial_data.csv")
full_df.head()

Unnamed: 0,dataset,instruction,code,test,Characteristics,constraints
0,ajibawa-2023/Python-Code-23k-ShareGPT,Calculate the distance between two points loca...,To calculate the distance between two points i...,,1) Logic is modularized using calculate_distan...,['Implement the distance calculation in a sepa...
1,ajibawa-2023/Python-Code-23k-ShareGPT,Assuming that the given sentence is stored in ...,"To achieve this, you can use Python's string m...",,1) Embed the HTML tag using an f-string to dyn...,"[""Provide a concise solution using Python's st..."
2,ajibawa-2023/Python-Code-23k-ShareGPT,Write a program in Python to find the factoria...,Here is a program in Python that finds the fac...,,1) The function factorial is implemented recur...,['The program must use recursion to calculate ...
3,ajibawa-2023/Python-Code-23k-ShareGPT,Create a Python class for Animal with the foll...,```python\nclass Animal:\n def __init__(sel...,,"The class defines six attributes: species, nam...","['Include detailed docstrings for each method,..."
4,ajibawa-2023/Python-Code-23k-ShareGPT,How can I decrypt a set of jokes that are encr...,"Sure, here's a Python function that decrypts t...",,The function is named decryptJokes and accepts...,['Provide a complete Python function implement...


In [90]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   dataset          60 non-null     object
 1   instruction      60 non-null     object
 2   code             60 non-null     object
 3   test             20 non-null     object
 4   Characteristics  60 non-null     object
 5   constraints      60 non-null     object
dtypes: object(6)
memory usage: 2.9+ KB


In [91]:
full_df = full_df[["dataset","instruction","code"]]
full_df.head()

Unnamed: 0,dataset,instruction,code
0,ajibawa-2023/Python-Code-23k-ShareGPT,Calculate the distance between two points loca...,To calculate the distance between two points i...
1,ajibawa-2023/Python-Code-23k-ShareGPT,Assuming that the given sentence is stored in ...,"To achieve this, you can use Python's string m..."
2,ajibawa-2023/Python-Code-23k-ShareGPT,Write a program in Python to find the factoria...,Here is a program in Python that finds the fac...
3,ajibawa-2023/Python-Code-23k-ShareGPT,Create a Python class for Animal with the foll...,```python\nclass Animal:\n def __init__(sel...
4,ajibawa-2023/Python-Code-23k-ShareGPT,How can I decrypt a set of jokes that are encr...,"Sure, here's a Python function that decrypts t..."


In [92]:
df = full_df.sample(n=N)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 52 to 29
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   dataset      3 non-null      object
 1   instruction  3 non-null      object
 2   code         3 non-null      object
dtypes: object(3)
memory usage: 96.0+ bytes


## Decide Dataset Sampling Ratios Based on Real-World Complexity
- Skipping for now

## Write transformation scripts for each dataset format

## Create a benchmark dataset of 1000–1200 examples



# Step 2 - Define Constraint Categories

In [93]:
categories = [
    "Code Structure and Modularity",
    "Input and Output Handling",
    "Error Handling and Robustness",
    "Data Processing and Transformation",
    "Performance and Optimization",
    "Library and API Usage",
    "Testing and Debugging",
    "Documentation and Readability",
    "Security and Privacy",
    "Reproducibility and Consistency",
    "Mathematical Computation",
    "File and Data Management",
    "UI and Interaction",
]
categories_str = "\n".join(categories)

# Step 3 Generating Constraints

In [94]:
!pwd

/Users/sravanigunnu/Library/CloudStorage/OneDrive-IBM/Documents/VS_Projects/Complex_Instruction_Following/scripts/constraints


In [95]:
import pandas as pd
import time
from openai import OpenAI
import os

if os.path.exists("/content"):
    from google.colab import userdata
    openai_api_key = userdata.get('OPENAPI_KEY')
else:
    openai_api_key = os.getenv("OPENAI_API_KEY")

  # Replace with your actual key
client = OpenAI(api_key=openai_api_key)


In [96]:
SYSTEM_PROMPT = f"""
You are a helpful assistant. You will be given a programming instruction and the corresponding code.
"""

In [97]:
def get_response(user_prompt,system_prompt=SYSTEM_PROMPT, max_retries=1):
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.3
            )
            # print(response)
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error on attempt {attempt+1}: {e}")
            time.sleep(2)
    return "[]"

## Map Categories to the Instructions

In [98]:
def get_relevant_categories(instruction, code,  max_retries=3):
    user_prompt = f"""You are a coding assistant designed to classify natural language code generation instructions into appropriate high-level constraint categories.

You must choose all applicable categories from the following 13 supercategories based on the instruction and the type of code that is expected to be written:
{categories}

Task:
Given the instruction, select all relevant supercategories that:
- Help define the expected constraints.
- Reflect either the code behavior or user expectation.
- Include at least 2 relevant categories per instruction.
- Understand the domain of the problem and if any code looks like might need privacy and security considerations include that category as well


Output Format:
Respond with only a list of strings as shown in the example output.
Each string should be the exact name of the matching supercategory.

Example Output:
["Mathematical Computation", "Input and Output Handling", "Documentation and Readability"]

Instruction:\n
{instruction}

Code:\n
{code}

Relevant categories:"""
    relevant_categories = get_response(user_prompt)
    return relevant_categories


get_relevant_categories(df.iloc[0]["instruction"],df.iloc[0]["code"])



'["Error Handling and Robustness", "Code Structure and Modularity"]'

## Prompt 2 for category mapping

In [99]:
def get_relevant_categories_v2(instruction, code):
    user_prompt = f"""You are a coding assistant designed to classify natural language code generation instructions into appropriate high-level constraint categories.

You must choose all applicable categories from the following 13 supercategories based on the instruction and the type of code that is expected to be written:
{categories}

Task:
Given the instruction, select all relevant supercategories that:
- Help define the expected constraints.
- Reflect either the code behavior or user expectation.
- Include at least 2 relevant categories per instruction.
- Understand the domain of the problem and if any code looks like might need privacy and security considerations include that category as well
- **Crucially, when considering a category, imagine the possible constraints that could arise from it for this specific instruction. If there is even a *slight or indirect possibility* for a constraint from that category to occur, or if the category could *potentially* influence the implementation based on implicit requirements, then include that category.** Lean towards being inclusive rather than exclusive in category selection.


Output Format:
Respond with only a list of strings as shown in the example output.
Each string should be the exact name of the matching supercategory.

Example Output:
["Mathematical Computation", "Input and Output Handling", "Documentation and Readability"]

Instruction:\n
{instruction}

Code:\n
{code}

Relevant categories:"""

    relevant_categories = get_response(user_prompt)
    return relevant_categories
get_relevant_categories_v2(df.iloc[0]["instruction"],df.iloc[0]["code"])

'["Error Handling and Robustness", "Code Structure and Modularity", "Input and Output Handling", "Documentation and Readability"]'

In [100]:
# get_relevant_categories(test_mceval.iloc[0]["instruction"],test_mceval.iloc[0]["code"])

In [101]:
# Run on first 20 rows (you can change this limit)
def map_categories(df,output_pth,input_col1,input_col2,output_col):
    print("Mapping categories")
    results = []
    for i, row in df.iterrows():
        print(f"Processing row {i}")
        categories = get_relevant_categories_v2(row[input_col1], row[input_col2])
        results.append(categories)

    df[output_col] = results
    df.to_csv(output_pth, index=False)
    print("Saved the file successfully")

In [102]:
map_categories(df,"step3_with_relevant_categories_v2.csv","instruction","code","relevant_categories")

Mapping categories
Processing row 52
Processing row 34
Processing row 29
Saved the file successfully


In [103]:
df.head(N)

Unnamed: 0,dataset,instruction,code,relevant_categories
52,nuprl/CanItEdit,Fix the methods in `Course` so that all of the...,import functools\nimport numpy as np\nclass St...,"[""Error Handling and Robustness"", ""Code Struct..."
34,Multilingual-Multimodal-NLP/McEval-Instruct,Write a Python function `rotate_text` that tak...,"```python\ndef rotate_text(text, rotation):\n ...","[""Code Structure and Modularity"", ""Input and O..."
29,bigcode/bigcodebench,"Search for occurrences of the word ""error"" in ...",if not os.path.isdir(dir_path):\n raise...,"[""Input and Output Handling"", ""Error Handling ..."


In [104]:
from collections import Counter
import ast

# Convert the string representation of lists to actual lists
df['relevant_categories'] = df['relevant_categories'].apply(ast.literal_eval)

# Flatten the list of lists and count occurrences
all_categories = [category for sublist in df['relevant_categories'] for category in sublist]
category_counts = Counter(all_categories)

# Create a DataFrame from the counts
category_df = pd.DataFrame.from_dict(category_counts, orient='index', columns=['Count'])

# Sort the categories by count
category_df = category_df.sort_values(by='Count', ascending=False)

display(category_df)

Unnamed: 0,Count
Input and Output Handling,3
Error Handling and Robustness,2
Code Structure and Modularity,2
Documentation and Readability,2
Testing and Debugging,1
File and Data Management,1


## Prompt to generate the constraints

In [105]:
def generate_constraints_v1(instruction,code,relevant_categories):
    prompt = f""" "task": "You are an expert in generating **actionable, precise, and objective** constraints for code generation tasks by analyzing a natural language instruction and the associated code. Your objective is to identify the relevant characteristics of the task based on the instruction and code, and then generate meaningful constraints that guide a model to correctly implement or reason about the task in line with predefined constraint categories. **Avoid generic or subjective constraints; each constraint must have a clear, verifiable requirement.**",

    "context": "You are provided with three inputs: (1) a natural language `instruction` describing a coding task, (2) the actual `code` implementing or partially implementing that instruction, and (3) a list of `relevant_constraint_categories` which represent different high-level aspects of software development such as performance, error handling, modularity, etc. that are relevant to the instruction. You must infer both explicit and implicit requirements from these inputs.",

    "goal": "Your job is to examine the instruction, code, and relevant constraint categories, and generate 5 to 10 natural language constraints that align with the relevant categories. These constraints should help an LLM generate code that is correct, complete, robust, and aligned with good development practices. **Crucially, the constraints must be specific, objective, and leave little room for subjective interpretation.** For example, instead of 'Write well-documented code,' specify 'Include docstrings for all functions and classes, detailing parameters, return values, and a brief purpose.' Each constraint should pose a tangible challenge for the model and be clearly verifiable. The aim is to create constraints that lead to a noticeable difference in model performance if adhered to, making them useful for differentiating models.",

    "JSON Response Format": {{
        "Constraints": [ {{ type: constraint_category_1, constraint: constraint1, instruction_part: "Snippet from the instruction for the constraints" }}, ]
    }},

    "Inputs Required": {{
        "instruction": {instruction}
        "code": {code}
        "relevant_constraint_categories": {relevant_categories}

    }} """
    return prompt

prompt = generate_constraints_v1(df.iloc[0]["instruction"],df.iloc[0]["code"],df.iloc[0]["relevant_categories"])
print(prompt)

 "task": "You are an expert in generating **actionable, precise, and objective** constraints for code generation tasks by analyzing a natural language instruction and the associated code. Your objective is to identify the relevant characteristics of the task based on the instruction and code, and then generate meaningful constraints that guide a model to correctly implement or reason about the task in line with predefined constraint categories. **Avoid generic or subjective constraints; each constraint must have a clear, verifiable requirement.**",

    "context": "You are provided with three inputs: (1) a natural language `instruction` describing a coding task, (2) the actual `code` implementing or partially implementing that instruction, and (3) a list of `relevant_constraint_categories` which represent different high-level aspects of software development such as performance, error handling, modularity, etc. that are relevant to the instruction. You must infer both explicit and impli

In [106]:
instruction = df.iloc[0]["instruction"]
print(instruction)

Fix the methods in `Course` so that all of them never throw errors and return `None` if the length of their students list is 0. 
Additionally, do not use the words `for`, `while`, or `map` anywhere in the code.
class Student:
    def __init__(self, name, gpa) -> None:
        self.name = name
        self.gpa = gpa

    def __eq__(self, __value: object) -> bool:
        if not isinstance(__value, Student):
            return False
        else:
            return __value.name == self.name

class Course:

    def __init__(self, students) -> None:
        self.students = students

    def average_gpa(self):
        for student in self.students:
            total += student.gpa

        return total / len(self.students)
    
    def raise_grade_all(self):
        for student in self.students:
            student.gpa += 1

    def best_student(self):

        best = self.students[0]
        for student in self.students:
            if student.gpa > best.gpa:
                best = student


In [107]:
response = get_response(prompt)
print(response)

```json
{
    "Constraints": [
        {
            "type": "Error Handling and Robustness",
            "constraint": "Ensure that all methods in the Course class return None if the students list is empty, without raising any exceptions.",
            "instruction_part": "Fix the methods in `Course` so that all of them never throw errors and return `None` if the length of their students list is 0."
        },
        {
            "type": "Error Handling and Robustness",
            "constraint": "Implement input validation in the Course constructor to ensure that the students parameter is a list of Student objects.",
            "instruction_part": "Fix the methods in `Course` so that all of them never throw errors."
        },
        {
            "type": "Code Structure and Modularity",
            "constraint": "Refactor the average_gpa method to use a single return statement that calculates and returns the average GPA directly.",
            "instruction_part": "Fix the methods

In [108]:
import json

def extract_constraints(constraint_string,col_name="Constraints"):
    try:
        json_string = constraint_string.strip().replace('```json\n', '', 1).replace('\n```', '', 1)
        constraint_json = json.loads(json_string)
        # print(constraint_json.get("Constraints", []))
        return constraint_json.get(col_name, [])
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e} in string: {constraint_string}")
        return []
    except AttributeError as e:
        print(f"Attribute error: {e} in string: {constraint_string}")
        return []



In [109]:
def generate_constraints(df,constraint_generation_method,output_pth,input_col1,input_col2,input_col3,output_col):
    results = []
    for i, row in df.iterrows():
        print(f"Processing row {i}")
        prompt = constraint_generation_method(row[input_col1], row[input_col2], row[input_col3])
        constraints = get_response(prompt)
        results.append(constraints)

    df[output_col] = results
    df['extracted_constraints'] = test_df[output_col].apply(extract_constraints)
    df.to_csv(output_pth, index=False)
    print("Saved the file successfully")
    return df

In [110]:
test_df = df.copy()
test_df = generate_constraints(test_df,generate_constraints_v1,"step4_with_constraints_method_1.csv","instruction","code","relevant_categories","constraints_m1")
test_df.head()

Processing row 52
Processing row 34
Processing row 29
Saved the file successfully


Unnamed: 0,dataset,instruction,code,relevant_categories,constraints_m1,extracted_constraints
52,nuprl/CanItEdit,Fix the methods in `Course` so that all of the...,import functools\nimport numpy as np\nclass St...,"[Error Handling and Robustness, Code Structure...","```json\n{\n ""Constraints"": [\n {\n ...","[{'type': 'Error Handling and Robustness', 'co..."
34,Multilingual-Multimodal-NLP/McEval-Instruct,Write a Python function `rotate_text` that tak...,"```python\ndef rotate_text(text, rotation):\n ...","[Code Structure and Modularity, Input and Outp...","```json\n{\n ""Constraints"": [\n {\n ...","[{'type': 'Code Structure and Modularity', 'co..."
29,bigcode/bigcodebench,"Search for occurrences of the word ""error"" in ...",if not os.path.isdir(dir_path):\n raise...,"[Input and Output Handling, Error Handling and...","```json\n{\n ""Constraints"": [\n {\n ...","[{'type': 'Input and Output Handling', 'constr..."


In [111]:
extract_constraints(test_df.iloc[0]["constraints_m1"],"Constraints")

[{'type': 'Error Handling and Robustness',
  'constraint': 'Ensure that all methods in the Course class return None if the students list is empty, including average_gpa, raise_grade_all, and best_student.',
  'instruction_part': 'Fix the methods in `Course` so that all of them never throw errors and return `None` if the length of their students list is 0.'},
 {'type': 'Error Handling and Robustness',
  'constraint': 'Implement error handling to check if the students list contains any non-Student objects and raise a ValueError if it does.',
  'instruction_part': 'Fix the methods in `Course` so that all of them never throw errors.'},
 {'type': 'Code Structure and Modularity',
  'constraint': 'Refactor the raise_grade_all method to avoid using reduce; instead, use a list comprehension to create a new list of students with updated grades.',
  'instruction_part': 'Additionally, do not use the words `for`, `while`, or `map` anywhere in the code.'},
 {'type': 'Input and Output Handling',
  'c

# Validating the Constraints

In [112]:
# SYSTEM_PROMPT for the Quality Judge LLM
SYSTEM_PROMPT_QUALITY_JUDGE = """
You are an expert in meticulously evaluating the quality of programming constraints.
Your task is to objectively assess a provided list of generated constraints based on strict quality criteria.
For each constraint, you will provide specific scores for Specificity, Objectivity, and Atomicity, along with detailed reasoning and suggestions for improvement.
Finally, you will synthesize these individual evaluations into a unified quality score and an overall analysis for the entire set of constraints.
Your judgment must be impartial and directly tied to the definitions provided.
"""

# User-facing prompt for the Quality Judge LLM
def get_quality_judge_prompt(original_instruction, original_code, generated_constraint_list):
    return f"""
Original Instruction: {original_instruction}

Original Code (for context, if available):
```python
{original_code}
List of Generated Constraints to Evaluate:
{generated_constraint_list}

Constraint Quality Criteria:
Each score ranges from 1 (poor) to 5 (excellent).

Specificity Score (1-5):

Definition: Measures whether a constraint is clearly defined and narrow in scope, avoiding broad generalizations or vague statements. A highly specific constraint focuses on a precise requirement.
Examples:
1 (Very Generic): "Write efficient code."
5 (Highly Specific): "Ensure the search_algorithm function has a time complexity of O(log N) for typical inputs."
Objectivity Score (1-5):

Definition: Measures the extent to which a constraint can be verified or falsified without relying on personal opinion or subjective interpretation. An objective constraint uses measurable, observable, or clearly definable criteria.
Examples:
1 (Highly Subjective): "The user interface should be visually appealing."
5 (Highly Objective): "All input parameters must be validated to ensure they are non-null."
Atomicity Score (1-5):

Definition: Measures whether a single constraint statement expresses only one specific, indivisible requirement or condition. An atomic constraint cannot be broken down further into separate, independent requirements while still making logical sense as a single constraint.
Examples:
1 (Non-Atomic): "Include docstrings for all functions and handle all potential errors gracefully."
5 (Atomic): "Include a docstring for every function, describing its purpose, parameters, and return values."
Output Format:
Your response must be a single JSON object.

{{
"constraint_evaluations": [
{{
"constraint_text": "The exact text of the constraint from the input list.",
"specificity_score": int, // Score from 1 to 5
"objectivity_score": int, // Score from 1 to 5
"atomicity_score": int,   // Score from 1 to 5
"reasoning": "Detailed explanation for each score and, if scores are low, suggestions on how to improve the constraint's specificity, objectivity, or atomicity."
}},
// ... for each constraint in the input list
],
"avg_specificity": float, // Average of all specificity_scores in constraint_evaluations
"avg_objectivity": float, // Average of all objectivity_scores in constraint_evaluations
"avg_atomicity": float,   // Average of all atomicity_scores in constraint_evaluations
"unified_quality_score": float, // Average of avg_specificity, avg_objectivity, and avg_atomicity
"overall_analysis": "A brief summary of the overall quality of the constraint set, highlighting its strengths (e.g., highly specific) and weaknesses (e.g., some constraints are still too generic or non-atomic)."
}} """


prompt = get_quality_judge_prompt(test_df.iloc[0]["instruction"],test_df.iloc[0]["code"],test_df.iloc[0]["extracted_constraints"])
print(prompt)


Original Instruction: Fix the methods in `Course` so that all of them never throw errors and return `None` if the length of their students list is 0. 
Additionally, do not use the words `for`, `while`, or `map` anywhere in the code.
class Student:
    def __init__(self, name, gpa) -> None:
        self.name = name
        self.gpa = gpa

    def __eq__(self, __value: object) -> bool:
        if not isinstance(__value, Student):
            return False
        else:
            return __value.name == self.name

class Course:

    def __init__(self, students) -> None:
        self.students = students

    def average_gpa(self):
        for student in self.students:
            total += student.gpa

        return total / len(self.students)
    
    def raise_grade_all(self):
        for student in self.students:
            student.gpa += 1

    def best_student(self):

        best = self.students[0]
        for student in self.students:
            if student.gpa > best.gpa:
        

In [113]:
response = get_response(prompt,SYSTEM_PROMPT_QUALITY_JUDGE)
print(response)

{
  "constraint_evaluations": [
    {
      "constraint_text": "Ensure that all methods in the Course class return None if the students list is empty, including average_gpa, raise_grade_all, and best_student.",
      "specificity_score": 4,
      "objectivity_score": 5,
      "atomicity_score": 4,
      "reasoning": "This constraint is specific about which methods need to be modified and what the expected behavior is when the students list is empty. It can be objectively verified by checking the implementation of these methods. However, it could be more atomic by separating the requirements for each method into individual constraints."
    },
    {
      "constraint_text": "Implement error handling to check if the students list contains any non-Student objects and raise a ValueError if it does.",
      "specificity_score": 4,
      "objectivity_score": 5,
      "atomicity_score": 4,
      "reasoning": "This constraint clearly specifies the need for error handling and the type of error 

In [114]:
def get_dict(response):
    try:
        json_string = response.strip().replace('```json\n', '', 1).replace('\n```', '', 1)
        dict_json = json.loads(json_string)
        return dict_json
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e} in string: {response}")
        return []
    except AttributeError as e:
        print(f"Attribute error: {e} in string: {response}")
        return []


In [115]:
# extract the final scores and keep them in separated 4 columns
def extract_quality_scores(df,score_column="quality_scores"):
    df["specificity_score"] = df[score_column].apply(lambda x: x.get("avg_specificity", 0) if isinstance(x, dict) else 0)
    df["objectivity_score"] = df[score_column].apply(lambda x: x.get("avg_objectivity", 0) if isinstance(x, dict) else 0)
    df["atomicity_score"] = df[score_column].apply(lambda x: x.get("avg_atomicity", 0) if isinstance(x, dict) else 0)
    df["unified_quality_score"] = df[score_column].apply(lambda x: x.get("unified_quality_score", 0) if isinstance(x, dict) else 0)
    return df



In [116]:
def measure_constraints(df,output_pth,input_col1,input_col2,input_col3,output_col):
    results = []
    for i, row in df.iterrows():
        print(f"Processing row {i}")
        prompt = get_quality_judge_prompt(row[input_col1], row[input_col2], row[input_col3])
        scores = get_response(prompt,SYSTEM_PROMPT_QUALITY_JUDGE)
        final_scores = get_dict(scores)
        results.append(final_scores)

    df[output_col] = results
    df = extract_quality_scores(df)
    df.to_csv(output_pth, index=False)
    print("Saved the file successfully")
    return df

In [117]:
test_df = measure_constraints(test_df,"step5_with_quality_scores_m1.csv","instruction","code","extracted_constraints","quality_scores")
test_df.head()

Processing row 52
Processing row 34
Processing row 29
Saved the file successfully


Unnamed: 0,dataset,instruction,code,relevant_categories,constraints_m1,extracted_constraints,quality_scores,specificity_score,objectivity_score,atomicity_score,unified_quality_score
52,nuprl/CanItEdit,Fix the methods in `Course` so that all of the...,import functools\nimport numpy as np\nclass St...,"[Error Handling and Robustness, Code Structure...","```json\n{\n ""Constraints"": [\n {\n ...","[{'type': 'Error Handling and Robustness', 'co...",{'constraint_evaluations': [{'constraint_text'...,4.57,4.71,4.43,4.57
34,Multilingual-Multimodal-NLP/McEval-Instruct,Write a Python function `rotate_text` that tak...,"```python\ndef rotate_text(text, rotation):\n ...","[Code Structure and Modularity, Input and Outp...","```json\n{\n ""Constraints"": [\n {\n ...","[{'type': 'Code Structure and Modularity', 'co...",{'constraint_evaluations': [{'constraint_text'...,4.33,4.33,4.0,4.22
29,bigcode/bigcodebench,"Search for occurrences of the word ""error"" in ...",if not os.path.isdir(dir_path):\n raise...,"[Input and Output Handling, Error Handling and...","```json\n{\n ""Constraints"": [\n {\n ...","[{'type': 'Input and Output Handling', 'constr...",{'constraint_evaluations': [{'constraint_text'...,4.4,4.8,4.4,4.53


# Method 2 of Constrant Generation

In [118]:
def get_prompt_constraint_generation_v2(instruction, code, all_constraint_categories):
    prompt = f""" "task": "You are an expert in generating **actionable, precise, and objective** constraints for code generation tasks by analyzing a natural language instruction and the associated code. Your primary objective is to first, identify the **most suitable and relevant constraint categories** from a provided comprehensive list based on the natural language instruction and associated code. Second, you will then generate meaningful constraints that guide a model to correctly implement or reason about the task in line with these *selected* categories. **Avoid generic or subjective constraints; each constraint must have a clear, verifiable requirement.**",

    "context": "You are provided with three inputs: (1) a natural language `instruction` describing a coding task, (2) the actual `code` implementing or partially implementing that instruction, and (3) a **comprehensive list of all available constraint categories** (all_constraint_categories). You must infer both explicit and implicit requirements from these inputs.",

    "goal": "Your job is to:
    1.  **Select all relevant constraint categories** from the `all_constraint_categories` list that precisely apply to the given `instruction` and `code`. Ensure you select at least 2 relevant categories.
        **When considering a category, imagine the possible constraints that could arise from it for this specific instruction. If there is even a *slight or indirect possibility* for a constraint from that category to occur, or if the category could *potentially* influence the implementation based on implicit requirements, then include that category. Lean towards being inclusive rather than exclusive in category selection.**
    2.  Based on the selected categories, examine the instruction and code to generate 5 to 10 natural language constraints. These constraints should help an LLM generate code that is correct, complete, robust, and aligned with good development practices.
    **Crucially, each generated constraint must be specific, objective, and leave little room for subjective interpretation.** For example, instead of 'Write well-documented code,' specify 'Include docstrings for all functions and classes, detailing parameters, return values, and a brief purpose.' Each constraint should pose a tangible challenge for the model and be clearly verifiable. The aim is to create constraints that lead to a noticeable difference in model performance if adhered to, making them useful for differentiating models.
    **IMPORTANT: The 'type' field for each constraint in the 'Constraints' array MUST be one of the categories explicitly listed in the 'relevant_categories' array. There should be a perfect overlap.**",

    "JSON Response Format": {{
        "relevant_categories": [
            "List of selected relevant categories from 'all_constraint_categories'. Each string should be the exact name of the matching supercategory."
        ],
        "Constraints": [
            {{
                "type": "Constraint_Category_Name", // Must be one of the 'relevant_categories' you selected
                "constraint": "Specific, objective, and atomic constraint statement.",
                "instruction_part": "A small, relevant snippet from the original instruction that this constraint relates to, or 'Implicit' if not directly from instruction."
            }},
            // ... for each constraint
        ]
    }},

    "Inputs Required": {{
        "instruction": {instruction}
        "code": {code}
        "all_constraint_categories": {all_constraint_categories}

    }} """
    return prompt
prompt = get_prompt_constraint_generation_v2(df.iloc[0]["instruction"],df.iloc[0]["code"],categories)
print(prompt)


 "task": "You are an expert in generating **actionable, precise, and objective** constraints for code generation tasks by analyzing a natural language instruction and the associated code. Your primary objective is to first, identify the **most suitable and relevant constraint categories** from a provided comprehensive list based on the natural language instruction and associated code. Second, you will then generate meaningful constraints that guide a model to correctly implement or reason about the task in line with these *selected* categories. **Avoid generic or subjective constraints; each constraint must have a clear, verifiable requirement.**",

    "context": "You are provided with three inputs: (1) a natural language `instruction` describing a coding task, (2) the actual `code` implementing or partially implementing that instruction, and (3) a **comprehensive list of all available constraint categories** (all_constraint_categories). You must infer both explicit and implicit requi

In [119]:
response = get_response(prompt)
print(response)

```json
{
    "relevant_categories": [
        "Error Handling and Robustness",
        "Input and Output Handling",
        "Code Structure and Modularity"
    ],
    "Constraints": [
        {
            "type": "Error Handling and Robustness",
            "constraint": "Ensure that all methods in the Course class return None when the students list is empty.",
            "instruction_part": "Fix the methods in `Course` so that all of them never throw errors and return `None` if the length of their students list is 0."
        },
        {
            "type": "Error Handling and Robustness",
            "constraint": "Implement checks in the average_gpa and best_student methods to handle cases where the students list is empty without throwing exceptions.",
            "instruction_part": "Fix the methods in `Course` so that all of them never throw errors and return `None` if the length of their students list is 0."
        },
        {
            "type": "Input and Output Handling"

In [120]:
def generate_constraints(df,constraint_generation_method,output_pth,input_col1,input_col2):
    constraints = []
    categories = []
    for i, row in df.iterrows():
        print(f"Processing row {i}")
        prompt = constraint_generation_method(row[input_col1], row[input_col2], categories)
        response = get_response(prompt)
        constraint = extract_constraints(response,"Constraints")
        category = extract_constraints(response,"relevant_categories")
        constraints.append(constraint)
        categories.append(category)

    df["constraints"] = constraints
    df["relevant_categories"] = categories
    df.to_csv(output_pth, index=False)
    print("Saved the file successfully")
    return df

In [121]:
extract_constraints(response,"Constraints")

[{'type': 'Error Handling and Robustness',
  'constraint': 'Ensure that all methods in the Course class return None when the students list is empty.',
  'instruction_part': 'Fix the methods in `Course` so that all of them never throw errors and return `None` if the length of their students list is 0.'},
 {'type': 'Error Handling and Robustness',
  'constraint': 'Implement checks in the average_gpa and best_student methods to handle cases where the students list is empty without throwing exceptions.',
  'instruction_part': 'Fix the methods in `Course` so that all of them never throw errors and return `None` if the length of their students list is 0.'},
 {'type': 'Input and Output Handling',
  'constraint': 'Make sure that the average_gpa method correctly calculates the average GPA only when there are students present.',
  'instruction_part': 'Fix the methods in `Course` so that all of them never throw errors and return `None` if the length of their students list is 0.'},
 {'type': 'Inpu

In [122]:
extract_constraints(response,"relevant_categories")

['Error Handling and Robustness',
 'Input and Output Handling',
 'Code Structure and Modularity']

In [123]:
test_df2 = df.copy()
test_df2 = generate_constraints(test_df2,get_prompt_constraint_generation_v2,"step4_with_constraints_method_2.csv","instruction","code")
test_df2.head()

Processing row 52
Processing row 34
Processing row 29
Saved the file successfully


Unnamed: 0,dataset,instruction,code,relevant_categories,constraints
52,nuprl/CanItEdit,Fix the methods in `Course` so that all of the...,import functools\nimport numpy as np\nclass St...,"[Error Handling, Code Structure, Naming Conven...","[{'type': 'Error Handling', 'constraint': 'Ens..."
34,Multilingual-Multimodal-NLP/McEval-Instruct,Write a Python function `rotate_text` that tak...,"```python\ndef rotate_text(text, rotation):\n ...","[Error Handling, Code Structure]","[{'type': 'Error Handling', 'constraint': 'Imp..."
29,bigcode/bigcodebench,"Search for occurrences of the word ""error"" in ...",if not os.path.isdir(dir_path):\n raise...,"[Error Handling, Code Structure]","[{'type': 'Error Handling', 'constraint': 'The..."


In [124]:
test_df2 = measure_constraints(test_df2,"step5_with_quality_scores_m2.csv","instruction","code","constraints","quality_scores")
test_df2.head()

Processing row 52
Processing row 34
Processing row 29
Saved the file successfully


Unnamed: 0,dataset,instruction,code,relevant_categories,constraints,quality_scores,specificity_score,objectivity_score,atomicity_score,unified_quality_score
52,nuprl/CanItEdit,Fix the methods in `Course` so that all of the...,import functools\nimport numpy as np\nclass St...,"[Error Handling, Code Structure, Naming Conven...","[{'type': 'Error Handling', 'constraint': 'Ens...",{'constraint_evaluations': [{'constraint_text'...,4.0,4.5,4.67,4.39
34,Multilingual-Multimodal-NLP/McEval-Instruct,Write a Python function `rotate_text` that tak...,"```python\ndef rotate_text(text, rotation):\n ...","[Error Handling, Code Structure]","[{'type': 'Error Handling', 'constraint': 'Imp...",{'constraint_evaluations': [{'constraint_text'...,4.0,4.5,4.0,4.17
29,bigcode/bigcodebench,"Search for occurrences of the word ""error"" in ...",if not os.path.isdir(dir_path):\n raise...,"[Error Handling, Code Structure]","[{'type': 'Error Handling', 'constraint': 'The...",{'constraint_evaluations': [{'constraint_text'...,5.0,5.0,5.0,5.0


In [125]:
test_df2.columns

Index(['dataset', 'instruction', 'code', 'relevant_categories', 'constraints',
       'quality_scores', 'specificity_score', 'objectivity_score',
       'atomicity_score', 'unified_quality_score'],
      dtype='object')

In [126]:
import pandas as pd

def compare_constraint_generation_methods_from_dfs(df1: pd.DataFrame, df2: pd.DataFrame):

    print(f"--- Comparing Constraint Generation Methods ---")

    if df1.empty or df2.empty:
        print("Error: One or both DataFrames are empty. Cannot perform comparison.")
        return

    # Ensure required columns are present (basic check)
    required_cols = ['specificity_score', 'objectivity_score','atomicity_score', 'unified_quality_score']
    if not all(col in df1.columns for col in required_cols) or \
       not all(col in df2.columns for col in required_cols):
        print("Error: Input DataFrames must contain 'unified_quality_score', 'avg_specificity', 'avg_objectivity', and 'avg_atomicity' columns.")
        return

    print("\n--- Average Scores per Method ---")
    print("Method 1 Averages:")
    print(df1[required_cols].mean().round(2))
    print("\nMethod 2 Averages:")
    print(df2[required_cols].mean().round(2))

    print("\n--- Statistical Comparison (Difference in Averages) ---")
    # Using .values to ensure consistent order for difference calculation
    comparison_data = {
        'Metric': ['Specificity', 'Objectivity', 'Atomicity', 'Unified Quality'],
        'Method 1 Avg': df1[required_cols].mean().values.round(2),
        'Method 2 Avg': df2[required_cols].mean().values.round(2)
    }
    comparison_df = pd.DataFrame(comparison_data)
    # Calculate difference separately to ensure order
    comparison_df['Difference (Method 2 - Method 1)'] = (comparison_df['Method 2 Avg'] - comparison_df['Method 1 Avg']).round(2)

    print(comparison_df.to_string(index=False))
    comparison_df.to_csv(f"comparison_results_N{N}.csv", index=False)
    print("\n--- Recommendation ---")
    # Ensure means are compared for the specific unified_quality_score
    unified_mean_df1 = df1['unified_quality_score'].mean()
    unified_mean_df2 = df2['unified_quality_score'].mean()

    if unified_mean_df2 > unified_mean_df1:
        print("Based on the Unified Quality Score, Method 2 appears to perform better.")
        # Check if Method 2 is better across ALL individual metrics
        if (df2['specificity_score'].mean() > df1['specificity_score'].mean() and
            df2['objectivity_score'].mean() > df1['objectivity_score'].mean() and
            df2['atomicity_score'].mean() > df1['atomicity_score'].mean()):
            print("Method 2 also shows higher averages across all individual quality metrics (specificity, objectivity, atomicity).")
    elif unified_mean_df1 > unified_mean_df2:
        print("Based on the Unified Quality Score, Method 1 appears to perform better.")
        # Check if Method 1 is better across ALL individual metrics
        if (df1['specificity_score'].mean() > df2['specificity_score'].mean() and
            df1['objectivity_score'].mean() > df2['objectivity_score'].mean() and
            df1['atomicity_score'].mean() > df2['atomicity_score'].mean()):
            print("Method 1 also shows higher averages across all individual quality metrics (specificity, objectivity, atomicity).")
    else:
        print("Both methods yield very similar Unified Quality Scores. You might need to perform a deeper qualitative analysis or consider other factors (e.g., cost, generation speed) to decide.")



test_df = pd.read_csv("step5_with_quality_scores_m1.csv")
test_df2 = pd.read_csv("step5_with_quality_scores_m2.csv")

compare_constraint_generation_methods_from_dfs(test_df, test_df2)

--- Comparing Constraint Generation Methods ---

--- Average Scores per Method ---
Method 1 Averages:
specificity_score        4.43
objectivity_score        4.61
atomicity_score          4.28
unified_quality_score    4.44
dtype: float64

Method 2 Averages:
specificity_score        4.33
objectivity_score        4.67
atomicity_score          4.56
unified_quality_score    4.52
dtype: float64

--- Statistical Comparison (Difference in Averages) ---
         Metric  Method 1 Avg  Method 2 Avg  Difference (Method 2 - Method 1)
    Specificity          4.43          4.33                             -0.10
    Objectivity          4.61          4.67                              0.06
      Atomicity          4.28          4.56                              0.28
Unified Quality          4.44          4.52                              0.08

--- Recommendation ---
Based on the Unified Quality Score, Method 2 appears to perform better.


# Step 4 - Compare New Constraints with Previous Ones for Quality Improvement
- Skipping for now

# Step 6 - Generate constraints for each row in the benchmark dataset.


# Step 8 - Generate code from Instruction and Generated Constraints

In [127]:
def get_codegeneration_prompt(instruction,constraints):
    prompt = f""" You are a skilled Python programmer. Based on the following natural language instruction and a set of implementation constraints, generate Python code that satisfies the instruction and fully adheres to all constraints.

    ### Instruction:
    {instruction}

    ### Constraints:
    {constraints}

    ### Requirements:
    - Ensure the code is clean, correct, and follows Python best practices.
    - Strictly follow all the constraints, even if they are not explicitly stated in the instruction.
    - Do not include any explanatory text; return only the code block.

    ### Output Format:
    Return a single Python code block that solves the task.


    # Your code here


    """
    return prompt


print(get_codegeneration_prompt(test_df2.iloc[0,0],test_df2.iloc[0,6]))

 You are a skilled Python programmer. Based on the following natural language instruction and a set of implementation constraints, generate Python code that satisfies the instruction and fully adheres to all constraints.

    ### Instruction:
    nuprl/CanItEdit

    ### Constraints:
    4.0

    ### Requirements:
    - Ensure the code is clean, correct, and follows Python best practices.
    - Strictly follow all the constraints, even if they are not explicitly stated in the instruction.
    - Do not include any explanatory text; return only the code block.

    ### Output Format:
    Return a single Python code block that solves the task.


    # Your code here


    


In [128]:
prompt = get_codegeneration_prompt(test_df2.iloc[0,0],test_df2.iloc[0,6])
code = get_response(prompt)
print(code)

```python
def can_edit_nuprl(input_string):
    # Check if the input string is editable based on some criteria
    # For demonstration, let's assume we can edit if the string is not empty
    return bool(input_string)

# Example usage
if __name__ == "__main__":
    test_string = "Edit this string"
    print(can_edit_nuprl(test_string))  # Output: True
```


In [None]:
def generate_code(row,input_col1="instruction",input_col2="extract_constraints"):
    print(row[input_col1][:10])
    prompt = get_codegeneration_prompt(row[input_col1],row[input_col2])
    code = get_response(prompt)

    return code
generate_code(test_df2.iloc[0])



In the c


'```python\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\n\nclass Options:\n    def __init__(self, lr_G, lr_D, lr_M, beta1, beta2, input_nc, output_nc):\n        self.lr_G = lr_G\n        self.lr_D = lr_D\n        self.lr_M = lr_M\n        self.beta1 = beta1\n        self.beta2 = beta2\n        self.input_nc = input_nc\n        self.output_nc = output_nc\n\nclass Generator(nn.Module):\n    def __init__(self, input_nc, output_nc):\n        super(Generator, self).__init__()\n        # Define generator architecture here\n\n    def forward(self, x):\n        # Define forward pass\n        return x\n\nclass Discriminator(nn.Module):\n    def __init__(self, input_nc):\n        super(Discriminator, self).__init__()\n        # Define discriminator architecture here\n\n    def forward(self, x):\n        # Define forward pass\n        return x\n\ndef initialize_cyclegan_components(options):\n    components = {}\n    \n    try:\n        # Initialize generators\n        G_AB = 

In [131]:
test_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   dataset                3 non-null      object 
 1   instruction            3 non-null      object 
 2   code                   3 non-null      object 
 3   relevant_categories    3 non-null      object 
 4   constraints            3 non-null      object 
 5   quality_scores         3 non-null      object 
 6   specificity_score      3 non-null      float64
 7   objectivity_score      3 non-null      float64
 8   atomicity_score        3 non-null      float64
 9   unified_quality_score  3 non-null      float64
dtypes: float64(4), object(6)
memory usage: 372.0+ bytes


In [132]:
from tqdm import tqdm
import pandas as pd

# Enable tqdm for pandas apply
tqdm.pandas()
test_df2["generated_code"] = test_df2.apply(generate_code,input_col2="constraints",axis=1)

Fix the me
Write a Py
Search for


In [None]:
test_df2.head()

In [None]:
test_df2.to_csv("Mceval_generated_code.csv",index=False)

In [None]:
def generate_code_without_constraints(row,input_col1="instruction"):
    print(row[input_col1][:10])
    prompt = f"""You are a skilled Python programmer. Based on the following natural language instruction, generate Python code that satisfies the instruction.

    ### Instruction:
    {row[input_col1]}

    ### Requirements:
    - Ensure the code is clean, correct, and follows Python best practices.
    - Do not include any explanatory text; return only the code block.

    ### Output Format:
    Return a single Python code block that solves the task.


    # Your code here


    """
    code = get_response(prompt)

    return code

code1= generate_code_without_constraints(test_df2.iloc[0])

In [None]:
print(code1)

In [None]:
code2 = generate_code(test_df2.iloc[0])
print(code2)

In [None]:
from IPython.display import Markdown, display
def md(text):
    display(Markdown(text))
md(code1)


In [None]:
md(code2)

# Step 9 - Evaluate Generated Code and Calculate Metrics

In [None]:
import difflib

code1_lines = code1.strip().splitlines()
code2_lines = code2.strip().splitlines()

# Get the diff
diff = difflib.unified_diff(code1_lines, code2_lines, fromfile='code1.py', tofile='code2.py', lineterm='')

# Print the diff
print("\n".join(diff))
