In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Hugging Face Hub id of the instruction-tuned checkpoint used in this notebook.
checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing when tensors are moved to the device.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Fetch the tokenizer and the causal-LM weights for the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)
# Place the weights on the configured device.
model = model.to(device)

In [None]:
# Build a two-turn conversation (system instruction + user question) and
# render it with the chat template as plain text to inspect the markup.
system_turn = {
    "role": "system",
    "content": "You are a question answering systems that gives only silly and short answers.",
}
user_turn = {"role": "user", "content": "What is gravity?"}
messages = [system_turn, user_turn]

input_text = tokenizer.apply_chat_template(messages, tokenize=False)
print(input_text)

<|im_start|>system
You are a question answering systems that gives only silly and short answers.<|im_end|>
<|im_start|>user
What is gravity?<|im_end|>



In [None]:
# Tokenize the rendered chat text into a PyTorch tensor, then move it to the device.
encoded_prompt = tokenizer.encode(input_text, return_tensors="pt")
input_tokens = encoded_prompt.to(device)
input_tokens

tensor([[    1,  9690,   198,  2683,   359,   253,  1962, 16522,  1734,   338,
          3894,   805, 22933,   284,  1890,  5360,    30,     2,   198,     1,
          4093,   198,  1780,   314,  9354,    47,     2,   198]],
       device='cuda:0')

In [None]:
# Sampling disabled, capped at 50 newly generated tokens.
greedy_settings = {"max_new_tokens": 50, "do_sample": False}
output_tokens = model.generate(input_tokens, **greedy_settings)
output_tokens

tensor([[    1,  9690,   198,  2683,   359,   253,  1962, 16522,  1734,   338,
          3894,   805, 22933,   284,  1890,  5360,    30,     2,   198,     1,
          4093,   198,  1780,   314,  9354,    47,     2,   198,     1,   520,
          9531,   198, 22007,  6463,   314,   253,  3075,   338, 26648,  3401,
          2258,   971,   550,    28,  4439,   601,   288,  2332,  2258,   260,
          2118,    30,   657,   506,   253,  4959,  1909,   281,  6944,   338,
          2311,  3556,  1701,  3401,  1485,   284,  1701,  9354,   314,   732,
           357,   314,    30,     2]], device='cuda:0')

In [None]:
# Decode the first (and only) sequence of the batch back into text.
first_sequence = output_tokens[0]
output_text = tokenizer.decode(first_sequence)
print(output_text)

<|im_start|>system
You are a question answering systems that gives only silly and short answers.<|im_end|>
<|im_start|>user
What is gravity?<|im_end|>
<|im_start|>assistant
Gravity is a force that pulls objects towards each other, causing them to fall towards the ground. It's a fundamental concept in physics that helps explain why objects move and why gravity is what it is.<|im_end|>


In [None]:
def generate(messages, model=model, tokenizer=tokenizer, **kwargs):
    """Run a chat conversation through the model and return the decoded text.

    Args:
        messages: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts, as accepted by ``tokenizer.apply_chat_template``.
        model: Causal language model used for generation. The default is bound
            when this cell runs, so re-run the cell after reloading the model.
        tokenizer: Tokenizer that provides the chat template and the decoding.
            The default is bound when this cell runs, like ``model``.
        **kwargs: Forwarded unchanged to ``model.generate`` (for example
            ``max_new_tokens``, ``do_sample``, ``temperature``).

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated sequences:
        one decoded string per sequence, special tokens included.
    """
    inputs = tokenizer.apply_chat_template(
        messages,
        # Open the assistant turn explicitly so the model answers the user
        # instead of having to produce the assistant header on its own.
        add_generation_prompt=True,
        # Request a dict so the attention mask is passed to `generate` as well.
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    output_tokens = model.generate(**inputs, **kwargs)
    return tokenizer.batch_decode(output_tokens)

In [None]:
# Run the conversation through the helper: no sampling, at most 50 new tokens.
generate(
    messages,
    max_new_tokens=50,
    do_sample=False,
)

["<|im_start|>system\nYou are a question answering systems that gives only silly and short answers.<|im_end|>\n<|im_start|>user\nWhat is gravity?<|im_end|>\n<|im_start|>assistant\nGravity is a force that pulls objects towards each other, causing them to fall towards the ground. It's a fundamental concept in physics that helps explain why objects move and why gravity is what it is.<|im_end|>"]

In [None]:
# Stronger system instruction insisting on silliness; no sampling, 50 new tokens.
sillier_messages = [
    {
        "role": "system",
        "content": "You are a question answering systems that gives only really silly and short answers. The answers have to be silly.",
    },
    {"role": "user", "content": "What is gravity?"},
]
generate(sillier_messages, max_new_tokens=50, do_sample=False)

["<|im_start|>system\nYou are a question answering systems that gives only really silly and short answers. The answers have to be silly.<|im_end|>\n<|im_start|>user\nWhat is gravity?<|im_end|>\n<|im_start|>assistant\nGravity is the force that keeps us from floating away from the Earth. It's a bit like the glue that holds the Earth together, but it's not a force that can be used to move objects. It's a fundamental"]

In [None]:
# Same "really silly" conversation, with the budget raised to 500 new tokens
# so the answer is not cut off by the token limit.
sillier_messages = [
    {
        "role": "system",
        "content": "You are a question answering systems that gives only really silly and short answers. The answers have to be silly.",
    },
    {"role": "user", "content": "What is gravity?"},
]
generate(sillier_messages, max_new_tokens=500, do_sample=False)

["<|im_start|>system\nYou are a question answering systems that gives only really silly and short answers. The answers have to be silly.<|im_end|>\n<|im_start|>user\nWhat is gravity?<|im_end|>\n<|im_start|>assistant\nGravity is the force that keeps us from floating away from the Earth. It's a bit like the glue that holds the Earth together, but it's not a force that can be used to move objects. It's a fundamental force that keeps everything in the universe from flying away.<|im_end|>"]

In [None]:
# Few-shot sample of a comment that matches its code (labelled "true").
# Layout: function definition, code excerpt, the comment, an explanation, the label.
relevant_comment_example = """
Function Definition:
'''
def _create_rearrange_callable(
    tensor_ndim: int, pattern: str, **axes_lengths: int
) -> Callable[[torch.Tensor], torch.Tensor]
'''

Code:
'''
n_dims = n_named_dims + n_ellipsis_dims + n_anon_dims

if n_dims == 0:
    # an identity rearrangement on a 0-dimension tensor
    return lambda tensor: tensor

first_class_dims: Tuple[str, ...] = tuple(f"d{i}" for i in range(n_dims))
'''

Comment:
'''
# an identity rearrangement on a 0-dimension tensor
'''

Explanation:
'''
The comment indicates that for a 0-dimension tensor we have an early return.
'''

Correct:
true
"""

# Few-shot sample of a comment that does not match its code (labelled "false").
# Layout: function definition, code excerpt, the comment, an explanation, the label.
irrelevant_comment_example = """
Function Definition:
'''
def __setstate__(self, state) -> None:
'''

Code:
'''
if state[0] is None:
    # create a reference from the input state
    self.hooks_dict_ref = weakref.ref(OrderedDict())
else:
    self.hooks_dict_ref = weakref.ref(state[0])
self.id = state[1]
'''

Comment:
'''
# create a reference from the input state
'''

Explanation:
'''
While the comment does talk about creating a reference, the reference is not created from the input state.
'''

Correct:
false
"""

In [None]:
# Prompt template with five positional `{}` slots, in order: the correct
# example, the incorrect example, the kind of pair to generate, the code topic,
# and the assumed author.
prompt = """
Example of a correct code-comment pair:
-----{}-----

Example of an incorrect code-comment pair:
-----{}-----

You need to generate one new {} code-comment pair. The code should be written in Python and it has to be about {}. Assume that the code was written by {}. Include around 5 to 10 rows of code around the code to give enough context. The comment itself has to be smaller than 3 lines.
Generate the output in a json format with keys: \"function_definition\" (string), \"code\" (string), \"comment\" (string), \"explanation\" (string), \"correct\" (bool).

OUTPUT:
"""

# Render one concrete instance of the template to check how it reads.
preview = prompt.format(
    relevant_comment_example,
    irrelevant_comment_example,
    "incorrect",
    "working with databases",
    "a novice data scientist",
)
print(preview)


Example of a correct code-comment pair:
-----
Function Definition:
'''
def _create_rearrange_callable(
    tensor_ndim: int, pattern: str, **axes_lengths: int
) -> Callable[[torch.Tensor], torch.Tensor]
'''

Code:
'''
n_dims = n_named_dims + n_ellipsis_dims + n_anon_dims

if n_dims == 0:
    # an identity rearrangement on a 0-dimension tensor
    return lambda tensor: tensor

first_class_dims: Tuple[str, ...] = tuple(f"d{i}" for i in range(n_dims))
'''

Comment:
'''
# an identity rearrangement on a 0-dimension tensor
'''

Explanation:
'''
The comment indicates that for a 0-dimension tensor we have an early return.
'''

Correct:
true
-----

Example of an incorrect code-comment pair:
-----
Function Definition:
'''
def __setstate__(self, state) -> None:
'''

Code:
'''
if state[0] is None:
    # create a reference from the input state
    self.hooks_dict_ref = weakref.ref(OrderedDict())
else:
    self.hooks_dict_ref = weakref.ref(state[0])
self.id = state[1]
'''

Comment:
'''
# create a r

In [None]:
# Fill in the template before sending it: the raw `prompt` still contains its
# `{}` placeholders, so the model would otherwise never see the two examples
# or the actual task description.
user_prompt = prompt.format(
    relevant_comment_example,
    irrelevant_comment_example,
    "incorrect",
    "working with databases",
    "a novice data scientist",
)
messages = [
    {"role": "system", "content": "You are a system used to generate high-quality synthetic data for a classifier model. The classifier model will be used to determine if a comment in a codebase is correct and relevant."},
    {"role": "user", "content": user_prompt}
]
output = generate(
    messages,
    max_new_tokens=500, do_sample=False
)

In [None]:
# Keep only the text that follows the assistant header. `str.partition` is
# used instead of `find` + slicing because `find` returns -1 when the marker
# is absent, which would silently chop off the start of the text.
start_string = "<|im_start|>assistant"
decoded = output[0]
_, marker, completion = decoded.partition(start_string)
# Fall back to the full decoded text when no assistant header is present.
answer = completion if marker else decoded
print(answer)


```python
def function_definition(name):
    """
    Function definition for the function "function_definition".

    Args:
        name (str): The name of the function.

    Returns:
        str: The function definition.
    """
    return f"def {name}():\n"

def code(name):
    """
    Code for the function "code".

    Args:
        name (str): The name of the function.

    Returns:
        str: The code.
    """
    return f"def {name}():\n"

def comment(name):
    """
    Comment for the function "comment".

    Args:
        name (str): The name of the function.

    Returns:
        str: The comment.
    """
    return f"// Comment for the function {name}."

def explanation(name):
    """
    Explanation for the function "explanation".

    Args:
        name (str): The name of the function.

    Returns:
        str: The explanation.
    """
    return f"// Explanation for the function {name}."

def correct(name):
    """
    Correct for the function "correct".

    Args:
   

In [None]:
# Reference sample attributed to ChatGPT 4o mini (hard-coded, not generated here).
o4_out = {
    "function_definition": "def fetch_data_from_db(query: str) -> List[Dict[str, Any]]:",
    "code": "cursor = connection.cursor()\ncursor.execute(query)\nrows = cursor.fetchall()\nresult = []\nfor row in rows:\n    result.append(row)\nreturn result",
    "comment": "# fetch data from the database",
    "explanation": "The comment suggests the function is fetching data, but it doesn't mention that the function also processes the data into a list of dictionaries before returning it.",
    "correct": False,
}

# Show the comment, the code and the explanation, separated by a divider line.
for position, field in enumerate(("comment", "code", "explanation")):
    if position:
        print("----")
    print(o4_out[field])

# fetch data from the database
----
cursor = connection.cursor()
cursor.execute(query)
rows = cursor.fetchall()
result = []
for row in rows:
    result.append(row)
return result
----
The comment suggests the function is fetching data, but it doesn't mention that the function also processes the data into a list of dictionaries before returning it.


In [None]:
# Reference sample attributed to Claude 3.5 Sonnet (hard-coded, not generated here).
claude_35_out = {
    "function_definition": "def update_customer_records(db_connection: sqlite3.Connection, customer_data: List[Dict]) -> None:",
    "code": """
def update_customer_records(db_connection: sqlite3.Connection, customer_data: List[Dict]) -> None:
    cursor = db_connection.cursor()
    # Update customer information in database
    for customer in customer_data:
        query = f"UPDATE customers SET email = '{customer['email']}' WHERE id = {customer['id']}"
        cursor.execute(query)
    
    db_connection.commit()
    cursor.close()
""",
    "comment": "# Update customer information in database",
    "explanation": "The comment is incorrect because it's too vague and misses the critical security issue - the code is vulnerable to SQL injection attacks by directly formatting strings into the query. The comment should warn about this security risk rather than just stating the obvious action being performed.",
    "correct": False,
}

# Show the comment, the code and the explanation, separated by a divider line.
for position, field in enumerate(("comment", "code", "explanation")):
    if position:
        print("----")
    print(claude_35_out[field])

# Update customer information in database
----

def update_customer_records(db_connection: sqlite3.Connection, customer_data: List[Dict]) -> None:
    cursor = db_connection.cursor()
    # Update customer information in database
    for customer in customer_data:
        query = f"UPDATE customers SET email = '{customer['email']}' WHERE id = {customer['id']}"
        cursor.execute(query)
    
    db_connection.commit()
    cursor.close()

----
The comment is incorrect because it's too vague and misses the critical security issue - the code is vulnerable to SQL injection attacks by directly formatting strings into the query. The comment should warn about this security risk rather than just stating the obvious action being performed.


In [None]:
# Seed the RNG so the sampled generation below is repeatable on the same setup.
torch.manual_seed(0)

# Fill in the template before sending it: the raw `prompt` still contains its
# `{}` placeholders, so the model would otherwise never see the two examples
# or the actual task description.
user_prompt = prompt.format(
    relevant_comment_example,
    irrelevant_comment_example,
    "incorrect",
    "working with databases",
    "a novice data scientist",
)
messages = [
    {"role": "system", "content":
     (
         # Adjacent literals are concatenated as-is, so each piece carries its
         # own trailing space to keep the sentences separated.
         "You are a system used to generate high-quality synthetic data for a classifier model. "
         "The classifier model will be used to determine if a comment in a codebase is correct and relevant. Your task is "
         "to generate data samples based on the user prompt taking into account the examples above. Make sure that the output is concise."
     )
     },
    {"role": "user", "content": user_prompt}
]
output = generate(
    messages,
    max_new_tokens=500,
    do_sample=True, temperature=0.3, top_p=0.8
)

# Keep only the text that follows the assistant header. `str.partition` is
# used instead of `find` + slicing because `find` returns -1 when the marker
# is absent, which would silently chop off the start of the text.
start_string = "<|im_start|>assistant"
decoded = output[0]
_, marker, completion = decoded.partition(start_string)
# Fall back to the full decoded text when no assistant header is present.
answer = completion if marker else decoded
print(answer)


```python
def function_definition(name):
    """
    This function defines a function with the name "function_definition".

    Args:
        name (str): The name of the function.

    Returns:
        str: The function definition.

    Raises:
        ValueError: If the name is not a string.
    """
    return f"This function defines a function with the name '{name}'."

def code(name):
    """
    This function defines a function with the name "code".

    Args:
        name (str): The name of the function.

    Returns:
        str: The function definition.

    Raises:
        ValueError: If the name is not a string.
    """
    return f"This function defines a function with the name '{name}'."

def comment(name):
    """
    This function defines a comment with the name "comment".

    Args:
        name (str): The name of the comment.

    Returns:
        str: The comment.

    Raises:
        ValueError: If the name is not a string.
    """
    return f"This function defines a 