In [None]:
import copy
from pathlib import Path
import random
from time import sleep, time

import anthropic
from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
from anthropic.types.messages.batch_create_params import Request
from claudette import mk_msg

In [None]:
# Few-shot example labelled correct ("Correct: true"): the comment describes the
# early return for a 0-dimension tensor. Interpolated into `examples_small`.
relevant_comment_example1 =\
"""
Function Definition:
'''
def _create_rearrange_callable(
    tensor_ndim: int, pattern: str, **axes_lengths: int
) -> Callable[[torch.Tensor], torch.Tensor]
'''

Code:
'''
    n_dims = n_named_dims + n_ellipsis_dims + n_anon_dims
    
    if n_dims == 0:
        # an identity rearrangement on a 0-dimension tensor
        return lambda tensor: tensor
    
    first_class_dims: Tuple[str, ...] = tuple(f"d{i}" for i in range(n_dims))
'''

Comment:
'''
# an identity rearrangement on a 0-dimension tensor
'''

Explanation:
'''
The comment indicates that for a 0-dimension tensor we have an early return.
'''

Correct:
true
"""

# Few-shot example labelled correct ("Correct: true"): a comment that is somewhat
# redundant but still accurately states what the code does and why.
# The explanation refers to the *comment* being redundant (it previously said "code").
relevant_comment_example2 =\
"""
Function Definition:
'''
def setup_loading_other_datasets()
'''

Code:
'''
    except ImportError:
        raise SkipTest("Skipping loading_other_datasets.rst, pandas not installed")
    
    # checks SKLEARN_SKIP_NETWORK_TESTS to see if test should run
    run_network_tests = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0"
    if not run_network_tests:
        raise SkipTest(
'''

Comment:
'''
# checks SKLEARN_SKIP_NETWORK_TESTS to see if test should run
'''

Explanation:
'''
While the comment might be considered a bit redundant, it does correctly explain what is being done and for what purpose.
'''

Correct:
true
"""

# Few-shot example labelled incorrect ("Correct: false"): the comment claims the
# reference is created from the input state, which the code does not do.
irrelevant_comment_example1 =\
"""
Function Definition:
'''
def __setstate__(self, state) -> None:
'''

Code:
'''
    if state[0] is None:
        # create a reference from the input state
        self.hooks_dict_ref = weakref.ref(OrderedDict())
    else:
        self.hooks_dict_ref = weakref.ref(state[0])
    self.id = state[1]
'''

Comment:
'''
# create a reference from the input state
'''

Explanation:
'''
While the comment does talk about creating a reference, the reference is not created from the input state.
'''

Correct:
false
"""

# Few-shot example labelled incorrect ("Correct: false"): the comment is accurate
# but merely restates the line directly below it.
irrelevant_comment_example2 =\
"""
Function Definition:
'''
def sin(
    angle_in_degrees: float, accuracy: int = 18, rounded_values_count: int = 10
) -> float
'''

Code:
'''
    # Simplify the angle to be between 360 and -360 degrees.
    angle_in_degrees = angle_in_degrees - ((angle_in_degrees // 360.0) * 360.0)

    # Converting from degrees to radians
    angle_in_radians = radians(angle_in_degrees)

    result = angle_in_radians
    a = 3
    b = -1
'''

Comment:
'''
# Converting from degrees to radians
'''

Explanation:
'''
While the comment is correct, it's completely redundant given that it basically rewords what's written in the one row below.
'''

Correct:
false
"""


# Few-shot section of the prompt: both correct examples first, then both
# incorrect ones, each fenced by "-----" markers.
examples_small = "".join([
    "\nExamples of correct code-comment pairs:\n",
    f"Example 1 -----{relevant_comment_example1}-----\n\n",
    f"Example 2 -----{relevant_comment_example2}-----\n\n",
    "Examples of incorrect code-comment pairs:\n",
    f"Example 1 -----{irrelevant_comment_example1}-----\n\n",
    f"Example 2 -----{irrelevant_comment_example2}-----\n",
])

# Per-request instruction template. Positional slots, in order:
# correctness label, domain, author role, code quality.
prompt = (
    "You need to generate one new {} code-comment pair. The code should be written about {}. "
    "Assume that the code was written by {} and the code is of {} quality."
)

def create_prompt(is_relevant, domain, role, quality):
    """Fill the `prompt` template for a single generation request.

    Args:
        is_relevant: Truthy to ask for a "correct" pair, falsy for an "incorrect" one.
        domain: What the generated code should be about.
        role: Description of the supposed author of the code.
        quality: Quality level the code should have.

    Returns:
        The formatted instruction string.
    """
    label = "correct" if is_relevant else "incorrect"
    return prompt.format(label, domain, role, quality)

# Task description and output-format rules appended after the few-shot examples.
prompt_mid = " ".join([
    "You are a system used to generate high-quality synthetic data for a classifier model.",
    "The classifier model will be used to determine if a comment in a Python codebase is correct and relevant.",
    'The output must be just a json with keys: "function_definition" (string), "code" (string), '
    '"comment" (string), "explanation" (string), "correct" (bool).',
    "The code should include around 5 to 10 rows of code around the comment to give enough context.",
    "The comment itself has to be smaller than 4 lines.",
])
# Shared prompt prefix: examples first, then the instructions.
prompt_start_small = f"{examples_small}{prompt_mid}"
# Built once; `make_messages` deep-copies it for every request.
prepared_prompt_start_small = mk_msg(prompt_start_small)

def make_messages(is_relevant, domain, role, quality):
    """Assemble the message list for one generation request.

    The list holds a fresh deep copy of the shared prefix message followed by
    a user message carrying the request-specific instruction from
    `create_prompt`.
    """
    shared_prefix = copy.deepcopy(prepared_prompt_start_small)
    instruction = mk_msg(
        create_prompt(is_relevant, domain, role, quality),
        role="user",
        cache=False,
    )
    return [shared_prefix, instruction]


# Model identifier placed on every request built by `prepare_batch`.
model_name = "claude-3-haiku-20240307"

In [None]:
# Number of requests in the small trial batch built before the full-size run.
test_batch_size = 3

In [None]:
# Topics the generated code can be about.
domains = [
    "working with databases", "writing a CURD server", "writing a CLI for an existing codebase",
    "machine learning", "deep learning", "financial data analysis", "time series forecasting",
    "web scraping", "aws integration", "preparing visual for a report", 
    "few-off scripts for data transformations"
]
# Occupations of the supposed code author.
occs = [
    "data scientist", "data engineer", "student", "PHD student",
    "automation engineer", "programmer", "data analyst", "cloud engineer", "machine learning engineer",
    "scientist", "devops engineer", "frontend developer", "backend developer"
]
# Article plus optional adjective placed in front of the occupation.
# The bare article is "a" with no trailing space: the separator is added when
# `roles` is built, and a doubled space would turn into "__" in request ids,
# which is also the delimiter between id fields.
profs = [
    "a novice", "a", "a senior", "a hobbyist", "an overworked", "a tired",
    "the most prolific", "a grumpy", "a sleepy", "a good", "a bad"
]
# Quality levels the generated code can be asked to have.
qualities = [
    "high", "low", "decent", "perfect", "awful"
]
# Every prefix/occupation combination (occupation varies slowest); prefixes are
# stripped so each role contains single spaces only.
roles = [f"{p.strip()} {o}" for o in occs for p in profs]
# Number of distinct (domain, role, quality, label) combinations.
print(len(domains) * len(roles) * len(qualities) * 2)

15730


In [None]:
def prepare_batch(size, seed=0, max_tokens=520, skip=1000):
    """Build `size` batch requests with pseudo-randomly sampled prompt settings.

    Domains, roles and qualities are each sampled with replacement from the
    module-level `domains`, `roles` and `qualities` lists. For every list,
    `size + skip` values are drawn and the first `skip` are discarded, so
    different `skip` values select different windows of the same seeded
    sequence.

    Args:
        size: Number of requests to build.
        seed: Seed of the sampling sequence.
        max_tokens: `max_tokens` value set on every request.
        skip: Number of leading samples dropped from each sampled sequence.

    Returns:
        A `(requests, ids)` tuple. `ids` holds the full identifiers
        `"{index}__{domain}__{role}__{label}"` (spaces replaced by underscores;
        the sampled quality is not part of the identifier). Each request's
        `custom_id` is that identifier cut to its first 64 characters, so long
        identifiers lose their tail (including the label) and results have to
        be matched on the truncated form.
    """
    # A private generator produces the same sequence as `random.seed(seed)`
    # followed by module-level calls, without reseeding the process-wide
    # generator as a side effect.
    rng = random.Random(seed)
    batch_domains = rng.choices(domains, k=size + skip)[skip:]
    batch_roles = rng.choices(roles, k=size + skip)[skip:]
    batch_qualities = rng.choices(qualities, k=size + skip)[skip:]

    requests = []
    ids = []
    for i, (d, r, q) in enumerate(zip(batch_domains, batch_roles, batch_qualities)):
        # Labels alternate: odd positions request a correct pair, even ones an incorrect pair.
        rel = i % 2
        full_id = f'{i}__{d.replace(" ", "_")}__{r.replace(" ", "_")}__{rel}'
        ids.append(full_id)
        requests.append(Request(
            # Cut to 64 characters (presumably the API's limit for custom ids);
            # the leading index keeps truncated ids unique.
            custom_id=full_id[:64],
            params=MessageCreateParamsNonStreaming(
                model=model_name,
                max_tokens=max_tokens,
                messages=make_messages(rel, d, r, q)
            )
        ))
    return requests, ids

In [None]:
# Smoke test: build the small trial batch, then show its ids and the first request.
reqs, ids = prepare_batch(test_batch_size)
print(ids)
reqs[0]

['0__few-off_scripts_for_data_transformations__an_overworked_student__0', '1__preparing_visual_for_a_report__a_hobbyist_student__1', '2__aws_integration__the_most_prolific_machine_learning_engineer__0']


{'custom_id': '0__few-off_scripts_for_data_transformations__an_overworked_stude',
 'params': {'model': 'claude-3-haiku-20240307',
  'max_tokens': 520,
  'messages': [{'role': 'user',
    'content': [{'type': 'text',
      'text': '\nExamples of correct code-comment pairs:\nExample 1 -----\nFunction Definition:\n\'\'\'\ndef _create_rearrange_callable(\n    tensor_ndim: int, pattern: str, **axes_lengths: int\n) -> Callable[[torch.Tensor], torch.Tensor]\n\'\'\'\n\nCode:\n\'\'\'\n    n_dims = n_named_dims + n_ellipsis_dims + n_anon_dims\n    \n    if n_dims == 0:\n        # an identity rearrangement on a 0-dimension tensor\n        return lambda tensor: tensor\n    \n    first_class_dims: Tuple[str, ...] = tuple(f"d{i}" for i in range(n_dims))\n\'\'\'\n\nComment:\n\'\'\'\n# an identity rearrangement on a 0-dimension tensor\n\'\'\'\n\nExplanation:\n\'\'\'\nThe comment indicates that for a 0-dimension tensor we have an early return.\n\'\'\'\n\nCorrect:\ntrue\n-----\n\nExample 2 -----\nFuncti

In [None]:
# API client used by the batch helper functions. No credentials are passed
# here, so it presumably picks them up from the environment — TODO confirm.
client = anthropic.Anthropic()

In [None]:
def create_batch_and_wait(requests, delta=10):
    """Submit `requests` as one message batch and block until the batch has ended.

    Args:
        requests: Batch requests to submit.
        delta: Seconds to sleep between two status polls.

    Returns:
        The most recently retrieved batch object; its `processing_status`
        is "ended" when this function returns.
    """
    message_batch = client.messages.batches.create(requests=requests)
    st = time()

    # Poll until the terminal status. Stopping as soon as the status is no
    # longer "in_progress" would also return a batch that is still being
    # cancelled ("canceling"), i.e. before it has actually ended.
    while message_batch.processing_status != "ended":
        sleep(delta)
        message_batch = client.messages.batches.retrieve(
            message_batch.id,
        )
        print(f"Processing status is {message_batch.processing_status} ({time() - st:.2f})")
    
    return message_batch

In [None]:
# Submit the trial batch and wait (polling) for it to be processed.
message_batch = create_batch_and_wait(reqs)

Processing status is in_progress (10.32)
Processing status is in_progress (20.58)
Processing status is in_progress (30.84)
Processing status is in_progress (41.57)
Processing status is in_progress (51.83)
Processing status is ended (62.08)


In [None]:
# Show the batch id, e.g. so the batch can be looked up again later.
print(message_batch.id)

In [None]:
import pickle

In [None]:
def retrieve_and_save_batch(message_batch, path):
    """Fetch the results of a batch, pickle them to `path`, and return them.

    Prints the number of succeeded results and the total number of results.

    Args:
        message_batch: Batch object whose `id` is used to fetch the results.
        path: Destination file for the pickled results dictionary.

    Returns:
        Dict mapping each result's `custom_id` to `{"type": <result type>}`.
        Results of type "succeeded" additionally carry a `"content"` entry with
        the text of the message's first content block, or an empty string when
        the message has no content blocks.
    """
    results = {}
    total = 0
    success = 0
    for result in client.messages.batches.results(message_batch.id):
        total += 1
        entry = {"type": result.result.type}
        if result.result.type == "succeeded":
            success += 1
            blocks = result.result.message.content
            # A succeeded message without content blocks would otherwise raise
            # IndexError and abort the loop before anything is written to disk.
            entry["content"] = blocks[0].text if blocks else ""
        results[result.custom_id] = entry
    with Path(path).open("wb") as f:
        pickle.dump(results, f)
    print(success, total)
    return results

In [None]:
# Fetch the trial batch results and pickle them to the file "test_batch"
# (relative to the working directory); the returned dict is displayed.
retrieve_and_save_batch(message_batch, "test_batch")

3 3


{'0__few-off_scripts_for_data_transformations__an_overworked_stude': {'type': 'succeeded',
  'content': '{\n  "function_definition": "def transform_data(data: pd.DataFrame, feature_cols: List[str], target_col: str) -> Tuple[np.ndarray, np.ndarray]:",\n  "code": """\n    # Drop any rows with missing values\n    data = data.dropna()\n\n    # One-hot encode categorical features\n    data = pd.get_dummies(data, columns=feature_cols)\n\n    # Split the data into features and target\n    X = data[feature_cols].values\n    y = data[target_col].values\n\n    # Standardize the features\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n\n    return X, y\n  """,\n  "comment": "# Normalize the features using MinMaxScaler",\n  "explanation": "The comment is incorrect as the code uses StandardScaler to standardize the features, not MinMaxScaler to normalize them.",\n  "correct": false\n}'},
 '1__preparing_visual_for_a_report__a_hobbyist_student__1': {'type': 'succeeded',
  'content': 

In [None]:
# Full-size run: 9000 requests built with the default seed and skip.
# Note that this rebinds `reqs` and `ids` from the trial batch.
batch_size = 9000
reqs, ids = prepare_batch(batch_size)

In [None]:
# Submit the full batch, wait for it, and pickle the results.
# The inline figures are the author's record of this run (presumably cost and
# input/output token counts). The file name presumably reflects the sampled
# window: skip=1000 up to size + skip = 10000.
message_batch = create_batch_and_wait(reqs)  # 2.87$  8,878,519 in  2,820,075 out
_ = retrieve_and_save_batch(message_batch, "role_domain_1000_10000")

Processing status is in_progress (10.35)
Processing status is in_progress (20.61)
Processing status is in_progress (30.89)
Processing status is in_progress (41.15)
Processing status is in_progress (51.38)
Processing status is in_progress (61.63)
Processing status is in_progress (71.99)
Processing status is in_progress (82.24)
Processing status is in_progress (92.97)
Processing status is in_progress (103.29)
Processing status is in_progress (113.63)
Processing status is in_progress (123.86)
Processing status is in_progress (134.57)
Processing status is in_progress (144.81)
Processing status is in_progress (155.05)
Processing status is in_progress (165.62)
Processing status is in_progress (175.85)
Processing status is in_progress (186.09)
Processing status is in_progress (196.40)
Processing status is in_progress (206.65)
Processing status is in_progress (216.88)
Processing status is in_progress (227.22)
Processing status is in_progress (237.49)
Processing status is in_progress (247.74)
P

In [None]:
# Persist the full request ids returned by `prepare_batch` in their own pickle file.
ids_path = Path("role_domain_1000_10000_ids")
with ids_path.open("wb") as ids_file:
    pickle.dump(ids, ids_file)