# Step by Step Execution Reasoning Dataset

In [41]:
import re

complex_patterns = [
    r"<class '.*'>",                     # Matches class type representations
    r"<[^>]+object at 0x[\da-f]+>",       # Matches <SomeClass object at 0x123abc>
    r"at 0x[\da-f]+",                     # Matches memory addresses (useful for nested cases)
    r"^\s*<",                             # Matches lines that start with "<"
    r"<\s*>",                             # Matches empty brackets like < >
    r"<[^>]+>",                           # Generalized to match any <...> objects
]


def line_insertion(lines, assertion_line_mapping):
    #sorting the assertion lines and inserting them in descending order
    sorted_line_mapping = sorted(assertion_line_mapping, key = lambda x:x["line_no"], reverse= True)

    for dict in sorted_line_mapping:
        lines.insert(dict['line_no'], dict['assertion'])
    
    return lines


def step_by_step_assertion(lines:list, states:list):
    """
    Takes the sample code as list and the state lines numbers, returns the sample code with proper assertions
    """

    assertion_line_mapping = []   # Assertion line number to line mapping

    for line in states:
        match = re.search(r'\[STATE\](.*?)\[/STATE\]', lines[line])
        flag = 0
        if match:
            ground_truth_line = match.group(1)
        else:
            ground_truth_line = None

        lines[line] = lines[line].split("# [STATE]")[0]  #Removing state variables from the selected line

        # Safely split the ground truth line on the first "="
        if ground_truth_line:
            key_value_split = ground_truth_line.split("=", 1)
            if len(key_value_split) == 2:
                key, value = key_value_split[0].strip(), key_value_split[1].strip()

                indentation = re.match(r"^\s*", lines[line]).group()  #matching indentation for the assertion line, should be same as the selected line

                for pattern in complex_patterns:
                    if re.search(pattern, str(value)):
                        flag = 1
                        break
                if flag:
                    continue
                #Let's map line number to the assertion line
                print(value)
                assertion_line_mapping.append({
                    "line_no": line+1,
                    "assertion": f"{indentation}assert {key} == ??",
                    "actual_assertion": f"assert {key} == {value}",
                    "value": value,
                })

            else:
                continue
        else:
            continue
    
    lines = line_insertion(lines, assertion_line_mapping)

    return lines, assertion_line_mapping



In [42]:
""""
This script will help to curate the dataset for step-by-step evaluation of LLMs
    - Let's select all the <STATE> variables of the program
    - We don't want to select codes having only one <STATE> line
    - Do further filterting not to select <STATE> variables having a class object, and 
    having multiple <STATE> changes in a single line (for i.e, <STATE> variable1 </STATE> <STATE> variable2 </STATE>) 
"""

def process_code(code):
    lines = code.split("\n")
    state_lines = []

    for idx, line in enumerate(lines):
        if "# [STATE]" in line:
            state_lines.append(idx)

    if len(state_lines) == 1:
        return None, None


    if state_lines:
        lines, assertion_mapping = step_by_step_assertion(lines, state_lines)
    else:
        return None, None

    #print(lines[-3])
    # Join the modified code
    modified_code = "\n".join([line for i, line in enumerate(lines) if line.strip() and i != len(lines) - 3])

    modified_code = "\n".join(modified_code.split("\n")[1:])

    return modified_code, assertion_mapping


# Example Usage
input_code = "# <INPUT> [0, 100, 200, 0, 0, 0, 0, 0, 0, 0], 2, 1 </INPUT>\ndef exchange(a, i, j):\n    temp = a[i] # [STATE] temp = 200 [/STATE]\n    a[i] = a[j] # [STATE] a = [0, 100, 100, 0, 0, 0, 0, 0, 0, 0] [/STATE]\n    a[j] = temp # [STATE] a = [0, 200, 100, 0, 0, 0, 0, 0, 0, 0] [/STATE]\n# <OUTPUT> None </OUTPUT>\n\nexchange([0, 100, 200, 0, 0, 0, 0, 0, 0, 0], 2, 1)"

formatted_code, assertion_mapping = process_code(input_code)
print("Formatted Code:")
print(formatted_code)
print("\nAssertion Mapping:")
print(assertion_mapping)


200
[0, 100, 100, 0, 0, 0, 0, 0, 0, 0]
[0, 200, 100, 0, 0, 0, 0, 0, 0, 0]
Formatted Code:
def exchange(a, i, j):
    temp = a[i] 
    assert temp == ??
    a[i] = a[j] 
    assert a == ??
    a[j] = temp 
    assert a == ??
exchange([0, 100, 200, 0, 0, 0, 0, 0, 0, 0], 2, 1)

Assertion Mapping:
[{'line_no': 3, 'assertion': '    assert temp == ??', 'actual_assertion': 'assert temp == 200', 'value': '200'}, {'line_no': 4, 'assertion': '    assert a == ??', 'actual_assertion': 'assert a == [0, 100, 100, 0, 0, 0, 0, 0, 0, 0]', 'value': '[0, 100, 100, 0, 0, 0, 0, 0, 0, 0]'}, {'line_no': 5, 'assertion': '    assert a == ??', 'actual_assertion': 'assert a == [0, 200, 100, 0, 0, 0, 0, 0, 0, 0]', 'value': '[0, 200, 100, 0, 0, 0, 0, 0, 0, 0]'}]


In [43]:
import json

source_code_set = set()

def generate_assertion_line_value(assertion_mapping):
    assertion_line = []
    assertion_value = []

    for dict in assertion_mapping:
        for key, value in dict.items():
            if key == 'actual_assertion':
                assertion_line.append(dict[key])
            if key == 'value':
                assertion_value.append(dict[key])

    return assertion_line, assertion_value

def process_jsonl(input_file, output_file):
    # Open the input and output files
    idx = 0
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            # Parse each line as JSON
            data = json.loads(line)
            
            # Extract the "scratchpad_format" key
            source_code = data.get('Source Code')
            code = data.get("scratchpad_format")
            if not code or source_code in source_code_set:
                continue  # Skip lines without "scratchpad_format"
            
            # Process the code
            formatted_code, assertion_mapping = process_code(code)

            if formatted_code is None or assertion_mapping is None:
                continue

            assertion_line, value = generate_assertion_line_value(assertion_mapping)
            
            if assertion_line is None or value is None:
                continue

            if formatted_code is not None and len(assertion_line)>0:
            # Create the new JSON object
                new_data = {
                    "idx" : idx,
                    "Code": formatted_code,
                    "Assertion Line": assertion_line,
                    "Assertion Value": value
                }
                source_code_set.add(source_code)
                idx += 1
                
                # Write the new JSON object to the output file
                outfile.write(json.dumps(new_data) + "\n")


# Specify file names
input_jsonl = "fine_tune_data.jsonl"
output_jsonl = "step_by_step.jsonl"

# Run the processing
process_jsonl(input_jsonl, output_jsonl)
print(f"Processing complete. Saved to {output_jsonl}")

{options={}, NAME='CAISO'}
[]
"CAISO: No generation data at 2024-04-03T22:46:21.338362 with args {'latest': True}"
{options={}, NAME='PJM'}
[]
"PJM: No load data at 2024-04-03T22:47:45.327834 with args {'latest': True}"
200
[0, 100, 100, 0, 0, 0, 0, 0, 0, 0]
[0, 200, 100, 0, 0, 0, 0, 0, 0, 0]
0.0
0.621371192237334
3280.839895013123
0.0
0.5399568034557235
['DataFlow', 'ProxyDataFlow', 'RNGDataFlow', 'DataFlowTerminated']
['get_default_sess_config', 'get_global_step_value', 'get_global_step_var', 'get_tf_version_tuple', 'collect_env_info']
['Callback', 'ProxyCallback', 'CallbackFactory']
[]
2741
0.0
0
0
0
2417760
9676699
66673
3
4
1907
6623
7
4
2417760
2741
['Initialize the global variables of TensorFlow.', '', 'Run ``sess.run(tf.global_variables_initializer())`` for TF 0.12+ or', '``sess.run(tf.initialize_all_variables())`` for TF 0.11.', '', 'Parameters', '----------', 'sess : Session', '    TensorFlow session.']
'0.1.1'
[]
{subs_=None, sub_format='srt', encoding='infer', caching=False

In [1]:
import json

output_jsonl = "fine_tune_data.jsonl"

# Open the JSONL file and print all 'code' keys
with open(output_jsonl, "r") as file:
    for line in file:
        data = json.loads(line)
        print(data["scratchpad_format"])


# <INPUT> 'testing', '/home/XXX/.tsecrets' </INPUT>
def secrets_dir(env=os.getenv('D2_ENVIRONMENT', None),
                basedir=os.getenv('D2_SECRETS_BASEDIR', None)):
    if env is not None:
        env_str = str(env) # [STATE] env_str = 'testing' [/STATE]
    else:
        cwd = os.getcwd()
        default_file = os.path.join(cwd, '.python_secrets_environment')
        if os.path.exists(default_file):
            with open(default_file, 'r') as f:
                env_str = f.read().strip()
        else:
            env_str = os.path.basename(cwd)
    if basedir is None:
        basedir = os.path.join(
                HOME,
                'secrets' if sys.platform.startswith('win') else '.secrets')
    return os.path.join(basedir, env_str)
# <OUTPUT> '/home/XXX/.tsecrets/testing' </OUTPUT>

secrets_dir('testing', '/home/XXX/.tsecrets')
# <INPUT> 'testing' </INPUT>
def _identify_environment(environment=None):
    """
    Returns the environment identifier.

    There are multiple w