In [2]:
import json
import jsonlines
from ast import literal_eval

def int_value_mapping():
    """Return the rule table used to bucket integers.

    Each key is a textual condition over ``value`` and each value is the
    category label for integers that satisfy that condition. A new dict is
    built on every call.
    """
    rules = {}
    rules["value == 0"] = "zero"
    rules["0 < value <= 10"] = "Positive Regular"
    rules["value > 10"] = "Positive Large"
    rules["-10 <= value < 0"] = "Negative Regular"
    rules["value < -10"] = "Negative Large"
    return rules


def float_value_mapping():
    """Return the rule table used to bucket floats.

    Each key is a textual condition over ``value`` and each value is the
    category label for floats that satisfy that condition. A new dict is
    built on every call.
    """
    # Pairs are listed in the order they should appear in the resulting dict.
    condition_label_pairs = [
        ("value == 0.0", "zero"),
        ("0.0 < value <= 1.0", "Positive Small"),
        ("1.0 < value <= 10.0", "Positive Regular"),
        ("value > 10.0", "Positive Large"),
        ("-1.0 <= value < 0.0", "Negative Small"),
        ("-10.0 <= value < -1.0", "Negative Regular"),
        ("value < -10.0", "Negative Large"),
    ]
    return dict(condition_label_pairs)

def str_value_mapping():
    """Return the rule table used to bucket strings.

    Keys are textual conditions over ``value``; values are the category
    labels (empty, alphabetic, numeric or mixed). A new dict is built on
    every call.
    """
    rules = {"len(value) == 0": "Empty String"}
    rules["len(value) > 0 and value.isalpha()"] = "Alphabetic String"
    rules["len(value) > 0 and value.isdigit()"] = "Numeric String"
    rules["len(value) > 0 and not (value.isalpha() or value.isdigit())"] = "Mixed String"
    return rules

def list_value_mapping():
    """Return the rule table used to bucket lists (empty vs. non-empty).

    A new dict is built on every call.
    """
    return dict([
        ("len(value) == 0", "Empty List"),
        ("len(value) > 0", "Non-Empty List"),
    ])

def tuple_value_mapping():
    """Return the rule table used to bucket tuples (empty vs. non-empty).

    A new dict is built on every call.
    """
    rules = {}
    rules["len(value) == 0"] = "Empty tuple"
    rules["len(value) > 0"] = "Non-Empty tuple"
    return rules

def dict_value_mapping():
    """Return the rule table used to bucket dictionaries (empty vs. non-empty).

    A new dict is built on every call.
    """
    conditions = ("len(value) == 0", "len(value) > 0")
    labels = ("Empty dictionary", "Non-Empty dictionary")
    return dict(zip(conditions, labels))

def set_value_mapping():
    """Return the rule table used to bucket sets (empty vs. non-empty).

    A new dict is built on every call.
    """
    rules = {"len(value) == 0": "Empty set"}
    rules["len(value) > 0"] = "Non-Empty set"
    return rules

def bool_value_mapping():
    """Return the rule table used to bucket booleans.

    A new dict is built on every call.
    """
    return dict([
        ("value == True", "True"),
        ("value == False", "False"),
    ])

def none_value_mapping():
    """Return the single-rule table used for ``None`` values.

    A new dict is built on every call.
    """
    rules = {}
    rules["value is None"] = "None"
    return rules

def quantize_value(value):
    """Quantize a value and return both the quantized category and the mapping rules.

    Args:
        value: Any Python object. ``bool``, ``int``, ``float``, ``str``,
            ``list``, ``tuple``, ``dict``, ``set`` and ``None`` have dedicated
            categories.

    Returns:
        A ``(category, mapping)`` tuple. ``category`` is the label string for
        ``value``; ``mapping`` is the rule table for the value's type
        (condition text -> label), as produced by the ``*_value_mapping``
        functions. Unsupported types yield ``"Unknown Type"`` together with a
        placeholder mapping.
    """
    # bool must be tested before int: bool is a subclass of int, so an
    # int-first check would label True as "Positive Regular" and False as
    # zero, and the bool rules would never be used.
    if isinstance(value, bool):
        concrete_quantize_mapping = bool_value_mapping()
        return ("True" if value else "False"), concrete_quantize_mapping

    if isinstance(value, int):
        concrete_quantize_mapping = int_value_mapping()
        if value == 0:
            # Spelled exactly as in int_value_mapping() (and as in the float
            # branch) so the returned label can be found in the mapping.
            return "zero", concrete_quantize_mapping
        if 0 < value <= 10:
            return "Positive Regular", concrete_quantize_mapping
        if value > 10:
            return "Positive Large", concrete_quantize_mapping
        if -10 <= value < 0:
            return "Negative Regular", concrete_quantize_mapping
        return "Negative Large", concrete_quantize_mapping

    if isinstance(value, float):
        concrete_quantize_mapping = float_value_mapping()
        if value == 0.0:
            return "zero", concrete_quantize_mapping
        if 0.0 < value <= 1.0:
            return "Positive Small", concrete_quantize_mapping
        if 1.0 < value <= 10.0:
            return "Positive Regular", concrete_quantize_mapping
        if value > 10.0:
            return "Positive Large", concrete_quantize_mapping
        if -1.0 <= value < 0.0:
            return "Negative Small", concrete_quantize_mapping
        if -10.0 <= value < -1.0:
            return "Negative Regular", concrete_quantize_mapping
        # NOTE(review): NaN fails every comparison above and therefore also
        # lands here; confirm whether that is the intended label for NaN.
        return "Negative Large", concrete_quantize_mapping

    if isinstance(value, str):
        concrete_quantize_mapping = str_value_mapping()
        if len(value) == 0:
            return "Empty String", concrete_quantize_mapping
        if value.isalpha():
            return "Alphabetic String", concrete_quantize_mapping
        if value.isdigit():
            return "Numeric String", concrete_quantize_mapping
        return "Mixed String", concrete_quantize_mapping

    if isinstance(value, list):
        concrete_quantize_mapping = list_value_mapping()
        return ("Empty List" if len(value) == 0 else "Non-Empty List"), concrete_quantize_mapping

    if isinstance(value, tuple):
        concrete_quantize_mapping = tuple_value_mapping()
        return ("Empty tuple" if len(value) == 0 else "Non-Empty tuple"), concrete_quantize_mapping

    if isinstance(value, dict):
        concrete_quantize_mapping = dict_value_mapping()
        return ("Empty dictionary" if len(value) == 0 else "Non-Empty dictionary"), concrete_quantize_mapping

    if isinstance(value, set):
        concrete_quantize_mapping = set_value_mapping()
        return ("Empty set" if len(value) == 0 else "Non-Empty set"), concrete_quantize_mapping

    if value is None:
        concrete_quantize_mapping = none_value_mapping()
        return "None", concrete_quantize_mapping

    return "Unknown Type", {"Unknown Type": "No mapping available"}

# RQ1

In [None]:
def process_entry(entry):
    """Add quantization fields to a single dataset entry.

    Reads ``entry["Value After Statement Execution"]``, tries to parse it as a
    Python literal and quantizes the result with ``quantize_value``. When the
    value cannot be parsed as a literal, the raw value is quantized instead.

    Args:
        entry: Mapping holding the key "Value After Statement Execution".
            It is modified in place.

    Returns:
        The same ``entry`` object with the keys "quantized value" and
        "mapping_info" added.

    Raises:
        KeyError: If "Value After Statement Execution" is missing.
    """
    raw_value = entry["Value After Statement Execution"]

    try:
        evaluated_value = literal_eval(raw_value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed
        # input (e.g. "{[1]: 2}" raises TypeError). Fall back to the raw
        # value so one odd record does not abort the whole run.
        evaluated_value = raw_value

    quantized, concrete_quantize_mapping = quantize_value(evaluated_value)

    entry["quantized value"] = quantized
    entry["mapping_info"] = concrete_quantize_mapping

    return entry

def process_dataset(input_file, output_file):
    """Quantize every entry of a JSON Lines dataset.

    Each record read from ``input_file`` is passed through ``process_entry``
    and the result is written to ``output_file``.

    Args:
        input_file: Path of the JSON Lines file to read.
        output_file: Path of the JSON Lines file to write (opened with
            mode 'w').
    """
    # The trailing backslash is required: without it the two-line ``with``
    # header is a SyntaxError.
    with jsonlines.open(input_file) as reader, \
         jsonlines.open(output_file, mode='w') as writer:

        for entry in reader:
            processed_entry = process_entry(entry)
            writer.write(processed_entry)

if __name__ == "__main__":
    # Entry point: quantize the statement-prediction dataset file.
    input_filename, output_filename = (
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/statement_prediction_dataset.jsonl",
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/statement_prediction_dataset_python_quantized.jsonl",
    )

    print(f"Processing {input_filename}...")
    process_dataset(input_file=input_filename, output_file=output_filename)
    print(f"Quantized dataset saved to {output_filename}")

# RQ3

In [5]:
# Loop

def process_loop_entry(entry):
    """Add quantization fields to one loop-iteration entry.

    ``entry["answer"]`` is converted with ``int()``. If the conversion fails,
    a message is printed and ``None`` is returned so the caller can skip the
    record. Otherwise the integer is quantized with ``quantize_value`` and
    the entry is updated in place with "quantized value" and "mapping_info";
    the stored mapping omits the rules that describe negative values.
    """
    try:
        parsed_answer = int(entry["answer"])
    except (ValueError, TypeError):
        # Non-integer answers are reported and dropped.
        print("Value is not integer")
        return None

    label, rules = quantize_value(parsed_answer)

    # Keep only the rules that do not refer to negative values.
    non_negative_rules = {}
    for condition, category in rules.items():
        if "Negative" in category or "value < 0" in condition:
            continue
        non_negative_rules[condition] = category

    entry["quantized value"] = label
    entry["mapping_info"] = non_negative_rules

    return entry

def process_loop_dataset(input_file, output_file):
    """Quantize a loop-iteration JSON Lines dataset.

    Every record from ``input_file`` goes through ``process_loop_entry``;
    records for which it returns ``None`` are skipped, all others are written
    to ``output_file``.
    """
    with jsonlines.open(input_file) as reader:
        with jsonlines.open(output_file, mode='w') as writer:
            for record in reader:
                result = process_loop_entry(record)
                if result is None:
                    # Entry had no integer answer; leave it out of the output.
                    continue
                writer.write(result)

if __name__ == "__main__":
    # Entry point: quantize the loop-iteration dataset file.
    input_filename, output_filename = (
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/loop_iteration_dataset_python.jsonl",
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/loop_iteration_dataset_python_quantized.jsonl",
    )

    print(f"Processing {input_filename}...")
    process_loop_dataset(input_file=input_filename, output_file=output_filename)
    print(f"Quantized dataset saved to {output_filename}")

Processing /home/XXX/CodeSemantic/CodeSemantic/dataset/loop_iteration_dataset_python.jsonl...
Value is not integer
Value is not integer
Quantized dataset saved to /home/XXX/CodeSemantic/CodeSemantic/dataset/loop_iteration_dataset_python_quantized.jsonl


In [13]:
def process_entry(entry):
    """Add quantization fields to a single dataset entry.

    Reads ``entry["answer"]``, tries to parse it as a Python literal and
    quantizes the result with ``quantize_value``. When the value cannot be
    parsed as a literal, the raw value is quantized instead.

    Args:
        entry: Mapping holding the key "answer". It is modified in place.

    Returns:
        The same ``entry`` object with the keys "quantized value" and
        "mapping_info" added.

    Raises:
        KeyError: If "answer" is missing.
    """
    raw_value = entry["answer"]

    try:
        evaluated_value = literal_eval(raw_value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed
        # input (e.g. "{[1]: 2}" raises TypeError). Fall back to the raw
        # value so one odd record does not abort the whole run.
        evaluated_value = raw_value

    quantized, concrete_quantize_mapping = quantize_value(evaluated_value)

    entry["quantized value"] = quantized
    entry["mapping_info"] = concrete_quantize_mapping

    return entry

def process_dataset(input_file, output_file):
    """Quantize every record of a JSON Lines dataset.

    Records are read from ``input_file``, passed through ``process_entry``
    and written, in the same order, to ``output_file``.
    """
    with jsonlines.open(input_file) as reader:
        with jsonlines.open(output_file, mode='w') as writer:
            for record in reader:
                writer.write(process_entry(record))

if __name__ == "__main__":
    # Entry point: quantize the loop-after dataset file.
    input_filename, output_filename = (
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/loop_after_dataset_python.jsonl",
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/loop_after_dataset_python_quantized.jsonl",
    )

    print(f"Processing {input_filename}...")
    process_dataset(input_file=input_filename, output_file=output_filename)
    print(f"Quantized dataset saved to {output_filename}")

Processing /home/XXX/CodeSemantic/CodeSemantic/dataset/loop_after_dataset_python.jsonl...
Quantized dataset saved to /home/XXX/CodeSemantic/CodeSemantic/dataset/loop_after_dataset_python_quantized.jsonl


In [None]:
#input/output
def process_io_entry(entry):
    """Add quantization fields for both the input and the output of an entry.

    ``entry["input"]`` and ``entry["output"]`` are each parsed as a Python
    literal when possible (otherwise the raw value is kept) and quantized
    with ``quantize_value``.

    Args:
        entry: Mapping holding the keys "input" and "output". It is modified
            in place.

    Returns:
        The same ``entry`` object with the keys "quantized_value_input",
        "quantized_value_output", "input_mapping_info" and
        "output_mapping_info" added.

    Raises:
        KeyError: If "input" or "output" is missing.
    """
    results = []
    for field in ("input", "output"):
        raw_value = entry[field]
        try:
            evaluated = literal_eval(raw_value)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # literal_eval is documented to raise any of these on malformed
            # input; fall back to the raw value instead of aborting the run.
            evaluated = raw_value
        results.append(quantize_value(evaluated))

    (quantized_input, input_mapping), (quantized_output, output_mapping) = results

    entry["quantized_value_input"] = quantized_input
    entry["quantized_value_output"] = quantized_output
    entry["input_mapping_info"] = input_mapping
    entry["output_mapping_info"] = output_mapping
    return entry

def process_io_dataset(input_file, output_file):
    """Quantize every record of an input/output JSON Lines dataset.

    Records are read from ``input_file``, passed through ``process_io_entry``
    and written, in the same order, to ``output_file``.
    """
    with jsonlines.open(input_file) as reader:
        with jsonlines.open(output_file, mode='w') as writer:
            for record in reader:
                writer.write(process_io_entry(record))

if __name__ == "__main__":
    # Entry point: quantize the input/output dataset file.
    input_filename, output_filename = (
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/input_output_dataset_python.jsonl",
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/input_output_dataset_python_quantized.jsonl",
    )

    print(f"Processing {input_filename}...")
    process_io_dataset(input_file=input_filename, output_file=output_filename)
    print(f"Quantized dataset saved to {output_filename}")

# RQ2

In [None]:
def process_io_entry(entry):
    """Add quantization fields for both the input and the output of an entry.

    ``entry["input"]`` and ``entry["output"]`` are each parsed as a Python
    literal when possible (otherwise the raw value is kept) and quantized
    with ``quantize_value``.

    Args:
        entry: Mapping holding the keys "input" and "output". It is modified
            in place.

    Returns:
        The same ``entry`` object with the keys "quantized_value_input",
        "quantized_value_output", "input_mapping_info" and
        "output_mapping_info" added.

    Raises:
        KeyError: If "input" or "output" is missing.
    """

    def _parse_literal(raw_value):
        # Parse a Python literal, returning the raw value when parsing fails.
        # literal_eval is documented to raise all of the exceptions below on
        # malformed input, so each one is treated as "not a literal".
        try:
            return literal_eval(raw_value)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return raw_value

    # Process input
    quantized_input, input_mapping = quantize_value(_parse_literal(entry["input"]))

    # Process output
    quantized_output, output_mapping = quantize_value(_parse_literal(entry["output"]))

    entry["quantized_value_input"] = quantized_input
    entry["quantized_value_output"] = quantized_output
    entry["input_mapping_info"] = input_mapping
    entry["output_mapping_info"] = output_mapping
    return entry

def process_io_dataset(input_file, output_file):
    """Run ``process_io_entry`` over a JSON Lines file.

    Each record of ``input_file`` is processed and the result is written to
    ``output_file``, one record at a time.
    """
    with jsonlines.open(input_file) as source:
        with jsonlines.open(output_file, mode='w') as sink:
            for item in source:
                quantized_item = process_io_entry(item)
                sink.write(quantized_item)

if __name__ == "__main__":
    # Entry point: quantize the input/output dataset file.
    input_filename, output_filename = (
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/input_output_dataset_python.jsonl",
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/input_output_dataset_python_quantized.jsonl",
    )

    print(f"Processing {input_filename}...")
    process_io_dataset(input_file=input_filename, output_file=output_filename)
    print(f"Quantized dataset saved to {output_filename}")

In [2]:
def process_entry(entry):
    """Add quantization fields to a single dataset entry.

    Reads ``entry["Value After Statement Execution"]``, tries to parse it as a
    Python literal and quantizes the result with ``quantize_value``. When the
    value cannot be parsed as a literal, the raw value is quantized instead.

    Args:
        entry: Mapping holding the key "Value After Statement Execution".
            It is modified in place.

    Returns:
        The same ``entry`` object with the keys "quantized value" and
        "mapping_info" added.

    Raises:
        KeyError: If "Value After Statement Execution" is missing.
    """
    raw_value = entry["Value After Statement Execution"]

    try:
        evaluated_value = literal_eval(raw_value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed
        # input (e.g. "{[1]: 2}" raises TypeError). Fall back to the raw
        # value so one odd record does not abort the whole run.
        evaluated_value = raw_value

    quantized, concrete_quantize_mapping = quantize_value(evaluated_value)

    entry["quantized value"] = quantized
    entry["mapping_info"] = concrete_quantize_mapping

    return entry

def process_dataset(input_file, output_file):
    """Run ``process_entry`` over a JSON Lines file.

    Each record of ``input_file`` is processed and the result is written to
    ``output_file``, one record at a time.
    """
    with jsonlines.open(input_file) as source:
        with jsonlines.open(output_file, mode='w') as sink:
            for item in source:
                quantized_item = process_entry(item)
                sink.write(quantized_item)

if __name__ == "__main__":
    # Entry point: quantize the incremental statement-prediction dataset file.
    input_filename, output_filename = (
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/incremental_statement_prediction_python_10.jsonl",
        "/home/XXX/CodeSemantic/CodeSemantic/dataset/incremental_statement_prediction_python_10_quantized.jsonl",
    )

    print(f"Processing {input_filename}...")
    process_dataset(input_file=input_filename, output_file=output_filename)
    print(f"Quantized dataset saved to {output_filename}")

Processing /home/XXX/CodeSemantic/CodeSemantic/dataset/incremental_statement_prediction_python_10.jsonl...
Quantized dataset saved to /home/XXX/CodeSemantic/CodeSemantic/dataset/incremental_statement_prediction_python_10_quantized.jsonl
