In [4]:
import json

file_path = '/home/XXX/CodeSemantic/CodeSemantic/dataset/loop_after_dataset_python_quantized.jsonl'

with open(file_path, 'r') as file:
    for i, line in enumerate(file, 1):
        try:
            data = json.loads(line)
            print(f"\nEntry #{i}:")
            print("Loop Code:")
            print(data.get('loop_code', 'No code found'))
            print("\nQuestion:")
            print(data.get('question', 'No question found'))
            print("\nAnswer:")
            print(data.get('answer', 'No answer found'))
            print("-" * 50)
        except json.JSONDecodeError:
            print(f"Error decoding line {i}")


Entry #1:
Loop Code:
1: def add_active_line_prints(code):
2:     """
3:     Add print statements indicating line numbers to a python string.
4:     """
5:     # Replace newlines and comments with pass statements, so the line numbers are accurate (ast will remove them otherwise)
6:     code_lines = code.split("\n")
7:     in_multiline_string = False
8:     for i in range(len(code_lines)):
9:         line = code_lines[i]
10:         if '"""' in line or "'''" in line:
11:             in_multiline_string = not in_multiline_string
12:         if not in_multiline_string and (line.strip().startswith("#") or line == ""):
13:             whitespace = len(line) - len(line.lstrip(" "))
14:             code_lines[i] = " " * whitespace + "pass"
15:     processed_code = "\n".join(code_lines)
16:     try:
17:         tree = ast.parse(processed_code)
18:     except:
19:         # If you can't parse the processed version, try the unprocessed version before giving up
20:         tree = ast.parse(code)


In [1]:
import json

file_path = "/home/XXX/CodeSemantic/CodeSemantic/Detailed_Results/Llama-3.1-8B-Instruct/input/2/Prompt_3_CoT_no_incontext_different_quantization_no.jsonl"

with open(file_path, "r") as f:
    for line in f:
        data = json.loads(line)
        source_code = data.get("original_task", {}).get("Source Code", "")
        prompt = data.get("prompt", [])  # Get as list (default empty list)
        original_value = data.get("ground_truth", "")
        model_prediction = data.get("model_prediction", "")
        parsed_prediction = data.get("parsed_prediction", "")
        parsed_result = data.get("parsed_result", "")
        
        print("="*80)
        print("=== Source Code ===")
        print(source_code)
        
        print("\n=== Prompt ===")
        if isinstance(prompt, list):
            for msg in prompt:
                print(f"\nRole: {msg.get('role', 'unknown')}")
                content = msg.get('content', '')
                print(content)

        else:
            print(prompt) 
        
        print("\n=== Original Value ===")
        print(original_value)
        
        print("\n=== Model Prediction ===")
        print(model_prediction)
        
        print("\n=== Parsed Prediction ===")
        print(parsed_prediction)
        
        print("\n=== Parsed Result ===")
        print(parsed_result)
        
        print("\n" + "="*80 + "\n")

=== Source Code ===


=== Prompt ===

Role: user
Analyze this python function's behavior:
- How do inputs flow through the operations?
- What patterns connect inputs to outputs?
- Can we reconstruct missing I/O elements?

2 demonstration cases:

----------------------------------------
EXAMPLE 1:
Reverse-engineer this python function:
```python
def to_str(s, encoding=None, errors="strict", normalize=False):
    

    def _normalize(s):
        try:
            return unicodedata.normalize("NFC", s) if normalize else s
        except TypeError:
            return s

    if encoding is None:
        
        encoding = ("utf-8", __salt_system_encoding__)
    if not isinstance(encoding, (tuple, list)):
        encoding = (encoding,)

    if not encoding:
        raise ValueError("encoding cannot be empty")

    if isinstance(s, str):
        return _normalize(s)

    exc = None
    if isinstance(s, (bytes, bytearray)):
        for enc in encoding:
            try:
                return _

In [2]:
import json
from collections import defaultdict

# Load data
data = []
with open("Prompt_Validation_Results_new.jsonl", "r") as f:
    for line in f:
        data.append(json.loads(line))

# Initialize data structures
prompt_stats_quant_yes = defaultdict(list)
prompt_stats_quant_no = defaultdict(list)

# Process each entry
for entry in data:
    prompt = entry["Prompt"]
    quant = entry["quantization"]
    accuracy = entry["accuracy"]
    
    if quant == "yes":
        prompt_stats_quant_yes[prompt].append(accuracy)
    elif quant == "no":
        prompt_stats_quant_no[prompt].append(accuracy)

# Calculate average accuracy for each prompt
def calculate_averages(stats):
    return {prompt: sum(accs)/len(accs) for prompt, accs in stats.items()}

avg_quant_yes = calculate_averages(prompt_stats_quant_yes)
avg_quant_no = calculate_averages(prompt_stats_quant_no)

# Find best prompts
best_prompt_quant_yes = max(avg_quant_yes.items(), key=lambda x: x[1])
best_prompt_quant_no = max(avg_quant_no.items(), key=lambda x: x[1])

print("Quantization YES results:")
print(f"Best prompt: {best_prompt_quant_yes[0]} with average accuracy {best_prompt_quant_yes[1]:.4f}")
print("All prompts averages:", {k: round(v, 4) for k, v in avg_quant_yes.items()})

print("\nQuantization NO results:")
print(f"Best prompt: {best_prompt_quant_no[0]} with average accuracy {best_prompt_quant_no[1]:.4f}")
print("All prompts averages:", {k: round(v, 4) for k, v in avg_quant_no.items()})

Quantization YES results:
Best prompt: 2 with average accuracy 0.5770
All prompts averages: {0: 0.5341, 1: 0.5581, 2: 0.577, 3: 0.4165, 4: 0.5429}

Quantization NO results:
Best prompt: 1 with average accuracy 0.4659
All prompts averages: {0: 0.4562, 1: 0.4659, 2: 0.4479, 3: 0.1554, 4: 0.3515}


In [1]:
import json
from collections import defaultdict

def analyze_duplicates(file_path):
    # Dictionary to store source codes and their counts
    source_code_counts = defaultdict(int)
    # Dictionary to store full entries by source code
    entries_by_source = defaultdict(list)
    
    # Read the JSONL file
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                entry = json.loads(line)
                source_code = entry.get("Source Code", "")
                source_code_counts[source_code] += 1
                entries_by_source[source_code].append(entry)
            except json.JSONDecodeError:
                print(f"Skipping malformed line: {line}")
    
    # Calculate statistics
    total_entries = sum(source_code_counts.values())
    unique_source_codes = len(source_code_counts)
    duplicate_source_codes = {k: v for k, v in source_code_counts.items() if v > 1}
    num_duplicate_source_codes = len(duplicate_source_codes)
    
    print(f"Total entries: {total_entries}")
    print(f"Unique source codes: {unique_source_codes}")
    print(f"Source codes with duplicates: {num_duplicate_source_codes}")
    print(f"Percentage with duplicates: {num_duplicate_source_codes/unique_source_codes*100:.2f}%")
    
    # Print some examples of duplicates
    print("\nExample duplicate source codes:")
    for i, (source_code, count) in enumerate(list(duplicate_source_codes.items())[:5]):
        print(f"\nSource Code {i+1} (appears {count} times):")
        print(source_code[:200] + ("..." if len(source_code) > 200 else ""))
        print("\nExample entries with this source code:")
        for j, entry in enumerate(entries_by_source[source_code][:2]):
            print(f"  Entry {j+1} IDX: {entry.get('idx', 'N/A')}")
            print(f"  Input: {entry.get('Function Input', {}).get('client_name', 'N/A')}")
    
    return entries_by_source

# Run the analysis
file_path = "/home/XXX/CodeSemantic/CodeSemantic/dataset/statement_prediction_dataset_python_quantized.jsonl"
entries_by_source = analyze_duplicates(file_path)

Total entries: 545
Unique source codes: 346
Source codes with duplicates: 143
Percentage with duplicates: 41.33%

Example duplicate source codes:

Source Code 1 (appears 2 times):
def client_factory(client_name, **kwargs):
    """Return a client for an external data set"""
    # set up
    dir_name = os.path.dirname(os.path.abspath(__file__))
    error_msg = 'No client found fo...

Example entries with this source code:
  Entry 1 IDX: 1
  Input: 'MISO'
  Entry 2 IDX: 406
  Input: 'MISO'

Source Code 2 (appears 4 times):
def xldate_as_tuple(xldate, datemode):
    if datemode not in (0, 1):
        raise XLDateBadDatemode(datemode)
    if xldate == 0.00:
        return (0, 0, 0, 0, 0, 0)
    if xldate < 0.00:
        r...

Example entries with this source code:
  Entry 1 IDX: 3
  Input: N/A
  Entry 2 IDX: 75
  Input: N/A

Source Code 3 (appears 2 times):
def xldate_from_date_tuple(date_tuple, datemode):
    """Create an excel date from a tuple of (year, month, day)"""
    year, month, da

In [1]:
import json
import statistics

# Load the JSONL file
code_lengths = []
with open('/home/XXX/CodeSemantic/CodeSemantic/dataset/input_output_dataset_python_quantized.jsonl', 'r') as f:
    for line in f:
        data = json.loads(line)
        code_lengths.append(data['code_length'])

# Calculate quartiles
q1 = statistics.quantiles(code_lengths, n=4)[0]  # 25th percentile
q2 = statistics.quantiles(code_lengths, n=4)[1]  # 50th percentile (median)
q3 = statistics.quantiles(code_lengths, n=4)[2]  # 75th percentile

print(f"Q1 (25th percentile): {q1}")
print(f"Q2 (Median): {q2}")
print(f"Q3 (75th percentile): {q3}")

Q1 (25th percentile): 6.0
Q2 (Median): 9.0
Q3 (75th percentile): 17.0


In [2]:
import json
import random
from collections import defaultdict

input_path = "/home/XXX/CodeSemantic/CodeSemantic/dataset/statement_prediction_dataset_C.jsonl"
output_path = "/home/XXX/CodeSemantic/CodeSemantic/dataset/statement_prediction_dataset_c_200.jsonl"

# Load data
data = []
with open(input_path, 'r') as f:
    for line in f:
        data.append(json.loads(line.strip()))

# Group by 'Statement Type'
grouped = defaultdict(list)
for item in data:
    stmt_type = item.get("Statement Type")
    if stmt_type:
        grouped[stmt_type].append(item)

# Sample 40 from each Statement Type
sampled_data = []
for stmt_type, items in grouped.items():
    sample_size = min(40, len(items))
    sampled_items = random.sample(items, sample_size)
    sampled_data.extend(sampled_items)
    print(f"Statement Type: {stmt_type} | Available: {len(items)} | Sampled: {sample_size}")

# Save to output JSONL file
with open(output_path, 'w') as f:
    for item in sampled_data:
        f.write(json.dumps(item) + '\n')

print(f"\nSaved sampled dataset to {output_path}")


Statement Type: Constant Assignment | Available: 104 | Sampled: 40
Statement Type: Assignment | Available: 119 | Sampled: 40
Statement Type: Arithmetic Assignment | Available: 24 | Sampled: 24
Statement Type: Branch | Available: 169 | Sampled: 40
Statement Type: Function Call | Available: 82 | Sampled: 40

Saved sampled dataset to /home/XXX/CodeSemantic/CodeSemantic/dataset/statement_prediction_dataset_c_200.jsonl


In [None]:
import json
import random
from collections import defaultdict

input_path = "/home/XXX/CodeSemantic/CodeSemantic/dataset/statement_prediction_dataset_C.jsonl"
output_path = "/home/XXX/CodeSemantic/CodeSemantic/dataset/statement_prediction_dataset_c_200.jsonl"

# Load data
data = []
with open(input_path, 'r') as f:
    for line in f:
        data.append(json.loads(line.strip()))

# Group by 'Statement Type'
grouped = defaultdict(list)
for item in data:
    stmt_type = item.get("Statement Type")
    if stmt_type:
        grouped[stmt_type].append(item)

# Sample 40 from each Statement Type
sampled_data = []
for stmt_type, items in grouped.items():
    sample_size = min(40, len(items))
    sampled_items = random.sample(items, sample_size)
    sampled_data.extend(sampled_items)
    print(f"Statement Type: {stmt_type} | Available: {len(items)} | Sampled: {sample_size}")

# Save to output JSONL file
with open(output_path, 'w') as f:
    for item in sampled_data:
        f.write(json.dumps(item) + '\n')

print(f"\nSaved sampled dataset to {output_path}")



Id: 0
Loop Code:
1: def find(lst, key, value):
2:     for i, dic in enumerate(lst):
3:         if dic[key] == value:
4:             return i
5:     return None
6:
7: find([{'Variable': 'jenkins_admin_password', 'Type': 'password'}, {'Variable': 'ca_rootca_password', 'Type': 'password'}], 'Variable', 'something_not_there')
Question: What is the value of ' i ' in line '2' after '1' th iteration when 'find([{'Variable': 'jenkins_admin_password', 'Type': 'password'}, {'Variable': 'ca_rootca_password', 'Type': 'password'}], 'Variable', 'something_not_there')' is executed?
Model Prediction: <ans>0</ans>

In the first iteration of the loop, `enumerate(lst)` returns a tuple containing a count (from the start which defaults to 0) and the values obtained from iterating over the sequence (`lst`). Therefore, after the first iteration, `i` will be 0 and `dic` will be `{'Variable': 'jenkins_admin_password', 'Type': 'password'}`.
Parsed Prediction: 0
Parsed Result: False
-----------------------------

In [2]:
import json
import random
from collections import defaultdict

input_path = "/home/XXX/CodeSemantic/CodeSemantic/dataset/block_analysis_c.jsonl"
output_path = "/home/XXX/CodeSemantic/CodeSemantic/dataset/block_analysis_c_200.jsonl"

# Load data
data = []
with open(input_path, 'r') as f:
    for line in f:
        data.append(json.loads(line.strip()))

# Group by 'Block_Size'
grouped = defaultdict(list)
for item in data:
    block_size = item.get("Block_Size")
    if block_size is not None:
        grouped[block_size].append(item)

# Sample 20 from each Block_Size (1 to 10)
sampled_data = []
for block_size in range(1, 11):
    items = grouped.get(block_size, [])
    sample_size = min(20, len(items))
    if sample_size > 0:
        sampled_items = random.sample(items, sample_size)
        sampled_data.extend(sampled_items)
        print(f"Block_Size: {block_size} | Available: {len(items)} | Sampled: {sample_size}")
    else:
        print(f"Block_Size: {block_size} | Available: 0 | Sampled: 0")

# Save to output JSONL file
with open(output_path, 'w') as f:
    for item in sampled_data:
        f.write(json.dumps(item) + '\n')

print(f"\nSaved sampled dataset to {output_path}")

Block_Size: 1 | Available: 26 | Sampled: 20
Block_Size: 2 | Available: 25 | Sampled: 20
Block_Size: 3 | Available: 30 | Sampled: 20
Block_Size: 4 | Available: 32 | Sampled: 20
Block_Size: 5 | Available: 25 | Sampled: 20
Block_Size: 6 | Available: 28 | Sampled: 20
Block_Size: 7 | Available: 26 | Sampled: 20
Block_Size: 8 | Available: 27 | Sampled: 20
Block_Size: 9 | Available: 30 | Sampled: 20
Block_Size: 10 | Available: 25 | Sampled: 20

Saved sampled dataset to /home/XXX/CodeSemantic/CodeSemantic/dataset/block_analysis_c_200.jsonl
