In [None]:
import tree_sitter
from tree_sitter_languages import get_parser

def extract_variables_from_line(line):
    """Collect identifier names that look like variable uses in a C snippet.

    The text is parsed with the tree-sitter C grammar (a parser is obtained
    on every call). An identifier is reported only when its direct parent
    node is a binary expression, an argument list, an assignment expression
    or a declaration; identifiers in any other position are ignored.

    Args:
        line: C source text to analyse (str).

    Returns:
        A set of identifier names (str).
    """
    # Parent node types under which an identifier counts as a variable.
    variable_contexts = (
        'binary_expression',
        'argument_list',
        'assignment_expression',
        'declaration',
    )

    syntax_tree = get_parser("c").parse(line.encode())
    found = set()

    # Walk the whole tree with an explicit stack; visiting order is
    # irrelevant because results are accumulated in a set.
    pending = [syntax_tree.root_node]
    while pending:
        current = pending.pop()
        if current.type == "identifier":
            enclosing = current.parent
            if enclosing and enclosing.type in variable_contexts:
                found.add(current.text.decode())
        pending.extend(current.children)

    return found

# Example usage
# Runs the extractor on a single C `if` condition (not a complete statement).
# The recorded cell output lists 'bytes', 'actual_count' and 'out'; the called
# function's name is absent, presumably because its parent node is a call
# expression rather than one of the accepted parent types -- TODO confirm.
line = "if (bytes == NULL || k5_utf16le_to_utf8(bytes, actual_count * 2, out) != 0)"
variables = extract_variables_from_line(line)
print(variables)

{'bytes', 'actual_count', 'out'}


In [1]:
import json

def print_dataset_info(filepath):
    """Pretty-print every record of a JSON Lines aliasing dataset to stdout.

    For each line that decodes to a JSON object this prints the project
    name, index, language, the source code with 1-based line numbers, the
    selected and compared statements/pointers, the aliasing label and the
    per-variable function input. Missing fields are shown as 'N/A'.

    Lines that are not valid JSON, or that decode to something other than
    a JSON object, are reported with an "Error parsing line" message and
    skipped. Fields that are present but null (or of an unexpected type)
    are treated as missing instead of raising AttributeError.

    Args:
        filepath: Path of the .jsonl file to read (decoded as UTF-8).

    Returns:
        None. All output goes to stdout.
    """
    separator = "=" * 80

    # Explicit encoding so the result does not depend on the platform locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            # Only json.loads can raise JSONDecodeError, so keep the try tight.
            try:
                data = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")
                continue

            if not isinstance(data, dict):
                print(
                    "Error parsing line: expected a JSON object, "
                    f"got {type(data).__name__}"
                )
                continue

            # Print basic information
            program_info = data.get('Program Information')
            if not isinstance(program_info, dict):
                program_info = {}  # null / malformed -> behave as if absent
            print(separator)
            print(f"Project: {program_info.get('Project Name', 'N/A')}")
            print(f"Index: {data.get('idx', 'N/A')}")
            print(f"Programming Language: {data.get('Programming Language', 'N/A')}")

            # Print source code with line numbers
            print("\nSource Code:")
            source_code = data.get('Source Code') or ''
            # A distinct name avoids shadowing the file-iteration variable.
            for i, code_line in enumerate(source_code.split('\n'), 1):
                print(f"{i:4d} | {code_line}")

            # Print selected and compared statements
            print("\nSelected Statement:")
            print(data.get('Selected Statement', 'N/A'))
            print(f"Selected Pointer: {data.get('Selected Pointer', 'N/A')}")

            print("\nCompared Statement:")
            print(data.get('Compared Statement', 'N/A'))
            print(f"Compared Pointer: {data.get('Compared Pointer', 'N/A')}")
            print(f"Aliasing: {data.get('Aliasing', 'N/A')}")

            # Print function input
            print("\nFunction Input:")
            function_input = data.get('Function Input')
            if isinstance(function_input, dict):
                for var, details in function_input.items():
                    print(f"  {var}:")
                    if isinstance(details, dict):
                        for k, v in details.items():
                            print(f"    {k}: {v}")
                    else:
                        # Scalar / list detail: show it verbatim.
                        print(f"    {details}")
            elif function_input:
                # Unexpected non-mapping payload: show it rather than crash.
                print(f"  {function_input}")

            print(separator)
            print("\n")

# Path to your dataset
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this exact directory exists; consider a configurable base directory.
dataset_path = "/home/XXX/Tracing/Annotation/C Benchmark Collection/RQ-3/aliasing_dataset_new.jsonl"

# Call the function
# Prints every record of the file to the cell output.
print_dataset_info(dataset_path)

Project: tarantool
Index: N/A
Programming Language: C

Source Code:
   1 | static void decStatus(decNumber *dn, uInt status, decContext *set) {
   2 |   if (status & DEC_NaNs) {              
   3 |     
   4 |     if (status & DEC_sNaN) status&=~DEC_sNaN;
   5 |      else {
   6 |       decNumberZero(dn);                
   7 |       dn->bits=DECNAN;                  
   8 |       }
   9 |     }
  10 |   decContextSetStatus(set, status);     
  11 |   return;
  12 |   }

Selected Statement:
  decContextSetStatus(set, status);     // [may not return]

Selected Pointer: set

Compared Statement:
      decNumberZero(dn);                // other error: clean throughout

Compared Pointer: dn
Aliasing: No

Function Input:
  dn:
    type_category: pointer
    concrete_type: POINTER
    address: 0x7fffffffdb88
    value: {'type_category': 'struct', 'concrete_type': 'STRUCT', 'value': {'digits': {'type_category': 'unknown', 'concrete_type': 'int32_t', 'value': '1'}, 'exponent': {'type_category'

In [1]:
import json

input_file = "aliasing_dataset_new.jsonl"
output_file = "aliasing_dataset_with_idx.jsonl"

# Copy the JSON Lines dataset, putting an "idx" key first in every record.
# idx is the 0-based position of the line in the input file, so skipped
# lines leave gaps in the numbering of the output.
# Explicit UTF-8 keeps reading and writing independent of the platform locale.
with open(input_file, 'r', encoding='utf-8') as infile, \
        open(output_file, 'w', encoding='utf-8') as outfile:
    for idx, line in enumerate(infile):
        try:
            # Load JSON entry
            entry = json.loads(line)
        except json.JSONDecodeError:
            print(f"Skipping invalid JSON at line {idx+1}")
            continue

        # Valid JSON that is not an object (list, number, ...) cannot carry
        # an "idx" key; skip it instead of failing on the ** unpacking below.
        if not isinstance(entry, dict):
            print(f"Skipping non-object JSON at line {idx+1}")
            continue

        # Drop any pre-existing "idx" so the freshly assigned value wins;
        # with plain {"idx": idx, **entry} the old value would override it.
        entry.pop("idx", None)

        # Add idx at the beginning of the entry
        entry = {"idx": idx, **entry}

        # Write modified entry to new file
        outfile.write(json.dumps(entry) + '\n')

print(f"Indexed dataset saved to {output_file}")

Indexed dataset saved to aliasing_dataset_with_idx.jsonl


In [4]:
import json

def print_dataset_info(filepath):
    """Pretty-print every record of a statement-analysis JSON Lines file.

    This redefinition replaces the earlier function of the same name in
    this notebook; unlike that one it does not print the compared
    statement, compared pointer or aliasing label.

    For each line that decodes to a JSON object this prints the project
    name, index, language, the source code with 1-based line numbers, the
    selected statement and pointer, and the per-variable function input.
    Missing fields are shown as 'N/A'.

    Lines that are not valid JSON, or that decode to something other than
    a JSON object, are reported with an "Error parsing line" message and
    skipped. Fields that are present but null (or of an unexpected type)
    are treated as missing instead of raising AttributeError.

    Args:
        filepath: Path of the .jsonl file to read (decoded as UTF-8).

    Returns:
        None. All output goes to stdout.
    """
    separator = "=" * 80

    # Explicit encoding so the result does not depend on the platform locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            # Only json.loads can raise JSONDecodeError, so keep the try tight.
            try:
                data = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")
                continue

            if not isinstance(data, dict):
                print(
                    "Error parsing line: expected a JSON object, "
                    f"got {type(data).__name__}"
                )
                continue

            # Print basic information
            program_info = data.get('Program Information')
            if not isinstance(program_info, dict):
                program_info = {}  # null / malformed -> behave as if absent
            print(separator)
            print(f"Project: {program_info.get('Project Name', 'N/A')}")
            print(f"Index: {data.get('idx', 'N/A')}")
            print(f"Programming Language: {data.get('Programming Language', 'N/A')}")

            # Print source code with line numbers
            print("\nSource Code:")
            source_code = data.get('Source Code') or ''
            # A distinct name avoids shadowing the file-iteration variable.
            for i, code_line in enumerate(source_code.split('\n'), 1):
                print(f"{i:4d} | {code_line}")

            # Print selected statement
            print("\nSelected Statement:")
            print(data.get('Selected Statement', 'N/A'))
            print(f"Selected Pointer: {data.get('Selected Pointer', 'N/A')}")

            # Print function input
            print("\nFunction Input:")
            function_input = data.get('Function Input')
            if isinstance(function_input, dict):
                for var, details in function_input.items():
                    print(f"  {var}:")
                    if isinstance(details, dict):
                        for k, v in details.items():
                            print(f"    {k}: {v}")
                    else:
                        # Scalar / list detail: show it verbatim.
                        print(f"    {details}")
            elif function_input:
                # Unexpected non-mapping payload: show it rather than crash.
                print(f"  {function_input}")

            print(separator)
            print("\n")

# Path to your dataset
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this exact directory exists; consider a configurable base directory.
dataset_path = "/home/XXX/Tracing/Annotation/C Benchmark Collection/RQ-1/Statement_Based_Analysis/output/Assignment.jsonl"

# Call the function
# Prints every record of the file to the cell output.
print_dataset_info(dataset_path)

Project: N/A
Index: N/A
Programming Language: C

Source Code:
   1 | 
   2 | void
   3 | fiber_init(int (*invoke)(fiber_func f, va_list ap))
   4 | {
   5 | 	page_size = small_getpagesize();
   6 | 	stack_direction = check_stack_direction(__builtin_frame_address(0));
   7 | 	fiber_invoke = invoke;
   8 | 	main_thread_id = pthread_self();
   9 | 	main_cord.loop = ev_default_loop(EVFLAG_AUTO | EVFLAG_ALLOCFD);
  10 | 	if (main_cord.loop == NULL)
  11 | 		panic("can't init event loop");
  12 | 	cord_create(&main_cord, "main");
  13 | 

Selected Statement:
fiber_invoke = invoke;
Selected Pointer: N/A

Function Input:


Project: N/A
Index: N/A
Programming Language: C

Source Code:
   1 | void
   2 | mempool_create_with_order(struct mempool *pool, struct slab_cache *cache,
   3 | 			  uint32_t objsize, uint8_t order)
   4 | {
   5 | 	assert(order <= cache->order_max);
   6 | 	pool->cache = cache;
   7 | 	slab_list_create(&pool->slabs);
   8 | 	mslab_tree_new(&pool->hot_slabs);
   9 | 	pool->