In [1]:
import os
import clang
from clang.cindex import *
from copy import deepcopy
import re
import random

In [2]:
# Point the clang bindings at the libclang shared library.
# The original machine-specific location stays the default; set the
# LIBCLANG_LIBRARY_FILE environment variable to run the notebook on another
# machine without editing this cell.
Config.set_library_file(
    os.environ.get(
        "LIBCLANG_LIBRARY_FILE",
        "/home/dipu/anaconda3/lib/python3.9/site-packages/clang/native/libclang.so",
    )
)

In [3]:
# Create the libclang index; `index` is reused later by
# generate_binary_operator_expression_dataset to parse every corpus file.
index = Index.create()
# Parse the small sample file "main.c" (resolved relative to the working directory).
tu = index.parse("main.c")
# Root cursor of the sample translation unit; starting point for the AST walks below.
root_cursor = tu.cursor

In [4]:
def get_spelling(node):
    """Return the operator symbol of an operator cursor, else the cursor's spelling.

    Args:
        node: a cursor exposing ``kind``, ``spelling``, ``get_children()`` and
            ``get_tokens()``.

    Returns:
        * ``UNARY_OPERATOR``: the operator token. Prefix operators are the
          first token; for a postfix ``++``/``--`` (first token is not a
          prefix operator, last token is ``++``/``--``) the last token is
          returned instead of the operand.
        * ``BINARY_OPERATOR``: the operator token located between the two
          operands, recovered from the token lists of the node and of its
          two children.
        * anything else: ``node.spelling``.

        When the operator cannot be located (no tokens, fewer than two
        children) ``node.spelling`` is returned instead of raising.
    """
    children = list(node.get_children())
    tokens_list = list(node.get_tokens())

    if node.kind == CursorKind.UNARY_OPERATOR:
        if not tokens_list:
            # nothing to inspect: fall back to the cursor spelling
            return node.spelling

        first = str(tokens_list[0].spelling)
        last = str(tokens_list[-1].spelling)
        prefix_operators = ("+", "-", "!", "~", "*", "&", "++", "--")

        # `x++` / `(*p)--`: the expression does not start with an operator,
        # so the operator of this node is the trailing token.
        if first not in prefix_operators and last in ("++", "--"):
            return last
        return first

    if node.kind == CursorKind.BINARY_OPERATOR:
        if len(children) < 2:
            # operands not exposed: the operator position cannot be derived
            return node.spelling

        left_list = list(children[0].get_tokens())
        right_list = list(children[1].get_tokens())
        left = "".join([token.spelling for token in left_list])
        right = "".join([token.spelling for token in right_list])
        tokens = "".join([token.spelling for token in tokens_list])

        if len(left_list) + len(right_list) == len(tokens_list):
            # strip one occurrence of each operand text; what remains is returned
            tokens = tokens.replace(left, "", 1)
            tokens = tokens.replace(right, "", 1)
            return tokens
        elif len(tokens_list) == 3:
            # simple `a op b`
            return tokens_list[1].spelling
        elif len(left_list) < len(tokens_list):
            # operator is the token right after the left operand
            return tokens_list[len(left_list)].spelling
        elif len(right_list) < len(tokens_list):
            # operator is the token right before the right operand
            return tokens_list[len(tokens_list)-len(right_list)-1].spelling
        else:
            return node.spelling

    return node.spelling

In [None]:
def print_ast(node, indent):
    """Print the cursor tree rooted at ``node``, one cursor per line.

    Each line is ``"  " * indent`` followed by ``get_spelling(node)``, a space
    and ``str(node.kind)``. Children are printed recursively with
    ``indent + 2``.

    Args:
        node: cursor to print.
        indent: indentation level of ``node`` (number of two-space units).
    """
    try:
        print("  "*indent + get_spelling(node) + " " + str(node.kind))
        for c in node.get_children():
            print_ast(c, indent+2)
    except ValueError:
        # Best effort: a cursor that raises ValueError is skipped so the rest
        # of the tree still prints.
        # NOTE(review): presumably raised by the bindings for cursor kinds
        # they do not recognise - confirm.
        pass

print_ast(root_cursor, 0)

In [6]:
# Per-token whitelist used by is_required_expression: a token may consist only of
# letters, digits, underscore, dot, comma, parentheses and operator characters.
# '?' and ':' are not in the class, so ternary expressions are rejected.
# '=' is in the class (it is part of ==, <=, >=, !=); a bare '=' token is
# rejected separately in is_required_expression.
expression_pattern = re.compile(r"^[a-zA-Z0-9_().,+\-*/%<>=!&|~\^]+$")

# '*' and '&' are ambiguous (binary operator vs. pointer dereference / address-of).
# They are counted as binary operators only when the previous token matches
# left_pattern (identifier, number or ')') and the next token matches
# right_pattern (identifier, number, '(', '!' or '~').
left_pattern = re.compile(r"^[a-zA-Z0-9_)]+$")
right_pattern = re.compile(r"^[a-zA-Z0-9_(!~]+$")

# operator tokens that are always counted (everything except the ambiguous * and &)
operator_list = ["+", "-", "/", "%", "++", "--", "<", "<=", ">", ">=", "==", "!=", "&&", "||", "!", "|", "<<", ">>", "~", "^"]

In [7]:
def is_required_expression(node):
    """Decide whether ``node`` is an expression worth mutating.

    The expression is accepted when every token passes ``expression_pattern``,
    no token is an assignment operator (plain or compound), and it contains at
    least two different operators that are not just ``+``/``-`` or just
    ``*``/``/``.

    Args:
        node: operator cursor exposing ``get_children()`` and ``get_tokens()``.

    Returns:
        bool: ``True`` when the expression should be used for sample generation.
    """
    children = list(node.get_children())

    # A childless node carries no operands; report it as unusable rather than
    # raising IndexError.
    if not children:
        return False

    # avoiding equals to ('=') operator: the first child being a bare
    # declaration reference is treated as the left side of an assignment
    if children[0].kind == CursorKind.DECL_REF_EXPR:
        return False

    # Compound assignments are made only of characters allowed by
    # expression_pattern, so they must be rejected explicitly.
    assignment_operators = {"=", "+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", "|=", "^="}

    tokens = [str(token.spelling) for token in node.get_tokens()]
    operator_set = set()

    for i, spell in enumerate(tokens):
        if (not expression_pattern.match(spell)) or (spell in assignment_operators):
            return False

        # '*' and '&' only count when both neighbours look like operands,
        # i.e. when they are used as binary operators
        is_binary_star_or_amp = (
            (spell == "*" or spell == "&")
            and 0 < i < len(tokens)-1
            and left_pattern.match(tokens[i-1])
            and right_pattern.match(tokens[i+1])
        )
        if (spell in operator_list) or is_binary_star_or_amp:
            operator_set.add(spell)

    # expression with at least two different operators is needed
    if len(operator_set) <= 1:
        return False
    elif len(operator_set) == 2:
        # excluding operations having only '+' and '-' as both have same precedence
        if ('+' in operator_set) and ('-' in operator_set):
            return False
        # excluding operations having only '*' and '/' as both have same precedence
        elif ('*' in operator_set) and ('/' in operator_set):
            return False

    return True

In [8]:
# Operator precedence table used by is_precedence_higher.
# A smaller number means the operator binds more tightly.
# Keys are operator spellings, so '-', '*' and '&' carry their binary-operator
# rank here; unary minus is special-cased to rank 1 inside is_precedence_higher.
precedence = {
    '++': 1, '--': 1, '!': 1, '~': 1,
    '*': 2, '/': 2, '%': 2,
    '+': 3, '-': 3,
    '<<': 4, '>>': 4,
    '<': 5, '>': 5, '<=': 5, '>=': 5,
    '==': 6, '!=': 6,
    '&': 7,
    '^': 8,
    '|': 9,
    '&&': 10,
    '||': 11
    }

def is_precedence_higher(op1_node, op2_node, parenthesis_at_right):
    """Tell whether the operator of ``op1_node`` binds tighter than that of ``op2_node``.

    Used to decide whether removing (or adding) parentheses around the
    ``op2_node`` expression changes how the whole expression is evaluated.

    Args:
        op1_node: operator cursor whose operator is compared first.
        op2_node: operator cursor compared against ``op1_node``.
        parenthesis_at_right: truthy when the parenthesised operand is the
            right operand; enables the ``%`` versus ``*``/``/`` special case.

    Returns:
        bool: ``True`` when the rank of op1 is strictly smaller (tighter) than
        the rank of op2, or when the ``%`` special case applies.
    """
    first_symbol = get_spelling(op1_node)
    second_symbol = get_spelling(op2_node)

    def rank(cursor, symbol):
        # unary minus shares the spelling of binary minus but binds like the
        # other unary operators; unknown symbols get the loosest rank
        if cursor.kind == CursorKind.UNARY_OPERATOR and symbol == "-":
            return 1
        return precedence.get(symbol, 999)

    first_rank = rank(op1_node, first_symbol)
    second_rank = rank(op2_node, second_symbol)

    # '%' mixed with '*' or '/' has equal rank, yet regrouping changes the
    # result; with the parentheses on the right, dropping them lets the
    # left-to-right associativity regroup the operands.
    multiplicative = ('*', '/')
    modulo_mix = (first_symbol == '%' and second_symbol in multiplicative) or \
                 (first_symbol in multiplicative and second_symbol == '%')
    if parenthesis_at_right and modulo_mix:
        return True

    return first_rank < second_rank

In [9]:
def buggy_sample_by_parenthesis_removal(node, parent_node, grandparent_node, replacable):
    """Collect parenthesised sub-expressions whose parentheses matter.

    Walks the subtree of ``node``. For every ``PAREN_EXPR`` that wraps a
    binary operator and is an operand of a binary/unary operator (directly, or
    through one ``UNEXPOSED_EXPR`` wrapper), the token spellings of the
    parenthesised expression (parentheses included) are appended to
    ``replacable`` when ``is_precedence_higher`` reports that the enclosing
    operator binds tighter than the wrapped one.

    Args:
        node: cursor currently visited.
        parent_node: parent of ``node`` (``None`` at the root).
        grandparent_node: parent of ``parent_node`` (``None`` near the root).
        replacable: output list, extended in place with lists of token spellings.
    """
    operator_kinds = (CursorKind.BINARY_OPERATOR, CursorKind.UNARY_OPERATOR)

    if node and node.kind == CursorKind.PAREN_EXPR:
        wrapped = list(node.get_children())
        wraps_binary = len(wrapped) > 0 and wrapped[0].kind == CursorKind.BINARY_OPERATOR

        if wraps_binary and parent_node:
            # enclosing operator, and the direct child of that operator which
            # holds the parenthesised expression
            if parent_node.kind in operator_kinds:
                enclosing, operand = parent_node, node
            elif parent_node.kind == CursorKind.UNEXPOSED_EXPR and grandparent_node \
                    and grandparent_node.kind in operator_kinds:
                enclosing, operand = grandparent_node, parent_node
            else:
                enclosing = operand = None

            if enclosing is not None and len(list(wrapped[0].get_children())) > 0:
                on_left_side = operand == list(enclosing.get_children())[0]
                if is_precedence_higher(enclosing, wrapped[0], not on_left_side):
                    # tokens of the whole parenthesised expression; the caller
                    # strips the outer parentheses
                    replacable.append([token.spelling for token in node.get_tokens()])

    for child in node.get_children():
        buggy_sample_by_parenthesis_removal(child, node, parent_node, replacable)

In [10]:
def buggy_sample_by_parenthesis_insertion(node, parent_node, grandparent_node, replacable):
    """Collect token runs around which new parentheses would change the grouping.

    Walks the subtree of ``node``. Whenever ``node`` is a binary operator whose
    parent is also a binary operator and ``is_precedence_higher(node,
    parent_node, None)`` holds (the child binds tighter than its parent),
    candidate token lists are appended to ``replacable``. Each candidate joins
    tokens of an operand on the parent's other side, the parent's operator, and
    tokens of the child's operand nearest to the parent operator.

    Args:
        node: cursor currently visited.
        parent_node: parent of ``node`` (``None`` at the root).
        grandparent_node: not read in this body; the recursion passes the
            current ``parent_node`` in this position.
        replacable: output list, extended in place with lists of token
            spellings (duplicates are possible).
    """
    
    # the `!= PAREN_EXPR` test is redundant once kind == BINARY_OPERATOR holds
    if node and node.kind == CursorKind.BINARY_OPERATOR and node.kind != CursorKind.PAREN_EXPR \
    and parent_node and parent_node.kind == CursorKind.BINARY_OPERATOR:
        
        # check if precedence of current node is greater than that of parent node
        if is_precedence_higher(node, parent_node, None):
            parent_children = list(parent_node.get_children())
            children = list(node.get_children())
            
            if len(parent_children) == 2 and len(children) == 2:
                temp_node = node
                temp_parent_node = parent_node
                # True when `node` is the right operand of its parent
                right_side_node = parent_children[1] == node

                # outer loop: descend into the child's operand that is adjacent
                # to the parent operator
                while (temp_node and temp_node.kind == CursorKind.BINARY_OPERATOR):
                    temp_children = list(temp_node.get_children())
                    # first step on the parent side: take the operand opposite to `node`
                    parent_children_index = 0 if right_side_node else 1

                    if len(temp_children) == 2:
                        # inner loop: descend on the parent side, each step
                        # taking a shorter operand adjacent to the parent operator
                        # NOTE(review): temp_parent_node is not re-initialised for
                        # later outer iterations, so this loop continues from where
                        # the previous outer iteration left it - confirm intended.
                        while (temp_parent_node and temp_parent_node.kind == CursorKind.BINARY_OPERATOR):
                            temp_parent_children = list(temp_parent_node.get_children())

                            if len(temp_parent_children) == 2:
                                # NOTE(review): the operator inserted below is always
                                # that of the original parent_node, also at deeper levels.
                                if right_side_node:
                                    # present node is right node of parent
                                    tokens_for_parenthesis_addition = [token.spelling for token in list(temp_parent_children[parent_children_index].get_tokens())] \
                                                                    + [get_spelling(parent_node)] \
                                                                    + [token.spelling for token in list(temp_children[0].get_tokens())]
                                    replacable.append(tokens_for_parenthesis_addition)
                                else:
                                    # present node is left node of parent
                                    tokens_for_parenthesis_addition = [token.spelling for token in list(temp_children[1].get_tokens())] \
                                                                    + [get_spelling(parent_node)] \
                                                                    + [token.spelling for token in list(temp_parent_children[parent_children_index].get_tokens())]
                                    replacable.append(tokens_for_parenthesis_addition)

                                # after the first step, follow the operand nearest
                                # to the parent operator
                                temp_parent_node = temp_parent_children[parent_children_index]
                                parent_children_index = 1 if right_side_node else 0
                            else:
                                break

                        # move to the child's operand next to the parent operator
                        temp_node = temp_children[0 if right_side_node else 1]
                    else:
                        break   
                     
                    
                """   
                # previous code
                if len(parent_children) == 2 and len(children) == 2:
                    if parent_children[1] == node:
                        present node is right node of parent
                        tokens_for_parenthesis_addition = [token.spelling for token in list(parent_children[0].get_tokens())] \
                                                        + [get_spelling(parent_node)] \
                                                        + [token.spelling for token in list(children[0].get_tokens())]
                        replacable.append(tokens_for_parenthesis_addition)
                    else:
                        present node is left node of parent
                        tokens_for_parenthesis_addition = [token.spelling for token in list(children[1].get_tokens())] \
                                                        + [get_spelling(parent_node)] \
                                                        + [token.spelling for token in list(parent_children[1].get_tokens())]
                        replacable.append(tokens_for_parenthesis_addition)
                """

                
    # visit every child with the ancestry shifted by one level
    for child in node.get_children():
        buggy_sample_by_parenthesis_insertion(child, node, parent_node, replacable)

In [11]:
def remove_parenthesis(full_list, remove_list):
    """Strip the outer parentheses of the first occurrence of ``remove_list``.

    ``remove_list`` is expected to be a parenthesised token run, i.e. its first
    and last elements are the parentheses to drop.

    Args:
        full_list: token spellings of the whole expression.
        remove_list: token spellings of the sub-expression, parentheses included.

    Returns:
        A new list in which the first and last token of the first occurrence
        of ``remove_list`` are removed. ``full_list`` itself is returned when
        there is no occurrence, or when ``remove_list`` has fewer than two
        tokens (no pair of parentheses to strip).
    """
    target = list(remove_list)
    span = len(target)
    if span < 2:
        return full_list

    # Only start positions where the whole run still fits are tried, so a
    # partial match near the end can no longer index past the list.
    for start in range(len(full_list) - span + 1):
        stop = start + span
        if list(full_list[start:stop]) == target:
            return full_list[:start] + full_list[start + 1:stop - 1] + full_list[stop:]

    return full_list

In [12]:
def insert_parenthesis(full_list, insert_list):
    """Wrap the first not-yet-wrapped occurrence of ``insert_list`` in parentheses.

    Args:
        full_list: token spellings of the whole expression.
        insert_list: token spellings of the run to parenthesise.

    Returns:
        A new list with ``"("`` and ``")"`` placed around the first occurrence
        of ``insert_list`` that is not already directly enclosed by a
        ``"("`` / ``")"`` pair; ``full_list`` itself when no such occurrence
        exists.
    """
    width = len(insert_list)
    total = len(full_list)

    for start in range(total):
        stop = start + width
        if stop > total:
            # the run no longer fits here nor at any later position
            break
        if any(full_list[start + offset] != insert_list[offset] for offset in range(width)):
            continue

        # the same expression can occur twice; skip an occurrence that
        # already sits between parentheses
        already_wrapped = (
            start > 0
            and stop < total
            and full_list[start - 1] == "("
            and full_list[stop] == ")"
        )
        if not already_wrapped:
            return full_list[:start] + ["("] + full_list[start:stop] + [")"] + full_list[stop:]

    return full_list

In [13]:
def get_binary_expressions(node, exp_list):
    """Collect (expression, buggy variant) pairs from the subtree of ``node``.

    The first binary/unary operator cursor on each path that passes
    ``is_required_expression`` is mutated; its own subtree is not searched for
    further expressions. Other cursors are searched recursively.

    Two mutations are attempted, each on a random non-empty subset of the
    candidates: removing parentheses and inserting parentheses. A dict with
    keys ``"expression"``, ``"buggy_sample"`` and ``"method"`` is appended to
    ``exp_list`` whenever the mutated token list differs from the original.

    Any exception raised while handling ``node`` is swallowed, which abandons
    the rest of the work for that cursor.

    Args:
        node: cursor to start from.
        exp_list: output list, extended in place.
    """

    def record_mutation(expression, candidates, apply_edit, method_name):
        # shuffle first so that the stable sort leaves equal-length
        # candidates in random order, then prefer longer candidates
        random.shuffle(candidates)
        ranked = sorted(candidates, key=len, reverse=True)

        # apply between 1 and len(ranked) edits, in ranked order
        edit_count = random.randrange(len(ranked)) + 1
        picked = sorted(random.sample(range(0, len(ranked)), edit_count))

        buggy_sample = expression.copy()
        for position in picked:
            buggy_sample = apply_edit(buggy_sample, ranked[position])

        if expression != buggy_sample:
            exp_list.append({
                "expression": expression,
                "buggy_sample": buggy_sample,
                "method": method_name
            })

    try:
        is_operator = node.kind == CursorKind.BINARY_OPERATOR or node.kind == CursorKind.UNARY_OPERATOR
        if is_operator and is_required_expression(node):
            expression = [token.spelling for token in node.get_tokens()]

            """
            Method 1: Change of precedence by removal of parenthesis
            """
            removal_candidates = []
            buggy_sample_by_parenthesis_removal(node, None, None, removal_candidates)
            if len(removal_candidates) > 0:
                record_mutation(expression, removal_candidates, remove_parenthesis, "parenthsis_removal")

            """
            Method 2: Change of precedence by insertion of parenthesis
            """
            insertion_candidates = []
            buggy_sample_by_parenthesis_insertion(node, None, None, insertion_candidates)
            if len(insertion_candidates) > 0:
                # remove duplicates
                unique_candidates = [list(t) for t in {tuple(sublist) for sublist in insertion_candidates}]
                record_mutation(expression, unique_candidates, insert_parenthesis, "parenthsis_insertion")

        else:
            for child in node.get_children():
                get_binary_expressions(child, exp_list)

    except Exception as e:
        # best effort: a cursor that fails is skipped silently
        pass

In [14]:
exp_list = []
get_binary_expressions(root_cursor, exp_list)
print(exp_list)

[{'expression': ['a', '/', '(', '(', 'b', '*', 'a', ')', '%', 'b', ')'], 'buggy_sample': ['a', '/', '(', 'b', '*', 'a', ')', '%', 'b'], 'method': 'parenthsis_removal'}, {'expression': ['a', '*', 'b', '+', 'c', '*', 'd'], 'buggy_sample': ['(', 'a', '*', 'b', '+', 'c', ')', '*', 'd'], 'method': 'parenthsis_insertion'}]


---
## Using C Code Corpus Dataset
---

In [15]:
def generate_binary_operator_expression_dataset(root_dir, skip_files=505_000, max_lines=10_000):
    """Mine every ``.c`` file under ``root_dir`` and append samples to the dataset file.

    For each extracted pair two tab-separated rows are appended to
    ``operator_precedence_bug_dataset.csv``: the original expression with
    label ``0`` and the mutated expression with label ``1``. Columns are
    expression, file path relative to ``root_dir``, method, label.

    Args:
        root_dir: root directory of the C corpus.
        skip_files: number of leading ``.c`` files (in ``os.walk`` order) to
            count but not process. The default keeps the original hard-coded
            resume offset; pass ``0`` to process everything.
        max_lines: files with more newline characters than this are ignored.

    Side effects:
        Appends to the dataset file, prints progress every 1000 files, and
        stores the path of the file being handled in the module-level
        ``current_file``. Uses the module-level libclang ``index``.
    """
    global current_file
    total_files, total_samples = 0, 0

    with open("operator_precedence_bug_dataset.csv", "a") as dataset:
#         dataset.write(f"operator_expression\tfile_path\tmethod\tlabel\n")

        for root, dirs, files in os.walk(root_dir):
            for file in files:
                if not file.endswith(".c"):
                    continue

                total_files += 1
                file_path = os.path.join(root, file)
                current_file = file_path

                if total_files > skip_files:
                    # `Exception` instead of a bare except, so Ctrl-C
                    # (KeyboardInterrupt) can still stop the long run.
                    try:
                        # Count real newline bytes; counting on str(bytes)
                        # also counted literal "\n" escapes in C strings.
                        with open(file_path, 'rb') as f:
                            line_count = f.read().count(b"\n")

                        # ignoring .c files having more than max_lines lines of code
                        if line_count <= max_lines:
                            start_cursor = index.parse(file_path).cursor

                            extracted_samples = []
                            get_binary_expressions(start_cursor, extracted_samples)

                            # path relative to the corpus root, independent of
                            # where the corpus is stored
                            filepath = os.path.relpath(file_path, root_dir)

                            for sample in extracted_samples:
                                # positive_sample = buggy sample, negative_sample = non-buggy sample
                                positive_sample = " ".join(sample["buggy_sample"])
                                negative_sample = " ".join(sample["expression"])
                                dataset.write(f"\n{negative_sample}\t{filepath}\t{sample['method']}\t0")
                                dataset.write(f"\n{positive_sample}\t{filepath}\t{sample['method']}\t1")

                                total_samples += 2

                    except Exception as error:
                        # best effort: report the failing file and continue
                        print("---Error occurred---", file_path, error)

                if total_files % 1000 == 0:
                    print("Total files:", total_files, ",", "Total samples:", total_samples)

In [None]:
root_dir = '/home/dipu/Documents/AI/MinorProject/c-corpus/'

generate_binary_operator_expression_dataset(root_dir)

---
## Dataset testing
---

In [118]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [119]:
df = pd.read_csv("operator_precedence_bug_dataset.csv", sep="\t")

In [120]:
df

Unnamed: 0,operator_expression,file_path,method,label
0,( index + 1 ) % 4,cleaned/meridian59/club/util.c,parenthsis_removal,0
1,index + 1 % 4,cleaned/meridian59/club/util.c,parenthsis_removal,1
2,( 1 << FIX_DECIMAL ) * shrink,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,0
3,1 << FIX_DECIMAL * shrink,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,1
4,( 1 << FIX_DECIMAL ) * shrink,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,0
...,...,...,...,...
2987025,( cz < ( 2 || cz ) ) > 3,cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,1
2987026,tzsign * ( 3600 * tzhh + 60 * tzmm + tzss ),cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_removal,0
2987027,tzsign * 3600 * tzhh + 60 * tzmm + tzss,cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_removal,1
2987028,tzsign * ( 3600 * tzhh + 60 * tzmm + tzss ),cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,0


In [121]:
# rename column from "label" to "labels"
df = df.rename(columns={"label": "labels"})

In [122]:
df

Unnamed: 0,operator_expression,file_path,method,labels
0,( index + 1 ) % 4,cleaned/meridian59/club/util.c,parenthsis_removal,0
1,index + 1 % 4,cleaned/meridian59/club/util.c,parenthsis_removal,1
2,( 1 << FIX_DECIMAL ) * shrink,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,0
3,1 << FIX_DECIMAL * shrink,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,1
4,( 1 << FIX_DECIMAL ) * shrink,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,0
...,...,...,...,...
2987025,( cz < ( 2 || cz ) ) > 3,cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,1
2987026,tzsign * ( 3600 * tzhh + 60 * tzmm + tzss ),cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_removal,0
2987027,tzsign * 3600 * tzhh + 60 * tzmm + tzss,cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_removal,1
2987028,tzsign * ( 3600 * tzhh + 60 * tzmm + tzss ),cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,0


In [123]:
df.isnull().sum()

operator_expression    0
file_path              0
method                 0
labels                 0
dtype: int64

In [124]:
df["method"].value_counts()

parenthsis_insertion    1706628
parenthsis_removal      1280402
Name: method, dtype: int64

In [125]:
df["operator_expression"].nunique()

634961

In [126]:
# Keep one row per distinct expression text (pandas keeps the first occurrence by default).
# NOTE(review): if the same text occurs with both labels, only the first
# row's label survives - confirm this is intended.
df = df.drop_duplicates(subset=['operator_expression'])
# Resetting the index
df = df.reset_index(drop=True)

In [127]:
df

Unnamed: 0,operator_expression,file_path,method,labels
0,( index + 1 ) % 4,cleaned/meridian59/club/util.c,parenthsis_removal,0
1,index + 1 % 4,cleaned/meridian59/club/util.c,parenthsis_removal,1
2,( 1 << FIX_DECIMAL ) * shrink,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,0
3,1 << FIX_DECIMAL * shrink,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,1
4,( ( int ) ( ( OFFSCREEN_BITMAP_SIZE - x - 1 ) ...,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,0
...,...,...,...,...
634956,v * 10 + ( double ) * str,cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,0
634957,v * ( 10 + ( double ) * str ),cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,1
634958,- ( ( hundredths + seconds + minutes * 60 + ho...,cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,1
634959,( hundredths + ( minutes * ( 60 + hours ) ) ) ...,cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,1


In [128]:
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

df["word_count"] = df["operator_expression"].apply(count_words)

df["word_count"].mean()

14.055056924756009

In [129]:
(df["word_count"] > 100).sum()

1016

In [130]:
"""Remove operator expression containing more than 100 words"""

# token count per expression (recomputed so the column exists in this cell)
df["word_count"] = df["operator_expression"].apply(count_words)

# keep expressions of at most 100 tokens, then drop the helper column from the result
df_filtered = df.loc[df["word_count"].le(100)].drop(columns=['word_count'])

In [131]:
df_filtered = df_filtered.reset_index(drop=True)
df_filtered

Unnamed: 0,operator_expression,file_path,method,labels
0,( index + 1 ) % 4,cleaned/meridian59/club/util.c,parenthsis_removal,0
1,index + 1 % 4,cleaned/meridian59/club/util.c,parenthsis_removal,1
2,( 1 << FIX_DECIMAL ) * shrink,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,0
3,1 << FIX_DECIMAL * shrink,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,1
4,( ( int ) ( ( OFFSCREEN_BITMAP_SIZE - x - 1 ) ...,cleaned/meridian59/bbgun/draw.c,parenthsis_removal,0
...,...,...,...,...
633940,v * 10 + ( double ) * str,cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,0
633941,v * ( 10 + ( double ) * str ),cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,1
633942,- ( ( hundredths + seconds + minutes * 60 + ho...,cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,1
633943,( hundredths + ( minutes * ( 60 + hours ) ) ) ...,cleaned/notepadthing/build/psycopg2/psycopg/ty...,parenthsis_insertion,1


In [132]:
df_filtered["method"].value_counts()

parenthsis_insertion    419554
parenthsis_removal      214391
Name: method, dtype: int64

In [133]:
df_filtered["labels"].value_counts()

1    381512
0    252433
Name: labels, dtype: int64

In [139]:
"""Splitting train and test set in a way that two pairs of positive and negative samples remain on same set
so that there is no data leakage in test set and the test result remains uninfluenced/accurate"""

# The split is made over source files, so all samples mined from one file
# land in the same partition.
unique_file_paths = set(df_filtered['file_path'])

# Splitting the unique_file_paths into train and test sets.
# The paths are sorted first: iterating a set of strings follows a hash order
# that changes between interpreter runs, so random_state alone would not
# reproduce the same split.
train_validation_values, test_values = train_test_split(sorted(unique_file_paths), test_size=0.1, random_state=40)

# Filtering the original DataFrame based on the train and test values
train_validation_df = df_filtered[df_filtered['file_path'].isin(train_validation_values)]
test_df = df_filtered[df_filtered['file_path'].isin(test_values)]

In [140]:
train_validation_df.size/df_filtered.size, test_df.size/df_filtered.size

(0.9006790809928306, 0.0993209190071694)

In [141]:
"""train and validation set split"""

unique_file_paths = set(train_validation_df['file_path'])

# Splitting the remaining file paths into train and validation sets.
# test_size=0.11111111 of the remaining 90% is about 10% of all files.
# The paths are sorted first: iterating a set of strings follows a hash order
# that changes between interpreter runs, so random_state alone would not
# reproduce the same split.
train_values, validation_values = train_test_split(sorted(unique_file_paths), test_size=0.11111111, random_state=42)

# Filtering the DataFrame based on the train and validation values
train_df = train_validation_df[train_validation_df['file_path'].isin(train_values)]
validation_df = train_validation_df[train_validation_df['file_path'].isin(validation_values)]

In [142]:
train_df.size/df_filtered.size, validation_df.size/df_filtered.size

(0.8006909116721482, 0.0999881693206824)

In [143]:
# Shuffle the rows; the fixed random_state makes the saved row order reproducible.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
train_df

Unnamed: 0,operator_expression,file_path,method,labels
0,nicvga & 0x10 == 0x10,cleaned/u-boot-imx6/board/mpl/pip405/pip405.c,parenthsis_removal,1
1,t != EOF ( && p ) != EOF,cleaned/cairogles/test/cairo-test.c,parenthsis_insertion,1
2,done == 0 && ( arrcnt == ( 3 || arrcnt ) == 4 ),cleaned/wcb2/remote/remote.c,parenthsis_insertion,1
3,( pc -> stat != 0 && retries ) >= 0,cleaned/los/src/linux/src/drivers/block/ide-cd.c,parenthsis_insertion,1
4,seq * ( 2 - is_ccs ),cleaned/freebsd/crypto/openssl/ssl/d1_both.c,parenthsis_insertion,1
...,...,...,...,...
507589,0.622 * ( vp / ( ( pressure - 0.378 ) * vp ) ),cleaned/vic/src/lakes.eb.c,parenthsis_insertion,1
507590,( length_remaining < length ) + 4,cleaned/gluster-wireshark-1.4/epan/dissectors/...,parenthsis_insertion,1
507591,strlen ( ifn ) == ( sdl -> sdl_nlen && strncmp...,cleaned/freebsd/sbin/natd/natd.c,parenthsis_insertion,1
507592,( ( ( bitmask >> 30 ) & 0x3 ) == ( 0x2 && ( ( ...,cleaned/freebsd/sys/sparc64/sparc64/db_disasm.c,parenthsis_insertion,1


In [144]:
# Shuffle the rows; the fixed random_state makes the saved row order reproducible.
validation_df = validation_df.sample(frac=1, random_state=42).reset_index(drop=True)
validation_df

Unnamed: 0,operator_expression,file_path,method,labels
0,N * h + ++ v,cleaned/uml-auto-assessment/output_comparison_...,parenthsis_insertion,0
1,i < sizeof ( bt ) / sizeof ( uint ),cleaned/asuswrt-merlin/release/src-rt/linux/li...,parenthsis_insertion,0
2,( second_byte >= ( 0x34 && second_byte ) ) <= ...,cleaned/open-watcom/bld/wdisasm/c/docode.c,parenthsis_insertion,1
3,2 * ( j + 5 ),cleaned/fontforge/fonttools/showttf.c,parenthsis_insertion,1
4,! buffer && provider == vws -> pools . gmr_fenced,cleaned/mesa/src/gallium/winsys/svga/drm/vmw_s...,parenthsis_insertion,0
...,...,...,...,...
63382,proto == ptr -> fw_prot && startport <= eport ...,cleaned/miniupnp/miniupnpd/ipfw/ipfwrdr.c,parenthsis_insertion,0
63383,( 2 + 3 ) * i915 -> current . sampler_enable_nr,cleaned/mesa/src/gallium/drivers/i915/i915_sta...,parenthsis_insertion,1
63384,40 * ( val1 + val2 ),cleaned/aolserver/nsssl/x509.c,parenthsis_insertion,1
63385,print_progress && ! quiet && i & 63 == 0,cleaned/haiku/src/libs/print/libgutenprint/tes...,parenthsis_removal,1


In [145]:
# Shuffle the rows; the fixed random_state makes the saved row order reproducible.
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df

Unnamed: 0,operator_expression,file_path,method,labels
0,c -> sending == ( 0 && packetqueue_len ( & c -...,cleaned/contiki/core/net/rime/collect.c,parenthsis_insertion,1
1,dmf > 32 * 64,cleaned/x264-devel/encoder/macroblock.c,parenthsis_insertion,0
2,10 * ( timeval_to_ns ( & tnow ) - timeval_to_n...,cleaned/msm7x30-3.4.x-naa/drivers/gpu/drm/i915...,parenthsis_removal,0
3,repeatptr -> pos1 + ( repeatptr -> len > 0 ),cleaned/genometools/src/ltr/ltrharvest_stream.c,parenthsis_insertion,1
4,pred_count % 2 == 0,cleaned/freebsd/contrib/subversion/subversion/...,parenthsis_insertion,0
...,...,...,...,...
62959,bytestream2_get_bytes_left ( & ctx -> g ) < 25...,cleaned/mythtv/mythtv/external/FFmpeg/libavcod...,parenthsis_insertion,0
62960,( ! leader && ( ! intermed && command ) ) < 0x70,cleaned/rectty/libvterm/bin/vterm-dump.c,parenthsis_insertion,1
62961,( color_remap + ( linelen * ( blueline + 2 ) )...,cleaned/sane-backends/backend/sm3840_scan.c,parenthsis_insertion,1
62962,"sect -> type != ElfSectNobits && map ( obj , s...",cleaned/coutune/compiler/9c/8l/ld/ldelf.c,parenthsis_insertion,0


In [152]:
train_df["labels"].value_counts()

1    305560
0    202034
Name: labels, dtype: int64

In [146]:
# Persist the three partitions as tab-separated files in the working
# directory, without the DataFrame index column.
train_df.to_csv("train__operator_precedence_bug_full_dataset_preprocessed.tsv", sep="\t", index=False)
validation_df.to_csv("validation__operator_precedence_bug_full_dataset_preprocessed.tsv", sep="\t", index=False)
test_df.to_csv("test__operator_precedence_bug_full_dataset_preprocessed.tsv", sep="\t", index=False)