# Data preprocessing:

### Important words identification

We created a frequency distribution of the words in the tool logs to identify the relevant ones in the dataset.

In [15]:
from os import listdir
import pandas

# Directory holding the tool logs to analyze; the generated CSV files are
# written here as well. The trailing slash is required because file paths are
# built by plain string concatenation (f"{dataset_dir}{name}").
dataset_dir = "collected_code/heal_round_15/healed/"

def extract_tokens() -> list[str]:
    """Collect every space-separated token from the tool logs in ``dataset_dir``.

    Only files whose name ends in ``.cppcheck.txt``, ``.flawfinder.txt`` or
    ``.gcc.txt`` are read. Each file is cut into lines on the newline
    character and each line is split on a single space, so blank lines and
    runs of spaces contribute empty-string tokens.

    Returns:
        All tokens, following the directory-listing order and, within a file,
        the order in which they appear.
    """
    log_suffixes = (".cppcheck.txt", ".flawfinder.txt", ".gcc.txt")
    collected = []
    for name in listdir(dataset_dir):
        if not name.endswith(log_suffixes):
            continue
        with open(f"{dataset_dir}{name}", "r") as handle:
            content = handle.read()
        for row in content.split("\n"):
            collected.extend(row.split(" "))
    return collected


def calculate_frequency_distribution(list: list) -> list[dict]:
    """Count how many times each distinct item occurs in ``list``.

    Args:
        list: The items (here: log tokens) to count. The name shadows the
            built-in inside this function only.

    Returns:
        One ``{"word": item, "frequency": count}`` dict per distinct item, in
        order of first appearance.
    """
    counts = {}
    for item in list:
        counts[item] = counts.get(item, 0) + 1
    return [
        {"word": item, "frequency": count}
        for item, count in counts.items()
    ]

# Tokenize every tool log and count how often each token appears.
tokens = extract_tokens()
words_fd = calculate_frequency_distribution(tokens)

# Save the distribution next to the logs for manual inspection, then preview it.
words_fd_df = pandas.DataFrame(words_fd)
words_fd_df.to_csv(f'{dataset_dir}word_fd_healed.csv', index=False)
words_fd_df.head()

Unnamed: 0,word,frequency
0,collected_code/heal_round_12/healed/problem-51...,1
1,In,1
2,function,32
3,'tupleToList':,2
4,collected_code/heal_round_12/healed/problem-51...,1


After manual analysis, we have found the following relevant words:

In [16]:
# Tokens that mark a relevant finding in the tool logs. The gcc/cppcheck
# extractor emits one record per (log line, word) match while iterating this
# list, so every word must appear exactly once: a repeated entry would double
# the number of occurrences recorded for it.
interesting_words = [
    # cppcheck
    "[premium-bughuntingUninitNonConstArg]",
    "[premium-bughuntingUninit]",
    "[uninitvar]",
    "[zerodiv]",
    "[arrayIndexOutOfBounds]",
    "[legacyUninitvar]",
    "[negativeIndex]",
    "[integerOverflow]",
    "[syntaxError]",
    "[invalidFunctionArg]",
    "[uninitStructMember]",
    "[uninitdata]",
    "[overlappingWriteFunction]",
    # gcc
    "[-Wimplicit-function-declaration]",
    "[-Wbuiltin-declaration-mismatch]",
    "[-Wanalyzer-possible-null-dereference]",    # [CWE-690]
    "[-Wincompatible-pointer-types]",
    "[-Wsizeof-array-argument]",
    "[-Wanalyzer-null-dereference]",             # [CWE-476]
    "[-Wanalyzer-malloc-leak]",                  # [CWE-401]
    "[-Wanalyzer-write-to-string-literal]",
    "[-Wanalyzer-out-of-bounds]",                # [CWE-126] and [CWE-127]
    "[-Wanalyzer-use-of-uninitialized-value]",   # [CWE-457]
    "[-Wint-conversion]",
    "[-Wanalyzer-null-argument]",                # [CWE-476]
    "[-Wreturn-local-addr]",
    "[-Wanalyzer-free-of-non-heap]",             # [CWE-590]
    # flawfinder
    "(CWE-120, CWE-20)",
    "(CWE-126)",
    "(CWE-120)",
    "(CWE-190)",
]

### CWE Classification

Then, we analyzed the words to determine which CWEs each of them should be classified under:

In [17]:
cwe_mapping = {
    20: [  # https://cwe.mitre.org/data/definitions/20.html
        "(CWE-120, CWE-20).",
    ],
    119: [  # https://cwe.mitre.org/data/definitions/119.html
        "[arrayIndexOutOfBounds]",
        "(CWE-119!/CWE-120).",
    ],
    120: [  # https://cwe.mitre.org/data/definitions/120.html
        "(CWE-120, CWE-20)",
        "(CWE-119!/CWE-120)",
        "(CWE-120)",
    ],
    123: [ # https://cwe.mitre.org/data/definitions/123.html
        "[-Wanalyzer-write-to-string-literal]", # might also be mapped as CWE-134 maybe.
    ],
    124: [  # https://cwe.mitre.org/data/definitions/124.html
        "[CWE-124]",
    ],
    126: [  # https://cwe.mitre.org/data/definitions/126.html
        "(CWE-126)",
        "[-Wanalyzer-out-of-bounds]",
    ],
    127: [  # https://cwe.mitre.org/data/definitions/127.html
        "[-Wanalyzer-out-of-bounds]",
    ],
    129: [  # https://cwe.mitre.org/data/definitions/129.html
        "[negativeIndex]",
    ],
    190: [  # https://cwe.mitre.org/data/definitions/190.html
        "[integerOverflow]",
        "(CWE-190)",
    ],
    369: [  # https://cwe.mitre.org/data/definitions/369.html
        "[zerodiv]",
    ],
    401: [  # https://cwe.mitre.org/data/definitions/401.html
        "[-Wanalyzer-malloc-leak]",
        "[memleak]",
    ],
    457: [  # https://cwe.mitre.org/data/definitions/457.html
        "[uninitvar]",
        "[legacyUninitvar]",
        "[uninitStructMember]",
        "[-Wanalyzer-use-of-uninitialized-value]",
    ],
    467: [  # https://cwe.mitre.org/data/definitions/467.html
        "[-Wsizeof-array-argument]",
    ],
    476: [  # https://cwe.mitre.org/data/definitions/476.html
        "[-Wanalyzer-null-dereference]",
        "[-Wanalyzer-null-argument]",
    ],
    562: [  # https://cwe.mitre.org/data/definitions/562.html
        "[-Wreturn-local-addr]",
    ],
    590: [  # https://cwe.mitre.org/data/definitions/590.html
        "[-Wanalyzer-free-of-non-heap]",
    ],
    628: [  # https://cwe.mitre.org/data/definitions/628.html
        "[invalidFunctionArg]",
    ],
    686: [  # https://cwe.mitre.org/data/definitions/686.html
        "[-Wimplicit-function-declaration]",
        "[-Wbuiltin-declaration-mismatch]",
    ],
    690: [  # https://cwe.mitre.org/data/definitions/690.html
        "[-Wanalyzer-possible-null-dereference]",
    ],
    704: [  # https://cwe.mitre.org/data/definitions/704.html
        "[-Wint-conversion]",
    ],
    843: [  # https://cwe.mitre.org/data/definitions/843.html
        "[-Wincompatible-pointer-types]",
    ],
    908: [  # https://cwe.mitre.org/data/definitions/908.html
        "[uninitdata]",
    ],
    1260: [  # https://cwe.mitre.org/data/definitions/1260.html
        "[overlappingWriteFunction]",
    ],
}



# Coverage check: print every interesting word that has not been assigned to
# any CWE in cwe_mapping.
values = [token for group in cwe_mapping.values() for token in group]
for i in interesting_words:
    if i not in values:
        print(i)

[premium-bughuntingUninitNonConstArg]
[premium-bughuntingUninit]
[syntaxError]


With the relevant words identified, we need to check which words occurred on which lines of which tool logs:

In [18]:
from os import listdir
import re

def extract_flawfinder_ocurrences(input: str, filename: str) -> list[dict[str, str | int]]:
    output = list()
    results_block = input.split("RESULTS:")[1].split("ANALYSIS")[0]
    if results_block.strip() == "":
        return output
    errors = results_block.replace("\n","").split("collected")
    for error in errors:
        if "(CWE-" in error:
            output.append({
                "tool": filename.split(".")[1],
                "token": re.findall(r"(\(CWE-.*?\))", error)[0],
                "file": filename,
                "problem_number": int(filename.split("-")[1].split(".")[0]),
                "line_number": int(error.split(":")[1]),
            })
    return output

def extract_gcc_cppcheck_ocurrences(input: str, filename: str) -> list[dict[str, str | int]]:
    """Extract one record per (log line, interesting word) match.

    Every line of ``input`` is checked against each entry of
    ``interesting_words`` by substring search; a line that contains several
    words yields several records.

    Args:
        input: Full text of a gcc or cppcheck log.
        filename: Log file name, expected to look like
            ``problem-<number>.<tool>.txt``.

    Returns:
        A list of dicts with the keys ``tool``, ``token``, ``file``,
        ``problem_number`` and ``line_number``. The line number is the second
        ``:``-separated field of the matching log line.
    """
    return [
        {
            "tool": filename.split(".")[1],
            "token": word,
            "file": filename,
            "problem_number": int(filename.split("-")[1].split(".")[0]),
            "line_number": int(line.split(":")[1]),
        }
        for line in input.split("\n")
        for word in interesting_words
        if word in line
    ]


def count_ocurrences() -> list[dict]:
    """Run the matching extractor over every tool log in ``dataset_dir``.

    Files ending in ``.flawfinder.txt`` go through
    ``extract_flawfinder_ocurrences``; files ending in ``.cppcheck.txt`` or
    ``.gcc.txt`` go through ``extract_gcc_cppcheck_ocurrences``. Any other
    file is ignored.

    Returns:
        The concatenated occurrence records, in directory-listing order.
    """
    found = list()
    for name in listdir(dataset_dir):
        if name.endswith(".flawfinder.txt"):
            parse = extract_flawfinder_ocurrences
        elif name.endswith((".cppcheck.txt", ".gcc.txt")):
            parse = extract_gcc_cppcheck_ocurrences
        else:
            continue
        with open(f"{dataset_dir}{name}", "r") as handle:
            content = handle.read()
        found.extend(parse(content, name))
    return found

# Extract every occurrence from the logs, save the table next to them and
# preview the first rows.
ocurrences = count_ocurrences()
ocurrences_df = pandas.DataFrame(ocurrences)
ocurrences_df.to_csv(f'{dataset_dir}word_ocurrences_healed.csv', index=False)
ocurrences_df.head()

Unnamed: 0,tool,token,file,problem_number,line_number
0,gcc,[-Wanalyzer-malloc-leak],problem-513.gcc.txt,513,12
1,flawfinder,(CWE-120),problem-973.flawfinder.txt,973,21
2,flawfinder,(CWE-120),problem-973.flawfinder.txt,973,22
3,flawfinder,(CWE-126),problem-973.flawfinder.txt,973,10
4,flawfinder,(CWE-120),problem-973.flawfinder.txt,973,23


### Dataset building

Then we build a frequency distribution of the occurrences by problem, CWE, and tool:

In [19]:
def fd_cwes_by_problem(ocurrences: list[dict]) -> list[dict]:
    """Count occurrences per (problem, CWE, tool) combination.

    An occurrence is counted under every CWE whose token list in
    ``cwe_mapping`` contains its ``token``; occurrences whose token is mapped
    to no CWE are ignored.

    Args:
        ocurrences: Records carrying at least the keys ``token``,
            ``problem_number`` and ``tool``.

    Returns:
        One dict per combination, in order of first appearance, with the keys
        ``problem``, ``cwe`` and ``tool`` (all strings) and ``frequency``.
    """
    counts = {}
    for record in ocurrences:
        for cwe_id, mapped_tokens in cwe_mapping.items():
            if record["token"] not in mapped_tokens:
                continue
            bucket = f'{record["problem_number"]}:{cwe_id}:{record["tool"]}'
            counts[bucket] = counts.get(bucket, 0) + 1
    rows = list()
    for bucket, frequency in counts.items():
        parts = bucket.split(":")
        rows.append(
            {
                "problem": parts[0],
                "cwe": parts[1],
                "tool": parts[2],
                "frequency": frequency,
            }
        )
    return rows

# Build the per-problem CWE distribution, save it next to the logs and preview it.
fd = fd_cwes_by_problem(ocurrences)
fd_df = pandas.DataFrame(fd)
fd_df.to_csv(f'{dataset_dir}cwes_by_problem_healed.csv', index=False)
fd_df.head()

Unnamed: 0,problem,cwe,tool,frequency
0,513,401,gcc,1
1,973,120,flawfinder,3
2,973,126,flawfinder,1
3,377,120,flawfinder,2
4,377,126,flawfinder,1


Another interesting view is to group the CWEs by their pillars, i.e. the top-level weaknesses reached by following `ChildOf` links in CWE view 1000:

In [20]:
from cwe2.database import Database
from cwe2.weakness import Weakness

def fd_pillars_by_problem(ocurrences: list[dict]) -> list[dict]:
    """Count occurrences per (problem, CWE, tool) and report each CWE's pillar.

    The counting is delegated to ``fd_cwes_by_problem`` so both views always
    agree; this function only swaps each CWE id for the id of its pillar as
    resolved by ``get_cwe_pillar``.

    Args:
        ocurrences: Records carrying at least the keys ``token``,
            ``problem_number`` and ``tool``.

    Returns:
        One dict per (problem, CWE, tool) combination with the keys
        ``problem``, ``cwe`` (the pillar id), ``tool`` and ``frequency``.
    """
    # NOTE(review): rows are not merged after the pillar lookup, so two CWEs
    # that share a pillar produce two rows with the same (problem, cwe, tool)
    # values. Confirm whether they should be summed instead.
    pillar_by_cwe = {}
    fd = list()
    for row in fd_cwes_by_problem(ocurrences):
        cwe_id = int(row["cwe"])
        # Resolve each CWE only once: get_cwe_pillar opens the CWE database
        # and walks the parent chain on every call.
        if cwe_id not in pillar_by_cwe:
            pillar_by_cwe[cwe_id] = get_cwe_pillar(cwe_id)
        fd.append(
            {
                "problem": row["problem"],
                "cwe": pillar_by_cwe[cwe_id],
                "tool": row["tool"],
                "frequency": row["frequency"],
            }
        )
    return fd

def get_cwe_pillar(cwe: int):
    """Return the ``cwe_id`` of the topmost ancestor of the given CWE.

    Starting from ``cwe``, the parent reported by ``find_father`` is followed
    repeatedly until a weakness without a parent is reached; that weakness's
    ``cwe_id`` is returned. A CWE without a parent is its own result.

    Args:
        cwe: Numeric CWE id to start from.
    """
    db = Database()
    node = db.get(cwe)
    while True:
        parent_id = find_father(node)
        if not parent_id:
            break
        node = db.get(parent_id)
    return node.cwe_id


def find_father(cwe: Weakness):
    """Return the id of the weakness's ``ChildOf`` parent in view 1000.

    ``cwe.related_weaknesses`` is read as ``::``-separated entries whose
    ``:``-separated fields hold the relation nature at index 1, the related
    CWE id at index 3 and the view id at index 5.

    Returns:
        The parent id as ``int`` for the first matching entry, or ``None``
        when no entry matches.
    """
    for entry in cwe.related_weaknesses.split("::"):
        if entry == '':
            continue
        fields = entry.split(":")
        if fields[1] != 'ChildOf':
            continue
        if fields[5] != '1000':
            continue
        return int(fields[3])
    return None

# Build the per-problem pillar distribution, save it next to the logs and preview it.
pillar_fd = fd_pillars_by_problem(ocurrences)
pillar_fd_df = pandas.DataFrame(pillar_fd)
pillar_fd_df.to_csv(f'{dataset_dir}cwes_by_pillar_healed.csv', index=False)
pillar_fd_df.head()

Unnamed: 0,problem,cwe,tool,frequency
0,513,664,gcc,1
1,973,664,flawfinder,3
2,973,664,flawfinder,1
3,377,664,flawfinder,2
4,377,664,flawfinder,1


In [21]:
# Spread the pillar distribution into a wide layout: every distinct
# "<field>_<value>" pair (e.g. "cwe_664", "tool_gcc") becomes a column that
# holds the row's frequency, and 0 where the pair does not apply to that row.
exploded_fd: list[dict] = []
fields_set = set()
for item in pillar_fd:
    new_item = {'problem': item['problem']}
    for field in item.keys():
        if field not in ('problem', 'frequency'):
            column = f'{field}_{item[field]}'
            fields_set.add(column)
            new_item[column] = item['frequency']
    exploded_fd.append(new_item)

# Fill the missing columns in sorted order. Iterating the set directly made
# the CSV column order depend on string hash randomization, so it could change
# from one kernel restart to the next.
ordered_fields = sorted(fields_set)
for item in exploded_fd:
    for field in ordered_fields:
        if field not in item:
            item[field] = 0

exploded_fd_df = pandas.DataFrame(exploded_fd)
exploded_fd_df.to_csv(f'{dataset_dir}exploded_cwes_tools_healed.csv', index=False)