# Data preprocessing:

### Important words identification

We created a frequency distribution of words to identify the relevant ones on the dataset.

In [2]:
from os import listdir
import pandas

dataset_dir = "collected_code/"

def extract_tokens() -> list[str]:
    code_files = listdir(dataset_dir)
    all_tokens = []
    for code_file in code_files:
        if (
            code_file.endswith(".cppcheck.txt")
            or code_file.endswith(".flawfinder.txt")
            or code_file.endswith(".gcc.txt")
        ):
            with open(f"{dataset_dir}{code_file}", "r") as open_file:
                wholefile = open_file.read()
                lines = wholefile.split("\n")
                for line in lines:
                    tokens = line.split(" ")
                    all_tokens.extend(tokens)
    return all_tokens


def calculate_frequency_distribution(list: list) -> list[dict]:
    distribution = dict()
    for word in list:
        if word in distribution.keys():
            distribution[word] += 1
        else:
            distribution[word] = 1
    fd = []
    for key in distribution.keys():
        fd.append(
            {
                "word": key,
                "frequency": distribution[key],
            }
        )
    return fd

tokens = extract_tokens()
words_fd = calculate_frequency_distribution(tokens)

words_fd_df = pandas.DataFrame(words_fd)
words_fd_df.to_csv('word_fd.csv', index=False)
words_fd_df.head()

Unnamed: 0,word,frequency
0,Checking,152
1,collected_code/problem-493-0.c,2
2,...,152
3,,285840
4,collected_code/problem-867-0.c:6:19:,1


After manual analysis, we have found the following relevant words:

In [3]:
interesting_words = [
    "(CWE-119!/CWE-120).",  #
    "(CWE-120, CWE-20).",  #
    "(CWE-120).",  #
    "(CWE-126).",  #
    "(CWE-190).",  #
    "[-Wanalyzer-write-to-string-literal]",
    "[-Wbuiltin-declaration-mismatch]",  #
    "[-Wimplicit-function-declaration]",  #
    "[-Wincompatible-pointer-types]",  #
    "[-Wint-conversion]",  #
    "[-Wreturn-local-addr]",  #
    "[-Wsizeof-array-argument]",  #
    "[arrayIndexOutOfBounds]",  #
    "[CWE-124]",  #
    "[CWE-126]",  #
    "[CWE-127]",  #
    "[CWE-401]",  #
    "[CWE-457]",  #
    "[CWE-476]",  #
    "[CWE-590]",  #
    "[CWE-690]",  #
    "[integerOverflow]",  #
    "[invalidFunctionArg]",  #
    "[legacyUninitvar]",  #
    "[negativeIndex]",  #
    "[overlappingWriteFunction]",  #
    "[premium-bughuntingUninit]",
    "[premium-bughuntingUninitNonConstArg]",
    "[syntaxError]",
    "[uninitdata]",  #
    "[uninitStructMember]",  #
    "[uninitvar]",  #
    "[zerodiv]",  #
]

### CWE Classification

Then, we analyzed the words to check to which CWE's they should be classified:

In [4]:
cwe_mapping = {
    20: [  # https://cwe.mitre.org/data/definitions/20.html
        "(CWE-120, CWE-20).",
    ],
    119: [  # https://cwe.mitre.org/data/definitions/119.html
        "[arrayIndexOutOfBounds]",
        "(CWE-119!/CWE-120).",
    ],
    120: [  # https://cwe.mitre.org/data/definitions/120.html
        "(CWE-120, CWE-20).",
        "(CWE-119!/CWE-120).",
    ],
    124: [  # https://cwe.mitre.org/data/definitions/124.html
        "[CWE-124]",
    ],
    126: [  # https://cwe.mitre.org/data/definitions/126.html
        "[CWE-126]",
        "(CWE-126).",
    ],
    127: [  # https://cwe.mitre.org/data/definitions/127.html
        "[CWE-127]",
    ],
    129: [  # https://cwe.mitre.org/data/definitions/129.html
        "[negativeIndex]",
    ],
    190: [  # https://cwe.mitre.org/data/definitions/190.html
        "[integerOverflow]",
        "(CWE-190).",
    ],
    369: [  # https://cwe.mitre.org/data/definitions/369.html
        "[zerodiv]",
    ],
    401: [  # https://cwe.mitre.org/data/definitions/401.html
        "[CWE-401]",
    ],
    457: [  # https://cwe.mitre.org/data/definitions/457.html
        "[uninitvar]",
        "[legacyUninitvar]",
        "[uninitStructMember]",
        "[legacyUninitvar]",
        "[CWE-457]",
    ],
    467: [  # https://cwe.mitre.org/data/definitions/467.html
        "[-Wsizeof-array-argument]",
    ],
    476: [  # https://cwe.mitre.org/data/definitions/476.html
        "[CWE-476]",
    ],
    562: [  # https://cwe.mitre.org/data/definitions/562.html
        "[-Wreturn-local-addr]",
    ],
    590: [  # https://cwe.mitre.org/data/definitions/590.html
        "[CWE-590]",
    ],
    628: [  # https://cwe.mitre.org/data/definitions/628.html
        "[invalidFunctionArg]",
    ],
    686: [  # https://cwe.mitre.org/data/definitions/686.html
        "[-Wimplicit-function-declaration]",
        "[-Wbuiltin-declaration-mismatch]",
    ],
    690: [  # https://cwe.mitre.org/data/definitions/690.html
        "[CWE-690]",
    ],
    704: [  # https://cwe.mitre.org/data/definitions/704.html
        "[-Wint-conversion]",
    ],
    843: [  # https://cwe.mitre.org/data/definitions/843.html
        "[-Wincompatible-pointer-types]",
    ],
    908: [  # https://cwe.mitre.org/data/definitions/908.html
        "[uninitdata]",
    ],
    1260: [  # https://cwe.mitre.org/data/definitions/1260.html
        "[overlappingWriteFunction]",
    ],
}


With the proper words identified, we need to check which words ocurred in which lines of which tool logs:

In [5]:
def count_ocurrences() -> list[dict]:
    code_files = listdir(dataset_dir)
    ocurrences = list()
    for code_file in code_files:
        if (
            code_file.endswith(".cppcheck.txt")
            or code_file.endswith(".flawfinder.txt")
            or code_file.endswith(".gcc.txt")
        ):
            with open(f"{dataset_dir}{code_file}", "r") as open_file:
                wholefile = open_file.read()
                problem_number = code_file.split("-")[1].split(".")[0]
                line_number = 0
                for line in wholefile.splitlines():
                    line_number += 1
                    for word in interesting_words:
                        if word in line:
                            ocurrence = {
                                "tool": code_file.split(".")[1],
                                "token": word,
                                "file": code_file,
                                "problem_number": problem_number,
                                "line_number": line_number,
                            }
                            ocurrences.append(ocurrence)
    return ocurrences

ocurrences = count_ocurrences()
ocurrences_df = pandas.DataFrame(ocurrences)
ocurrences_df.to_csv('word_ocurrences.csv', index=False)
ocurrences_df.head()

Unnamed: 0,tool,token,file,problem_number,line_number
0,cppcheck,[premium-bughuntingUninitNonConstArg],problem-867.cppcheck.txt,867,1
1,cppcheck,[premium-bughuntingUninitNonConstArg],problem-832.cppcheck.txt,832,1
2,cppcheck,[premium-bughuntingUninit],problem-832.cppcheck.txt,832,4
3,cppcheck,[premium-bughuntingUninitNonConstArg],problem-439.cppcheck.txt,439,1
4,cppcheck,[premium-bughuntingUninitNonConstArg],problem-898.cppcheck.txt,898,1


### Dataset building

then we want to build a frequency distribution of the ocurrences by problem, CWE and tool:

In [6]:
def fd_cwes_by_problem(ocurrences: list[dict]) -> list[dict]:
    fd_dict = {}
    for ocurrence in ocurrences:
        for key in cwe_mapping.keys():
            if ocurrence["token"] in cwe_mapping[key]:
                pn = ocurrence["problem_number"]
                tool = ocurrence["tool"]
                if f"{pn}:{key}:{tool}" in fd_dict.keys():
                    fd_dict[f"{pn}:{key}:{tool}"] += 1
                else:
                    fd_dict[f"{pn}:{key}:{tool}"] = 1
    fd = list()
    for key in fd_dict.keys():
        key = str(key)  # for better syntax highlighting
        fd.append(
            {
                "problem": key.split(":")[0],
                "cwe": key.split(":")[1],
                "tool": key.split(":")[2],
                "frequency": fd_dict[key],
            }
        )
    return fd

fd = fd_cwes_by_problem(ocurrences)
fd_df = pandas.DataFrame(fd)
fd_df.to_csv('k-mean-clustring/cwes_by_problem.csv', index=False)
fd_df.head()

Unnamed: 0,problem,cwe,tool,frequency
0,158,686,gcc,2
1,973,126,flawfinder,1
2,900,686,gcc,1
3,674,457,cppcheck,1
4,972,690,gcc,3


Another interesting view is to see CWEs by their pillars

In [7]:
from cwe2.database import Database
from cwe2.weakness import Weakness

def fd_pillars_by_problem(ocurrences: list[dict]) -> list[dict]:
    fd_dict = {}
    for ocurrence in ocurrences:
        for key in cwe_mapping.keys():
            if ocurrence["token"] in cwe_mapping[key]:
                pn = ocurrence["problem_number"]
                tool = ocurrence["tool"]
                if f"{pn}:{key}:{tool}" in fd_dict.keys():
                    fd_dict[f"{pn}:{key}:{tool}"] += 1
                else:
                    fd_dict[f"{pn}:{key}:{tool}"] = 1
    fd = list()
    for key in fd_dict.keys():
        key = str(key)  # for better syntax highlighting
        pillar = get_cwe_pillar(int(key.split(":")[1]))
        fd.append(
            {
                "problem": key.split(":")[0],
                "cwe": pillar,
                "tool": key.split(":")[2],
                "frequency": fd_dict[key],
            }
        )
    return fd

def get_cwe_pillar(cwe: int):
    db = Database()
    current_cwe = db.get(cwe)
    current_cwe_father = find_father(current_cwe)
    while(current_cwe_father):
        current_cwe = db.get(current_cwe_father)
        current_cwe_father = find_father(current_cwe)
    return current_cwe.cwe_id


def find_father(cwe: Weakness):
    for related in cwe.related_weaknesses.split("::"):
        if related == '':
            continue
        related_parts = related.split(":")
        if related_parts[1] == 'ChildOf' and related_parts[5] == '1000':
            return int(related_parts[3])
    return None

pillar_fd = fd_pillars_by_problem(ocurrences)
pillar_fd_df = pandas.DataFrame(pillar_fd)
pillar_fd_df.to_csv('cwes_by_pillar.csv', index=False)
pillar_fd_df.head()

Unnamed: 0,problem,cwe,tool,frequency
0,158,710,gcc,2
1,973,664,flawfinder,1
2,900,710,gcc,1
3,674,664,cppcheck,1
4,972,710,gcc,3


In [8]:
pillar_fd
exploded_fd = list[dict]()
fields_set = set()
for item in pillar_fd:
    new_item = {}
    new_item['problem'] = item['problem']
    for field in item.keys():
        if field not in ('problem', 'frequency'):
            fields_set.add(f'{field}_{item[field]}')
            new_item[f'{field}_{item[field]}'] = item['frequency']
    exploded_fd.append(new_item)
for item in exploded_fd:
    for field in fields_set:
        if field not in item.keys():
            item[field] = 0
exploded_fd_df = pandas.DataFrame(exploded_fd)
exploded_fd_df.to_csv('exploded_cwes_tools.csv', index=False)