In [29]:
import csv
import json
from pathlib import Path

In [30]:
# Tally CodeQL findings per result file.
#   result_dict: file stem -> {vulnerability name: count, ..., "Total": rows counted}
#   vuln_set:    every distinct vulnerability name seen across all files
directory = Path('codeql-results')
vuln_set = set()
result_dict = dict()
# Sort so the key order of result_dict is reproducible; iterdir() yields
# entries in arbitrary order.
for file in sorted(directory.iterdir()):
    # Skip sub-directories (e.g. .ipynb_checkpoints); open() cannot read them.
    if not file.is_file():
        continue

    vuln_dict = dict()
    total = 0
    with open(file, newline='', encoding='utf-8') as f:
        for row in csv.reader(f):
            # csv.reader yields [] for a blank line; row[0] would raise IndexError.
            if not row:
                continue

            # The first column is used as the vulnerability name.
            vuln_type = row[0].strip()
            vuln_dict[vuln_type] = vuln_dict.get(vuln_type, 0) + 1

            vuln_set.add(vuln_type)
            total += 1

    # "Total" is stored alongside the per-name counts, after all of them.
    vuln_dict["Total"] = total
    result_dict[file.stem] = vuln_dict

In [31]:
# Persist the per-file counts as pretty-printed JSON.
# open(..., "w") does not create missing directories, so make sure analysis/
# exists before writing (a fresh checkout may not have it).
Path("analysis").mkdir(parents=True, exist_ok=True)
with open("analysis/vuln_analysis.json", "w") as f:
    json.dump(result_dict, f, indent=4)

In [32]:
# Display the distinct vulnerability names collected in vuln_set.
vuln_set

{'Bad HTML filtering regexp',
 'Clear-text logging of sensitive information',
 'Cross-site scripting via HTML template escaping bypass',
 'Flask app is run in debug mode',
 'Incomplete URL scheme check',
 'Incomplete URL substring sanitization',
 'Incomplete regular expression for hostnames',
 'Incorrect conversion between integer types',
 'Inefficient regular expression',
 'Information exposure through an exception',
 'Missing regular expression anchor',
 'Overly permissive regular expression range',
 'Potentially unsafe quoting',
 'Reflected cross-site scripting',
 'Suspicious characters in a regular expression',
 'Uncontrolled data used in path expression',
 'Use of a broken or weak cryptographic hashing algorithm on sensitive data',
 'Use of insufficient randomness as the key of a cryptographic algorithm'}

In [33]:
# Known vulnerability names. Every record in the output gets an entry for each
# of these; names a record lacks are appended to it in this order with count 0.
vulnerabilities = ['Bad HTML filtering regexp',
 'Clear-text logging of sensitive information',
 'Cross-site scripting via HTML template escaping bypass',
 'Flask app is run in debug mode',
 'Incomplete URL scheme check',
 'Incomplete URL substring sanitization',
 'Incomplete regular expression for hostnames',
 'Incorrect conversion between integer types',
 'Inefficient regular expression',
 'Information exposure through an exception',
 'Missing regular expression anchor',
 'Overly permissive regular expression range',
 'Potentially unsafe quoting',
 'Reflected cross-site scripting',
 'Suspicious characters in a regular expression',
 'Uncontrolled data used in path expression',
 'Use of a broken or weak cryptographic hashing algorithm on sensitive data',
 'Use of insufficient randomness as the key of a cryptographic algorithm']

# Parse the input completely before opening the output: opening with "w"
# truncates the file, so a failed read must not leave an empty full_vuln.json.
with open("analysis/vuln_analysis.json", "r") as f:
    data = json.load(f)

# Names that occur in the data but are absent from the hard-coded list would
# otherwise not be zero-filled. "Total" is skipped because the cell that writes
# vuln_analysis.json stores the per-file row count under that key, not a
# vulnerability count.
extra_vulnerabilities = sorted(
    {name for counts in data.values() for name in counts}
    - set(vulnerabilities)
    - {"Total"}
)

for k, v in data.items():
    for vuln in vulnerabilities + extra_vulnerabilities:
        # setdefault leaves existing counts (and their key order) untouched.
        v.setdefault(vuln, 0)

with open("analysis/full_vuln.json", "w") as out:
    json.dump(data, out, indent=4)

In [39]:
import pandas as pd
import json

# Read the zero-filled counts back from disk.
with open("analysis/full_vuln.json", "r") as f:
    data = json.load(f)

# orient='index': each top-level JSON key becomes a row label and each inner
# key becomes a column.
train = pd.DataFrame.from_dict(data, orient='index')

In [41]:
# Write the table to analysis/vuln.csv. No index= argument is passed, so the
# DataFrame index is written as the first column (pandas default).
train.to_csv("analysis/vuln.csv")

In [44]:
import warnings

import pandas as pd

# Summarise per-vulnerability counts into per-category counts and save them.
df = pd.read_csv("analysis/vuln.csv")

# Columns that identify one row of the table; everything else is melted.
# NOTE(review): the cells of this notebook that write analysis/vuln.csv do not
# appear to produce these three columns — confirm how they are added to the
# file before this cell is run.
id_columns = ["Model Name", "Temp", "Language"]

# Mapping from vuln -> category
category_map = {
    # Information Disclosure / Data Handling
    "Clear-text logging of sensitive information": "Information Disclosure / Data Handling",
    "Information exposure through an exception": "Information Disclosure / Data Handling",

    # Input Validation & Injection
    "Bad HTML filtering regexp": "Input Validation & Injection",
    "Cross-site scripting via HTML template escaping bypass": "Input Validation & Injection",
    "Reflected cross-site scripting": "Input Validation & Injection",
    "Suspicious characters in a regular expression": "Input Validation & Injection",
    "Incomplete URL scheme check": "Input Validation & Injection",
    "Incomplete URL substring sanitization": "Input Validation & Injection",
    # This name was missing from the map, so its counts were mapped to NaN and
    # silently dropped from the summary.
    # NOTE(review): grouped with the other incomplete URL/regex checks — confirm.
    "Incomplete regular expression for hostnames": "Input Validation & Injection",
    "Missing regular expression anchor": "Input Validation & Injection",
    "Overly permissive regular expression range": "Input Validation & Injection",
    "Uncontrolled data used in path expression": "Input Validation & Injection",

    # Cryptography & Randomness
    "Use of a broken or weak cryptographic hashing algorithm on sensitive data": "Cryptography & Randomness",
    "Use of insufficient randomness as the key of a cryptographic algorithm": "Cryptography & Randomness",

    # Code Quality & Misconfiguration
    "Flask app is run in debug mode": "Code Quality & Misconfiguration",
    "Incorrect conversion between integer types": "Code Quality & Misconfiguration",
    "Inefficient regular expression": "Code Quality & Misconfiguration",
    "Potentially unsafe quoting": "Code Quality & Misconfiguration",
}

# Melt wide table into long format for grouping
df_melted = df.melt(id_vars=id_columns, var_name="Vulnerability", value_name="Count")

# Map each vulnerability to a category (names not in category_map become NaN)
df_melted["Category"] = df_melted["Vulnerability"].map(category_map)

# Rows with a NaN category are dropped by the groupby below. A "Total" column
# is not a vulnerability and is meant to fall out this way; anything else is
# reported so that counts are never lost without notice.
unmapped = sorted(
    set(df_melted.loc[df_melted["Category"].isna(), "Vulnerability"]) - {"Total"}
)
if unmapped:
    warnings.warn(f"Columns without a category are excluded from the summary: {unmapped}")

# Group and sum by model/temp/lang/category
df_grouped = df_melted.groupby(id_columns + ["Category"], as_index=False)["Count"].sum()

# Pivot back to wide format. Each (id, category) pair occurs once after the
# groupby, so aggfunc="sum" returns the value itself; stating it explicitly
# avoids relying on pivot_table's default aggregation (mean) for count data.
df_summary = df_grouped.pivot_table(
    index=id_columns,
    columns="Category",
    values="Count",
    aggfunc="sum",
    fill_value=0,
).reset_index()

# Save to CSV
df_summary.to_csv("vulnerability_categories_summary.csv", index=False)

# Show final summary (must be the last expression of the cell to be displayed)
df_summary.head()