<a href="https://colab.research.google.com/github/emanuelebrizzi/bootcamp/blob/main/code_vulnerability_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [169]:
# Mount Google Drive so the dataset folder is reachable under /content/drive.
# (The earlier manual-upload route via google.colab.files / zipfile is not used.)
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Dataset upload**


In [170]:
import os
import glob

# Root folder of the project data on the mounted Drive
drive_path = "/content/drive/MyDrive/bootcamp_file"

# LibPNG samples live in one sub-folder per class
libpng_path = os.path.join(drive_path, "LibPNG")
vuln_path, non_vuln_path = (
    os.path.join(libpng_path, class_dir)
    for class_dir in ("Vulnerable_functions", "Non_vulnerable_functions")
)

In [171]:

def load_files_from_folder(folder, label):
    """Read every ``*.c`` file directly inside ``folder`` and tag it with ``label``.

    Args:
        folder: Directory to scan (not recursive).
        label: Value attached unchanged to every sample read from the folder.

    Returns:
        A list of ``(source_text, label, file_name)`` tuples, ordered by file
        path so that repeated runs yield the samples in the same order.
    """
    # Escape glob metacharacters in the folder name so a directory such as
    # "set[1]" is matched literally instead of being read as a character class.
    pattern = os.path.join(glob.escape(os.fspath(folder)), "*.c")
    data = []
    # glob returns matches in arbitrary order; sort for reproducibility.
    for file in sorted(glob.glob(pattern)):
        # errors="ignore" drops bytes that are not valid UTF-8 instead of raising.
        with open(file, "r", encoding="utf-8", errors="ignore") as f:
            data.append((f.read(), label, os.path.basename(file)))
    return data


# **Replace Function names and Variable names**

In [172]:
import re
import random
import string
from pathlib import Path

In [173]:
def remove_comments(code):
    """Remove ``//`` and ``/* ... */`` comments from C source text.

    String and character literals are recognised and copied through
    untouched, so comment markers inside them (for example the ``//`` in
    ``"http://host"``) are not mistaken for comments. Line and block comments
    are matched in a single left-to-right scan, so a ``//`` inside a block
    comment can no longer swallow the closing ``*/``.

    Args:
        code: C source text.

    Returns:
        The text with every comment deleted (replaced by the empty string).
        The newline that ends a ``//`` comment is kept.
    """
    scanner = re.compile(
        r'"(?:\\[\s\S]|[^"\\\n])*"'    # string literal (kept)
        r"|'(?:\\[\s\S]|[^'\\\n])*'"   # character literal (kept)
        r'|//[^\n]*'                   # line comment (dropped)
        r'|/\*[\s\S]*?\*/'             # block comment (dropped)
    )

    def _drop_if_comment(match):
        text = match.group(0)
        # Only the two comment alternatives start with '/'.
        return '' if text.startswith('/') else text

    return scanner.sub(_drop_if_comment, code)

def generate_random_name(length=8):
    """Return a random name made of ``length`` ASCII letters (upper or lower case)."""
    alphabet = string.ascii_letters
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

def extract_identifiers(code):
    """Collect the user-defined identifiers that occur in C source text.

    Reserved words are excluded so that a later renaming step does not touch
    them. Text inside string and character literals is ignored, and so are
    preprocessor directive names (``#ifdef``, ``#define``, ...) and whole
    ``#include`` lines; macro names used by the directives are still returned.

    Args:
        code: C source text, expected to be already free of comments.

    Returns:
        A set of identifier names (variables, functions, types, macros).
    """
    keywords = {
        # C89
        'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do',
        'double', 'else', 'enum', 'extern', 'float', 'for', 'goto', 'if', 'int',
        'long', 'register', 'return', 'short', 'signed', 'sizeof', 'static',
        'struct', 'switch', 'typedef', 'union', 'unsigned', 'void', 'volatile',
        'while',
        # C99 / C11
        'inline', 'restrict', '_Bool', '_Complex', '_Imaginary', '_Alignas',
        '_Alignof', '_Atomic', '_Generic', '_Noreturn', '_Static_assert',
        '_Thread_local',
        # preprocessor operator used inside #if expressions
        'defined',
    }

    # Words inside literals are data, not identifiers.
    stripped = re.sub(
        r'"(?:\\[\s\S]|[^"\\\n])*"' r"|'(?:\\[\s\S]|[^'\\\n])*'", ' ', code
    )
    # Directive names must survive renaming; header names are not identifiers.
    stripped = re.sub(
        r'(?m)^[ \t]*#[ \t]*(?:include[^\n]*|[A-Za-z_]\w*)', ' ', stripped
    )

    pattern = re.compile(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\b')
    return set(pattern.findall(stripped)) - keywords

def categorize_identifiers(identifiers, code=None):
    """Map every identifier to a neutral ``var_N`` / ``fun_N`` replacement name.

    Args:
        identifiers: Iterable of identifier names.
        code: Optional source text the identifiers were taken from. A name is
            classified as a function when it occurs in ``code`` directly
            followed by ``(`` (a call or a definition). Without ``code``
            nothing can be recognised as a function and every name receives
            a ``var_N`` replacement.

    Returns:
        A dict ``{original_name: replacement_name}``. Counters start at 1 and
        names are numbered in sorted order.
    """
    # A bare identifier never contains "(", so the call test has to be made
    # against the surrounding source text rather than against the name itself.
    called = set()
    if code:
        called = set(re.findall(r'\b([A-Za-z_][A-Za-z0-9_]*)\s*\(', code))

    var_counter, fun_counter = 1, 1
    replacements = {}
    # Sets of strings iterate in an order that can change between interpreter
    # runs; sorting makes the numbering reproducible.
    for identifier in sorted(identifiers):
        if identifier in called:
            replacements[identifier] = f'fun_{fun_counter}'
            fun_counter += 1
        else:
            replacements[identifier] = f'var_{var_counter}'
            var_counter += 1
    return replacements

def replace_identifiers(code, replacements):
    """Replace whole-word occurrences of each key of ``replacements`` in ``code``.

    All substitutions happen in one pass over the original text, so a name
    that has just been inserted is never renamed again (which would happen
    with one pass per identifier when a new name equals an existing one).

    Args:
        code: Source text.
        replacements: Dict ``{old_name: new_name}``.

    Returns:
        The text with every mapped name substituted.
    """
    if not replacements:
        return code
    # Longest names first so the alternation tries the most specific name first.
    names = sorted(replacements, key=len, reverse=True)
    pattern = re.compile(r'\b(?:' + '|'.join(re.escape(name) for name in names) + r')\b')
    return pattern.sub(lambda match: replacements[match.group(0)], code)

def process_c_file(filepath, new_filepath):
    """Read a C file, strip comments, rename its identifiers and save the result.

    Args:
        filepath: Path of the C file to read.
        new_filepath: Path the transformed source is written to.
    """
    # errors='ignore' matches load_files_from_folder: bytes that are not valid
    # UTF-8 are dropped instead of aborting the run.
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
        code = file.read()

    code_no_comments = remove_comments(code)
    identifiers = extract_identifiers(code_no_comments)
    # The code is passed along so that functions can be told apart from variables.
    replacements = categorize_identifiers(identifiers, code_no_comments)
    new_code = replace_identifiers(code_no_comments, replacements)
    with open(new_filepath, 'w', encoding='utf-8') as file:
        file.write(new_code)

def process_directory(directory, new_directory_path):
    """Run ``process_c_file`` on every ``*.c`` file directly inside ``directory``.

    Args:
        directory: Folder whose C files are processed (not recursive).
        new_directory_path: Folder receiving the processed files, each under
            its original file name. It is created when it does not exist.
    """
    # Create the target folder up front so the first write cannot fail on a
    # missing directory.
    os.makedirs(new_directory_path, exist_ok=True)
    # Path.glob yields entries in arbitrary order; sort for a stable run order.
    for filepath in sorted(Path(directory).glob("*.c")):
        new_filepath = os.path.join(new_directory_path, filepath.name)
        process_c_file(filepath, new_filepath)

Execution:

In [179]:
import shutil

modified_vuln_path = os.path.join(vuln_path, "modified_files")
modified_non_vuln_path = os.path.join(non_vuln_path, "modified_files")

# Same steps for both classes: wipe the previous output folder, recreate it
# empty, then write the renamed sources into it.
for source_dir, output_dir in (
    (vuln_path, modified_vuln_path),
    (non_vuln_path, modified_non_vuln_path),
):
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    process_directory(source_dir, output_dir)

# **Tokenization**

In [183]:
from pygments.lexers import CppLexer
from pygments.token import Token
from pygments import lex

def tokenize_c_code(code):
    """
    Tokenizes C code using Pygments and returns the text of each token.

    Tokens whose type belongs to Token.Text (which covers whitespace) or to
    Token.Comment are left out of the result.
    """
    skipped_families = (Token.Text, Token.Comment)
    return [
        text
        for kind, text in lex(code, CppLexer())
        if not any(kind in family for family in skipped_families)
    ]

['int', 'main', '(', ')', '{', 'int', 'a', '=', '10', ';', 'printf', '(', '"', 'Hello, World!', '"', ')', ';', 'return', '0', ';', '}']


In [None]:

# Example: tokenize a small, self-contained C snippet.
# c_code is defined here so the cell also runs on a fresh kernel.
c_code = 'int main() { int a = 10; printf("Hello, World!"); return 0; }'
tokens = tokenize_c_code(c_code)
print(tokens)