<a href="https://colab.research.google.com/github/emanuelebrizzi/bootcamp/blob/main/code_vulnerability_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [18]:
# Mount Google Drive so the dataset folders are reachable from the Colab VM.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Dataset upload**


In [19]:
import os
import glob

# Root folder on the mounted Google Drive that holds the project files
drive_path = "/content/drive/MyDrive/bootcamp_file"

# LibPNG corpus: one sub-folder per class (vulnerable / non-vulnerable functions)
libpng_path = os.path.join(drive_path, "LibPNG")
vuln_path, non_vuln_path = (
    os.path.join(libpng_path, class_dir)
    for class_dir in ("Vulnerable_functions", "Non_vulnerable_functions")
)

In [20]:

def load_files_from_folder(folder, label):
    """Read every ``*.c`` file directly inside ``folder``.

    Args:
        folder: Directory to scan (not recursive).
        label: Value attached unchanged to every file read from this folder.

    Returns:
        A list of ``(source_text, label, file_name)`` tuples, ordered by file
        path. The list is empty when no ``*.c`` file matches (including when
        the folder does not exist).
    """
    data = []
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # makes the record order reproducible from run to run.
    for path in sorted(glob.glob(os.path.join(folder, "*.c"))):
        # errors="ignore" drops bytes that are not valid UTF-8 instead of raising.
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            data.append((f.read(), label, os.path.basename(path)))
    return data


# **Replace Function names and Variable names**

In [21]:
import re
import random
import string
from pathlib import Path

In [22]:
# Matches, in a single left-to-right scan, either a comment or a string /
# character literal. Literals are matched only so that comment markers inside
# them (e.g. "http://host") are skipped instead of being treated as comments.
_COMMENT_OR_LITERAL_RE = re.compile(
    r'//[^\n]*'                 # line comment, up to (not including) the newline
    r'|/\*.*?\*/'               # block comment, possibly spanning several lines
    r'|"(?:\\.|[^"\\\n])*"'     # double-quoted string literal
    r"|'(?:\\.|[^'\\\n])*'",    # character literal
    re.DOTALL,
)


def remove_comments(code):
    """Remove ``//`` and ``/* ... */`` comments from C source text.

    The text is scanned once, so a ``//`` that appears inside a block comment
    or inside a string/character literal is not mistaken for the start of a
    line comment. Comments are replaced by the empty string; literals and the
    newline that ends a line comment are kept.

    Args:
        code: C source code as a string.

    Returns:
        The source text with comments removed.
    """
    def _drop_comment(match):
        token = match.group(0)
        # Both comment forms start with '/'; literals start with a quote.
        return '' if token.startswith('/') else token

    return _COMMENT_OR_LITERAL_RE.sub(_drop_comment, code)

def generate_random_name(length=8):
    """Return a random name made of ``length`` ASCII letters (upper or lower case)."""
    alphabet = string.ascii_letters
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

# Reserved words of C (C89, C99 and C11). They must never be reported as
# identifiers, otherwise the renaming step would overwrite language keywords.
_C_KEYWORDS = frozenset({
    'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do',
    'double', 'else', 'enum', 'extern', 'float', 'for', 'goto', 'if', 'int',
    'long', 'register', 'return', 'short', 'signed', 'sizeof', 'static',
    'struct', 'switch', 'typedef', 'union', 'unsigned', 'void', 'volatile',
    'while',
    'inline', 'restrict', '_Bool', '_Complex', '_Imaginary',
    '_Alignas', '_Alignof', '_Atomic', '_Generic', '_Noreturn',
    '_Static_assert', '_Thread_local',
})


def extract_identifiers(code):
    """Collect the identifier-like words that occur in C source text.

    Every maximal word of the form ``[a-zA-Z_][a-zA-Z0-9_]*`` is collected,
    then C reserved words are removed. The scan is purely lexical: words
    inside string literals or preprocessor lines are included as well.

    Args:
        code: C source code as a string (comments are expected to be removed
            beforehand by the caller).

    Returns:
        A ``set`` of identifier strings with C keywords excluded.
    """
    words = re.findall(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\b', code)
    return set(words) - _C_KEYWORDS

def categorize_identifiers(identifiers, code=None):
    """Map every identifier to a neutral ``var_N`` or ``fun_N`` replacement.

    An identifier is treated as a function when it is immediately followed
    (optionally after whitespace) by ``(`` somewhere in ``code``. The bare
    identifier alone never contains a parenthesis, so without ``code`` every
    identifier is categorized as a variable.

    Identifiers are numbered in sorted order so that the mapping is the same
    on every run, regardless of set iteration order.

    Args:
        identifiers: Iterable of identifier strings.
        code: Optional source text the identifiers were extracted from; used
            only to tell function names from variable names.

    Returns:
        A dict mapping each original identifier to its new name.
    """
    called = set()
    if code is not None:
        called = set(re.findall(r'\b([A-Za-z_][A-Za-z0-9_]*)\s*\(', code))

    var_counter, fun_counter = 1, 1
    replacements = {}
    for identifier in sorted(identifiers):
        if identifier in called:
            replacements[identifier] = f'fun_{fun_counter}'
            fun_counter += 1
        else:
            replacements[identifier] = f'var_{var_counter}'
            var_counter += 1
    return replacements

def replace_identifiers(code, replacements):
    """Replace whole-word occurrences of the mapped names in ``code``.

    All substitutions happen in one pass over the original text, so a new name
    that happens to equal another original name is never replaced a second
    time (e.g. ``{"a": "b", "b": "c"}`` turns ``"a b"`` into ``"b c"``).

    Args:
        code: Source text to rewrite.
        replacements: Dict mapping original names to new names.

    Returns:
        The rewritten source text; ``code`` unchanged when the mapping is empty.
    """
    if not replacements:
        return code
    # Longest names first so that an alternative is never shadowed by a prefix.
    names = sorted(replacements, key=len, reverse=True)
    pattern = re.compile(r'\b(?:' + '|'.join(re.escape(name) for name in names) + r')\b')
    return pattern.sub(lambda match: replacements[match.group(0)], code)

def process_c_file(filepath, new_filepath):
    """Read a C file, strip comments, rename identifiers and save the result.

    Args:
        filepath: Path of the C file to read.
        new_filepath: Path the rewritten source is written to (UTF-8).
    """
    # errors='ignore' skips bytes that are not valid UTF-8 instead of aborting,
    # matching how load_files_from_folder reads the same sources.
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
        code = file.read()

    code_no_comments = remove_comments(code)
    identifiers = extract_identifiers(code_no_comments)
    # Pass the code along so called names can be recognised as functions.
    replacements = categorize_identifiers(identifiers, code_no_comments)
    new_code = replace_identifiers(code_no_comments, replacements)
    with open(new_filepath, 'w', encoding='utf-8') as file:
        file.write(new_code)

def process_directory(directory, new_directory_path):
    """Run ``process_c_file`` on every ``*.c`` file directly inside ``directory``.

    Each rewritten file keeps its original file name and is written into
    ``new_directory_path``.
    """
    c_sources = Path(directory).glob("*.c")
    for source in c_sources:
        destination = os.path.join(new_directory_path, source.name)
        process_c_file(source, destination)

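Execution: rebuild a `modified_files` sub-folder inside each class folder and fill it with the rewritten sources.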

In [23]:
import shutil

# Output folders for the rewritten sources, one inside each class folder
modified_vuln_path = os.path.join(vuln_path, "modified_files")
modified_non_vuln_path = os.path.join(non_vuln_path, "modified_files")

# Recreate each output folder from scratch, then fill it with rewritten files
for source_dir, target_dir in (
    (vuln_path, modified_vuln_path),
    (non_vuln_path, modified_non_vuln_path),
):
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir, exist_ok=True)
    process_directory(source_dir, target_dir)

# **Tokenization**

The source files are tokenized with the CodeBERT tokenizer (`microsoft/codebert-base`).

In [24]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

# Load the tokenizer that ships with the CodeBERT checkpoint
CODEBERT_CHECKPOINT = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(CODEBERT_CHECKPOINT)


Preparing the Dataset

In [27]:
from datasets import Dataset, load_dataset, DatasetDict

def load_data_from_directory(directory, label):
    """Load the ``.c`` files of a directory as labelled records.

    Args:
        directory: Folder to scan (not recursive).
        label: Value stored under ``"label"`` for every file of this folder.

    Returns:
        A list of ``{"text": <file content>, "label": label}`` dicts, ordered
        by file name.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the record
    # order (and therefore any seeded shuffle/split built on it) reproducible.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".c"):  # Ensure only C files are processed
            with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
                data.append({"text": f.read(), "label": label})
    return data

# Read both classes from the rewritten sources: label 1 for the vulnerable
# folder, label 0 for the non-vulnerable folder
vuln_data = load_data_from_directory(modified_vuln_path, 1)
non_vuln_data = load_data_from_directory(modified_non_vuln_path, 0)

# Concatenate the two record lists and wrap them in a Hugging Face Dataset
dataset = Dataset.from_list(vuln_data + non_vuln_data)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize the whole dataset in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split dataset into train & test (80% / 20%).
# A fixed seed makes the shuffle, and therefore the train/test membership,
# identical on every run so results can be compared across runs.
SPLIT_SEED = 42
split_dataset = tokenized_datasets.train_test_split(test_size=0.2, seed=SPLIT_SEED)


Map:   0%|          | 0/622 [00:00<?, ? examples/s]