<a href="https://colab.research.google.com/github/emanuelebrizzi/bootcamp/blob/main/code_vulnerability_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
#from google.colab import files
#import zipfile
#import os

from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# **Dataset upload**


In [None]:
import os
import glob

# Directory path
drive_path = "/content/drive/MyDrive/bootcamp_file"

libpng_path = os.path.join(drive_path, "LibPNG")

vuln_path = os.path.join(libpng_path, "Vulnerable_functions")
non_vuln_path = os.path.join(libpng_path, "Non_vulnerable_functions")

In [None]:

def load_files_from_folder(folder, label):
    files = glob.glob(os.path.join(folder, "*.c"))
    data = []
    for file in files:
        with open(file, "r", encoding="utf-8", errors="ignore") as f:
            data.append((f.read(), label, os.path.basename(file)))
    return data


# **Replace Function names and Variable names**

In [None]:
import re
import random
import string
from pathlib import Path

In [None]:
def remove_comments(code):
    """Rimuove i commenti dal codice C."""
    code = re.sub(r'//.*', '', code)  # Rimuove commenti su singola riga
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)  # Rimuove commenti multi-linea
    return code

def generate_random_name(length=8):
    """Genera un nome casuale per variabili e funzioni."""
    return ''.join(random.choices(string.ascii_letters, k=length))

def extract_identifiers(code):
    """Estrae nomi di variabili e funzioni dal codice C."""
    pattern = re.compile(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\b')
    keywords = set(['int', 'char', 'float', 'double', 'return', 'if', 'else', 'while', 'for', 'do', 'switch', 'case', 'void'])
    identifiers = set(pattern.findall(code)) - keywords
    return identifiers

def categorize_identifiers(identifiers):
    """Classifica gli identificatori come variabili o funzioni."""
    var_counter, fun_counter = 1, 1
    replacements = {}
    for identifier in identifiers:
        if re.search(r'\b[A-Za-z_][A-Za-z0-9_]*\s*\(', identifier):
            replacements[identifier] = f'fun_{fun_counter}'
            fun_counter += 1
        else:
            replacements[identifier] = f'var_{var_counter}'
            var_counter += 1
    return replacements

def replace_identifiers(code, replacements):
    """Sostituisce i nomi di variabili e funzioni con nuovi nomi."""
    for old, new in replacements.items():
        code = re.sub(r'\b' + re.escape(old) + r'\b', new, code)
    return code

def process_c_file(filepath, new_filepath):
    """Legge un file C, sostituisce i nomi e salva il nuovo file."""
    with open(filepath, 'r', encoding='utf-8') as file:
        code = file.read()

    code_no_comments = remove_comments(code)
    identifiers = extract_identifiers(code_no_comments)
    replacements = categorize_identifiers(identifiers)
    new_code = replace_identifiers(code_no_comments, replacements)
    with open(new_filepath, 'w', encoding='utf-8') as file:
        file.write(new_code)

def process_directory(directory, new_directory_path):
    """Processa tutti i file C in una directory."""
    for filepath in Path(directory).glob("*.c"):
        new_filepath = os.path.join(new_directory_path, os.path.basename(filepath))
        process_c_file(filepath, new_filepath)

Execution:

In [None]:
import shutil

modified_vuln_path = os.path.join(vuln_path, "modified_files")

if os.path.exists(modified_vuln_path):
  shutil.rmtree(modified_vuln_path)
os.makedirs(modified_vuln_path, exist_ok=True)
process_directory(vuln_path, modified_vuln_path)


modified_non_vuln_path = os.path.join(non_vuln_path, "modified_files")

if os.path.exists(modified_non_vuln_path):
  shutil.rmtree(modified_non_vuln_path)
os.makedirs(modified_non_vuln_path, exist_ok=True)
process_directory(non_vuln_path, modified_non_vuln_path)

# **Tokenization**

Using CodeBERT tokenizer

In [None]:
!pip install transformers datasets torch scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from datasets import Dataset, load_dataset, DatasetDict
import torch


# Load CodeBERT tokenizer
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

Preparing the Dataset

In [None]:
def load_data_from_directory(directory, label):
    """Load C files from a directory and assign labels."""
    data = []
    for filename in os.listdir(directory):
        if filename.endswith(".c"):  # Ensure only C files are processed
            with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
                data.append({"text": f.read(), "label": label})
    return data

# Load data
vuln_data = load_data_from_directory(modified_vuln_path, 1)
non_vuln_data = load_data_from_directory(modified_non_vuln_path, 0)

# Combine datasets
dataset = vuln_data + non_vuln_data

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(dataset)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split dataset into train & test
split_dataset = tokenized_datasets.train_test_split(test_size=0.2)


Map:   0%|          | 0/622 [00:00<?, ? examples/s]

In [13]:
from transformers import TrainingArguments, Trainer

# Load CodeBERT model for classification
model = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:


Abort: 

In [None]:
# Evaluate model
metrics = trainer.evaluate()
print(metrics)

# Save model & tokenizer
model.save_pretrained("fine_tuned_codebert")
tokenizer.save_pretrained("fine_tuned_codebert")
