# php 

## imports

In [1]:
import os
import re
import glob
import torch
import numpy as np
import pandas as pd
import hashlib

from pathlib import Path
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, 
    f1_score, classification_report, confusion_matrix
)
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding,
    EarlyStoppingCallback
)
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings('ignore')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


## Config

In [2]:
class Config:
    # Paths
    DATASET_BASE = "../dataset/php_cwe_079_samples"
    GOOD_DIR = os.path.join(DATASET_BASE, "good")  
    BAD_DIR = os.path.join(DATASET_BASE, "bad")    
    OUTPUT_DIR = "./php_cwe79_codebert_output"
    CACHE_DIR = "./cache"
    
    # Model
    MODEL_NAME = "microsoft/codebert-base"
    MAX_LENGTH = 512
    
    # Training
    BATCH_SIZE = 8  
    GRADIENT_ACCUMULATION_STEPS = 8
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 20
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01
    
    # Data splits
    TEST_SIZE = 0.15
    VAL_SIZE = 0.15  # Of remaining training data
    RANDOM_SEED = 42

config = Config()

# Create directories
os.makedirs(config.CACHE_DIR, exist_ok=True)
os.makedirs(config.OUTPUT_DIR, exist_ok=True)

## Data Loading

In [3]:
def clean_php_code(content: str, remove_comments: bool = True) -> str:
    # Remove HTML comment blocks (license header)
    if remove_comments:
        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
    
    # Remove the //flaw marker (label leakage prevention)
    content = re.sub(r'//flaw\s*\n?', '', content)
    
    # Clean up extra whitespace
    content = re.sub(r'\n\s*\n+', '\n\n', content.strip())
    
    return content

def normalize_code(code: str) -> str:
    """Normalize code to detect structural duplicates."""
    # 1. Variables: $name -> $VAR
    # Protect superglobals
    superglobals = ['$_GET', '$_POST', '$_COOKIE', '$_REQUEST', '$_SERVER', '$_FILES', '$GLOBALS']
    for i, sg in enumerate(superglobals):
        code = code.replace(sg, f"__SUPERGLOBAL_{i}__")
    
    # Replace other variables
    code = re.sub(r'\$[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*', '$VAR', code)
    
    # Restore superglobals
    for i, sg in enumerate(superglobals):
        code = code.replace(f"__SUPERGLOBAL_{i}__", sg)

    # 2. Strings: "..." or '...' -> "STRING"
    code = re.sub(r'''(".*?"|'.*?')''', '"STRING"', code)
    
    # 3. Numbers: 123 -> 0
    code = re.sub(r'\b\d+\b', '0', code)
    
    return code


In [4]:
def extract_php_block(content: str) -> str:
    match = re.search(r'<\?php(.*?)\?>', content, re.DOTALL)
    if match:
        return match.group(1).strip()
    return content

In [5]:
def load_dataset(good_dir: str, bad_dir: str, php_only: bool = True) -> pd.DataFrame:
    
    data = []
    
    # Load safe samples (from 'good' directory)
    good_files = glob.glob(os.path.join(good_dir, "*.php"))
    for filepath in tqdm(good_files, desc="Safe samples"):
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            
            cleaned = clean_php_code(content)
            if php_only:
                cleaned = extract_php_block(cleaned)
            
            if cleaned:
                data.append({
                    'filepath': filepath,
                    'code': cleaned,
                    'label': 0,
                    'label_name': 'safe'
                })
        except Exception as e:
            print(f"Error reading {filepath}: {e}")
    
    # Load vulnerable samples (from 'bad' directory)
    bad_files = glob.glob(os.path.join(bad_dir, "*.php"))
    for filepath in tqdm(bad_files, desc="Vulnerable samples"):
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            
            cleaned = clean_php_code(content)
            if php_only:
                cleaned = extract_php_block(cleaned)
            
            if cleaned:
                data.append({
                    'filepath': filepath,
                    'code': cleaned,
                    'label': 1,
                    'label_name': 'vulnerable'
                })
        except Exception as e:
            print(f"Error reading {filepath}: {e}")
    
    df = pd.DataFrame(data)
    # Calculate hashes
    df['code_hash'] = df['code'].apply(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest())
    df['struct_hash'] = df['code'].apply(lambda x: hashlib.md5(normalize_code(x).encode('utf-8')).hexdigest())
    return df



## eda

In [25]:
# Load the NEW combined dataset (with safe patterns supplement)
cache_path = "./cache/php_balanced_realistic.pkl"

if not os.path.exists(cache_path):
    print(f"ERROR: Combined dataset not found at {cache_path}")
    print("Please run: python code/step1_combine_data.py")
    raise FileNotFoundError(f"Required dataset not found: {cache_path}")

print(f"Loading combined dataset from: {cache_path}")
df = pd.read_pickle(cache_path)

# Ensure 'label_name' column exists
if 'label_name' not in df.columns:
    df['label_name'] = df['label'].map({0: 'safe', 1: 'vulnerable'})

Loading combined dataset from: ./cache/php_balanced_realistic.pkl


In [26]:
print("\n" + "="*60)
print("DATASET STATISTICS")
print("="*60)
print(f"Total samples: {len(df):,}")
print(f"\nLabel distribution:")
print(df['label_name'].value_counts())
print(f"\nClass balance ratio: {df['label'].value_counts()[0] / df['label'].value_counts()[1]:.2f}")


DATASET STATISTICS
Total samples: 9,724

Label distribution:
label_name
safe          4862
vulnerable    4862
Name: count, dtype: int64

Class balance ratio: 1.00


In [28]:
df['code_length'] = df['code'].apply(len)
df['token_count_approx'] = df['code'].apply(lambda x: len(x.split()))

print("Code Length Statistics:")
print("-" * 40)
print(df['code_length'].describe())

print("\nApproximate Token Count Statistics:")
print("-" * 40)
print(df['token_count_approx'].describe())

# Show sample code from each class
print("\n" + "="*60)
print("SAMPLE SAFE CODE:")
print("="*60)
safe_sample = df[df['label'] == 0].iloc[0]['code']
print(safe_sample[:500] + "..." if len(safe_sample) > 500 else safe_sample)

print("\n" + "="*60)
print("SAMPLE VULNERABLE CODE:")
print("="*60)
vuln_sample = df[df['label'] == 1].iloc[0]['code']
print(vuln_sample[:500] + "..." if len(vuln_sample) > 500 else vuln_sample)

Code Length Statistics:
----------------------------------------
count    9724.000000
mean      199.005348
std       129.247928
min        21.000000
25%       102.000000
50%       162.000000
75%       258.000000
max       614.000000
Name: code_length, dtype: float64

Approximate Token Count Statistics:
----------------------------------------
count    9724.000000
mean       23.815508
std        14.563140
min         2.000000
25%        12.000000
50%        19.000000
75%        34.000000
max        69.000000
Name: token_count_approx, dtype: float64

SAMPLE SAFE CODE:
$tainted = `cat /tmp/tainted.txt`;
$tainted = (float) $tainted ;
echo "<div id='".  $tainted ."'>content</div>" ;

SAMPLE VULNERABLE CODE:
<?php
$input = $_GET['name'];
$clean = addslashes($input);
echo "<div>" . $clean . "</div>";
?>


## tokenizer

In [29]:
tokenizer = RobertaTokenizer.from_pretrained(config.MODEL_NAME)

In [30]:
sample_size = min(1000, len(df))
sample_df = df.sample(n=sample_size, random_state=config.RANDOM_SEED)

token_lengths = []
for code in tqdm(sample_df['code'], desc="Tokenizing samples"):
    tokens = tokenizer(code, truncation=False, add_special_tokens=True)
    token_lengths.append(len(tokens['input_ids']))

token_lengths = np.array(token_lengths)

Tokenizing samples:   0%|          | 0/1000 [00:00<?, ?it/s]

In [31]:
print("\nToken Length Statistics (sample of 1000):")
print("-" * 40)
print(f"Min: {token_lengths.min()}")
print(f"Max: {token_lengths.max()}")
print(f"Mean: {token_lengths.mean():.1f}")
print(f"Median: {np.median(token_lengths):.1f}")
print(f"95th percentile: {np.percentile(token_lengths, 95):.1f}")
print(f"99th percentile: {np.percentile(token_lengths, 99):.1f}")


Token Length Statistics (sample of 1000):
----------------------------------------
Min: 9
Max: 245
Mean: 77.8
Median: 64.0
95th percentile: 186.0
99th percentile: 214.0


In [32]:
truncation_rate = (token_lengths > config.MAX_LENGTH).mean() * 100
print(f"\nSamples exceeding {config.MAX_LENGTH} tokens: {truncation_rate:.1f}%")


Samples exceeding 512 tokens: 0.0%


## Dataset Class and Data Preparation

In [33]:
import random
import string

def augment_code(code):
    """Randomly rename variables to force model to learn logic."""
    # Find all variables like $tainted, $data, etc.
    variables = set(re.findall(r'\$[a-zA-Z_][a-zA-Z0-9_]*', code))
    
    # Don't rename superglobals
    superglobals = {'$_GET', '$_POST', '$_COOKIE', '$_REQUEST', '$_SERVER', '$_FILES', '$GLOBALS'}
    variables = variables - superglobals
    
    # Create random mapping
    mapping = {}
    for var in variables:
        # Generate random name like $v_x7z
        random_suffix = ''.join(random.choices(string.ascii_lowercase + string.digits, k=4))
        mapping[var] = f"$v_{random_suffix}"
        
    # Apply mapping
    augmented_code = code
    for var, new_name in mapping.items():
        # Replace $var but not $var_2 (boundary check)
        augmented_code = re.sub(re.escape(var) + r'(?![a-zA-Z0-9_])', new_name, augmented_code)
        
    return augmented_code


In [34]:
class CWE79Dataset(Dataset):
    
    def __init__(self, codes, labels, tokenizer, max_length, augment=False):
        self.codes = codes
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment
    
    def __len__(self):
        return len(self.codes)
    
    def __getitem__(self, idx):
        code = str(self.codes[idx])
        label = self.labels[idx]
        
        # Apply augmentation (50% chance if enabled)
        if self.augment and random.random() < 0.5:
            code = augment_code(code)
        
        encoding = self.tokenizer(
            code,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [35]:
# Split dataset

train_val_df, test_df = train_test_split(
    df, test_size=config.TEST_SIZE, random_state=config.RANDOM_SEED, stratify=df['label']
)
train_df, val_df = train_test_split(
    train_val_df, test_size=config.VAL_SIZE, random_state=config.RANDOM_SEED, stratify=train_val_df['label']
)

In [36]:
print(f"\nDataset splits:")
print(f"  Training:   {len(train_df):,} samples")
print(f"  Validation: {len(val_df):,} samples")
print(f"  Test:       {len(test_df):,} samples")



Dataset splits:
  Training:   7,025 samples
  Validation: 1,240 samples
  Test:       1,459 samples


In [37]:
# Create datasets
train_dataset = CWE79Dataset(train_df['code'].values, train_df['label'].values, tokenizer, config.MAX_LENGTH)
val_dataset = CWE79Dataset(val_df['code'].values, val_df['label'].values, tokenizer, config.MAX_LENGTH)
test_dataset = CWE79Dataset(test_df['code'].values, test_df['label'].values, tokenizer, config.MAX_LENGTH)

## Model Initialization

In [38]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [39]:
model = RobertaForSequenceClassification.from_pretrained(
    config.MODEL_NAME, num_labels=2, problem_type="single_label_classification"
)
model = model.to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
freeze_layers = 10

# Freeze embeddings
for param in model.roberta.embeddings.parameters():
    param.requires_grad = False

# Freeze first 10 transformer layers
for i, layer in enumerate(model.roberta.encoder.layer):
    if i < freeze_layers:
        for param in layer.parameters():
            param.requires_grad = False

# Check trainable parameters
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen = total - trainable

print(f"Total parameters:     {total:,}")
print(f"Frozen parameters:    {frozen:,}")
print(f"Trainable parameters: {trainable:,}")
print(f"Training {trainable/total*100:.1f}% of model")

Total parameters:     124,647,170
Frozen parameters:    109,879,296
Trainable parameters: 14,767,874
Training 11.8% of model


## Training Setup

In [42]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision': precision_score(labels, predictions, average='binary'),
        'recall': recall_score(labels, predictions, average='binary'),
        'f1': f1_score(labels, predictions, average='binary'),
    }

# Handle class imbalance
class_counts = train_df['label'].value_counts().sort_index().values
class_weights = torch.tensor([len(train_df) / (2 * c) for c in class_counts], dtype=torch.float).to(device)
print(f"Class weights: {class_weights.tolist()}")

class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
    
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss_fn = torch.nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fn(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    num_train_epochs=config.NUM_EPOCHS,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE * 2,
    gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
    learning_rate=config.LEARNING_RATE,
    warmup_ratio=config.WARMUP_RATIO,
    weight_decay=config.WEIGHT_DECAY,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir=f"{config.OUTPUT_DIR}/logs",
    logging_steps=50,
    report_to=["tensorboard"],
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,
    save_total_limit=3,
    seed=config.RANDOM_SEED,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Class weights: [1.0001423358917236, 0.9998576641082764]


## Training

In [43]:
class_counts = train_df['label'].value_counts().sort_index().values
class_weights = torch.tensor(
    [len(train_df) / (2 * c) for c in class_counts], dtype=torch.float
).to(device)

trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,  
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


train_result = trainer.train()

  0%|          | 0/2180 [00:00<?, ?it/s]

{'loss': 0.7019, 'learning_rate': 4.587155963302753e-06, 'epoch': 0.46}
{'loss': 0.6915, 'learning_rate': 9.174311926605506e-06, 'epoch': 0.91}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.6805028915405273, 'eval_accuracy': 0.5532258064516129, 'eval_precision': 0.5733333333333334, 'eval_recall': 0.4161290322580645, 'eval_f1': 0.4822429906542056, 'eval_runtime': 19.9477, 'eval_samples_per_second': 62.162, 'eval_steps_per_second': 3.91, 'epoch': 0.99}
{'loss': 0.6774, 'learning_rate': 1.3761467889908258e-05, 'epoch': 1.37}
{'loss': 0.5408, 'learning_rate': 1.834862385321101e-05, 'epoch': 1.82}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.35471096634864807, 'eval_accuracy': 0.8362903225806452, 'eval_precision': 0.8183206106870229, 'eval_recall': 0.864516129032258, 'eval_f1': 0.8407843137254902, 'eval_runtime': 20.4696, 'eval_samples_per_second': 60.578, 'eval_steps_per_second': 3.811, 'epoch': 1.99}
{'loss': 0.3461, 'learning_rate': 1.9673802242609582e-05, 'epoch': 2.28}
{'loss': 0.2515, 'learning_rate': 1.9164118246687054e-05, 'epoch': 2.73}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.20694807171821594, 'eval_accuracy': 0.9185483870967742, 'eval_precision': 0.9625668449197861, 'eval_recall': 0.8709677419354839, 'eval_f1': 0.9144792548687553, 'eval_runtime': 31.9108, 'eval_samples_per_second': 38.858, 'eval_steps_per_second': 2.444, 'epoch': 2.99}
{'loss': 0.211, 'learning_rate': 1.865443425076453e-05, 'epoch': 3.19}
{'loss': 0.1931, 'learning_rate': 1.8144750254841998e-05, 'epoch': 3.64}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.16265422105789185, 'eval_accuracy': 0.9314516129032258, 'eval_precision': 0.9768270944741533, 'eval_recall': 0.8838709677419355, 'eval_f1': 0.9280270956816258, 'eval_runtime': 36.5458, 'eval_samples_per_second': 33.93, 'eval_steps_per_second': 2.134, 'epoch': 4.0}
{'loss': 0.1447, 'learning_rate': 1.763506625891947e-05, 'epoch': 4.1}
{'loss': 0.1335, 'learning_rate': 1.7125382262996945e-05, 'epoch': 4.55}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.09464450925588608, 'eval_accuracy': 0.9612903225806452, 'eval_precision': 0.971947194719472, 'eval_recall': 0.95, 'eval_f1': 0.9608482871125612, 'eval_runtime': 35.2367, 'eval_samples_per_second': 35.191, 'eval_steps_per_second': 2.214, 'epoch': 5.0}
{'loss': 0.1333, 'learning_rate': 1.6625891946992865e-05, 'epoch': 5.01}
{'loss': 0.1043, 'learning_rate': 1.6116207951070337e-05, 'epoch': 5.46}
{'loss': 0.1057, 'learning_rate': 1.560652395514781e-05, 'epoch': 5.92}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.09458089619874954, 'eval_accuracy': 0.9548387096774194, 'eval_precision': 0.9763513513513513, 'eval_recall': 0.932258064516129, 'eval_f1': 0.9537953795379538, 'eval_runtime': 36.0845, 'eval_samples_per_second': 34.364, 'eval_steps_per_second': 2.162, 'epoch': 6.0}
{'loss': 0.1046, 'learning_rate': 1.5096839959225283e-05, 'epoch': 6.37}
{'loss': 0.0887, 'learning_rate': 1.4587155963302753e-05, 'epoch': 6.83}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.07421040534973145, 'eval_accuracy': 0.9701612903225807, 'eval_precision': 0.9755301794453507, 'eval_recall': 0.964516129032258, 'eval_f1': 0.9699918896999189, 'eval_runtime': 37.4354, 'eval_samples_per_second': 33.124, 'eval_steps_per_second': 2.084, 'epoch': 7.0}
{'loss': 0.0763, 'learning_rate': 1.4077471967380225e-05, 'epoch': 7.28}
{'loss': 0.0916, 'learning_rate': 1.3567787971457698e-05, 'epoch': 7.74}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.05636864900588989, 'eval_accuracy': 0.9733870967741935, 'eval_precision': 0.9726247987117552, 'eval_recall': 0.9741935483870968, 'eval_f1': 0.9734085414987913, 'eval_runtime': 37.5471, 'eval_samples_per_second': 33.025, 'eval_steps_per_second': 2.077, 'epoch': 8.0}
{'loss': 0.0858, 'learning_rate': 1.305810397553517e-05, 'epoch': 8.19}
{'loss': 0.0716, 'learning_rate': 1.254841997961264e-05, 'epoch': 8.65}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.049785859882831573, 'eval_accuracy': 0.9790322580645161, 'eval_precision': 0.9759615384615384, 'eval_recall': 0.9822580645161291, 'eval_f1': 0.9790996784565916, 'eval_runtime': 39.3971, 'eval_samples_per_second': 31.474, 'eval_steps_per_second': 1.98, 'epoch': 8.99}
{'loss': 0.0686, 'learning_rate': 1.2038735983690114e-05, 'epoch': 9.1}
{'loss': 0.0673, 'learning_rate': 1.1529051987767585e-05, 'epoch': 9.56}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.046214405447244644, 'eval_accuracy': 0.9798387096774194, 'eval_precision': 0.9744816586921851, 'eval_recall': 0.9854838709677419, 'eval_f1': 0.9799518845228549, 'eval_runtime': 29.0256, 'eval_samples_per_second': 42.721, 'eval_steps_per_second': 2.687, 'epoch': 9.99}
{'loss': 0.0577, 'learning_rate': 1.1019367991845057e-05, 'epoch': 10.01}
{'loss': 0.0533, 'learning_rate': 1.0509683995922529e-05, 'epoch': 10.47}
{'loss': 0.0569, 'learning_rate': 1e-05, 'epoch': 10.92}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.039788804948329926, 'eval_accuracy': 0.9846774193548387, 'eval_precision': 0.9823434991974318, 'eval_recall': 0.9870967741935484, 'eval_f1': 0.9847144006436042, 'eval_runtime': 29.3619, 'eval_samples_per_second': 42.232, 'eval_steps_per_second': 2.657, 'epoch': 10.99}
{'loss': 0.0497, 'learning_rate': 9.490316004077473e-06, 'epoch': 11.38}
{'loss': 0.0532, 'learning_rate': 8.980632008154944e-06, 'epoch': 11.83}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.034001607447862625, 'eval_accuracy': 0.9854838709677419, 'eval_precision': 0.9870550161812298, 'eval_recall': 0.9838709677419355, 'eval_f1': 0.9854604200323102, 'eval_runtime': 35.9784, 'eval_samples_per_second': 34.465, 'eval_steps_per_second': 2.168, 'epoch': 12.0}
{'loss': 0.0517, 'learning_rate': 8.470948012232416e-06, 'epoch': 12.29}
{'loss': 0.0382, 'learning_rate': 7.961264016309888e-06, 'epoch': 12.74}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.02894522435963154, 'eval_accuracy': 0.9854838709677419, 'eval_precision': 0.9839228295819936, 'eval_recall': 0.9870967741935484, 'eval_f1': 0.9855072463768116, 'eval_runtime': 37.3704, 'eval_samples_per_second': 33.181, 'eval_steps_per_second': 2.087, 'epoch': 13.0}
{'loss': 0.0333, 'learning_rate': 7.45158002038736e-06, 'epoch': 13.2}
{'loss': 0.0338, 'learning_rate': 6.941896024464833e-06, 'epoch': 13.65}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.028582246974110603, 'eval_accuracy': 0.9854838709677419, 'eval_precision': 0.9854838709677419, 'eval_recall': 0.9854838709677419, 'eval_f1': 0.9854838709677419, 'eval_runtime': 34.3893, 'eval_samples_per_second': 36.058, 'eval_steps_per_second': 2.268, 'epoch': 14.0}
{'loss': 0.0365, 'learning_rate': 6.432212028542304e-06, 'epoch': 14.11}
{'loss': 0.0332, 'learning_rate': 5.922528032619776e-06, 'epoch': 14.56}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.02915828675031662, 'eval_accuracy': 0.9870967741935484, 'eval_precision': 0.9870967741935484, 'eval_recall': 0.9870967741935484, 'eval_f1': 0.9870967741935484, 'eval_runtime': 24.73, 'eval_samples_per_second': 50.141, 'eval_steps_per_second': 3.154, 'epoch': 15.0}
{'loss': 0.0329, 'learning_rate': 5.412844036697248e-06, 'epoch': 15.02}
{'loss': 0.029, 'learning_rate': 4.90316004077472e-06, 'epoch': 15.47}
{'loss': 0.0271, 'learning_rate': 4.393476044852192e-06, 'epoch': 15.93}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.02873540110886097, 'eval_accuracy': 0.9854838709677419, 'eval_precision': 0.9839228295819936, 'eval_recall': 0.9870967741935484, 'eval_f1': 0.9855072463768116, 'eval_runtime': 24.9886, 'eval_samples_per_second': 49.623, 'eval_steps_per_second': 3.121, 'epoch': 16.0}
{'loss': 0.0233, 'learning_rate': 3.8837920489296635e-06, 'epoch': 16.38}
{'loss': 0.0264, 'learning_rate': 3.3741080530071357e-06, 'epoch': 16.84}


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 0.026798125356435776, 'eval_accuracy': 0.9854838709677419, 'eval_precision': 0.9854838709677419, 'eval_recall': 0.9854838709677419, 'eval_f1': 0.9854838709677419, 'eval_runtime': 35.295, 'eval_samples_per_second': 35.132, 'eval_steps_per_second': 2.21, 'epoch': 16.99}
{'train_runtime': 4937.5328, 'train_samples_per_second': 28.456, 'train_steps_per_second': 0.442, 'train_loss': 0.14822749438359895, 'epoch': 16.99}


In [23]:
# Evaluate on test set
print("Evaluating on test set...")

predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = test_df['label'].values

print(f"\nAccuracy:  {accuracy_score(labels, preds):.4f}")
print(f"Precision: {precision_score(labels, preds):.4f}")
print(f"Recall:    {recall_score(labels, preds):.4f}")
print(f"F1-Score:  {f1_score(labels, preds):.4f}")

print("\nClassification Report:")
print(classification_report(labels, preds, target_names=['Safe', 'Vulnerable']))

print("\nConfusion Matrix:")
cm = confusion_matrix(labels, preds)
print(cm)

Evaluating on test set...


  0%|          | 0/104 [00:00<?, ?it/s]


Accuracy:  0.9868
Precision: 0.9862
Recall:    0.9835
F1-Score:  0.9848

Classification Report:
              precision    recall  f1-score   support

        Safe       0.99      0.99      0.99       936
  Vulnerable       0.99      0.98      0.98       726

    accuracy                           0.99      1662
   macro avg       0.99      0.99      0.99      1662
weighted avg       0.99      0.99      0.99      1662


Confusion Matrix:
[[926  10]
 [ 12 714]]


In [24]:
# Already saved at:
print(f"Model saved at: {config.OUTPUT_DIR}/best_model")

# To load later:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

model = RobertaForSequenceClassification.from_pretrained("./cwe79_codebert_output/best_model")
tokenizer = RobertaTokenizer.from_pretrained("./cwe79_codebert_output/best_model")

Model saved at: ./php_cwe79_codebert_output/best_model


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './cwe79_codebert_output/best_model'. Use `repo_type` argument if needed.

# javascript

## imports

In [1]:
import os
import re
import glob
import torch
import random
import hashlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings('ignore')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


## configuration

In [2]:
class Config:
    BASE_DIR = "d:/ai_codebert_fp"
    SAFE_DIR = os.path.join(BASE_DIR, "dataset", "javascript", "CWE_79", "safe")
    UNSAFE_DIR = os.path.join(BASE_DIR, "dataset", "javascript", "CWE_79", "unsafe")
    CACHE_DIR = os.path.join(BASE_DIR, "dataset", "cache")
    OUTPUT_DIR = "./js_xss_codebert_output"
    MODEL_NAME = "microsoft/codebert-base"
    MAX_LENGTH = 512
    BATCH_SIZE = 16
    LEARNING_RATE = 2e-5
    EPOCHS = 5
    SEED = 42
    TEST_SIZE = 0.15
    VAL_SIZE = 0.15

config = Config()
os.makedirs(config.CACHE_DIR, exist_ok=True)
os.makedirs(config.OUTPUT_DIR, exist_ok=True)
random.seed(config.SEED)

## data cleaning


In [3]:
def clean_js_code(content: str) -> str:
    
    # 1. Remove the header comment block (contains "Safe sample" / "Unsafe sample")
    content = re.sub(r'/\*\s*(Safe|Unsafe)\s+sample.*?\*/', '', content, flags=re.DOTALL | re.IGNORECASE)
    
    # 2. Remove MIT License block
    content = re.sub(r'/\*\s*MIT License.*?\*/', '', content, flags=re.DOTALL)
    
    # 3. Remove //flaw marker (CRITICAL - this is a direct label leak!)
    content = re.sub(r'//\s*flaw\s*\n?', '', content)
    
    # 4. Remove //no_sanitizing marker
    content = re.sub(r'//\s*no_sanitizing\s*\n?', '', content)
    
    # 5. Remove any remaining multi-line comments that might contain hints
    # But keep inline comments that might be part of real code
    content = re.sub(r'/\*.*?\*/', '', content, flags=re.DOTALL)
    
    # 6. Clean up excessive whitespace
    content = re.sub(r'\n\s*\n+', '\n\n', content.strip())
    
    return content.strip()

In [4]:
test_code = """/* 
Safe sample
input : reads the field UserData from the variable $_GET
sanitize : cast in float
*/
/*
MIT License
Copyright (c) 2021 MAUREL Héloïse
...
*/
var x = 1;
//flaw
//no_sanitizing
var y = 2;
"""
print("Before cleaning:")
print(test_code[:100] + "...")
print("\nAfter cleaning:")
print(clean_js_code(test_code))

Before cleaning:
/* 
Safe sample
input : reads the field UserData from the variable $_GET
sanitize : cast in float
*/...

After cleaning:
var x = 1;
var y = 2;


## Normalize Code for Deduplication

In [5]:
def normalize_code(code: str) -> str:
    """Normalize code to detect structural duplicates."""
    # Remove comments
    code = re.sub(r'//.*?\n', '\n', code)
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    # Normalize whitespace
    code = re.sub(r'\s+', ' ', code)
    # Normalize variable names
    code = re.sub(r'\b(var|let|const)\s+[a-zA-Z_$][a-zA-Z0-9_$]*', r'\1 VAR', code)
    return code.strip()

## data loading

## Data Loading Functions

In [6]:
def load_dataset(safe_dir, unsafe_dir):
    """Load JavaScript files with proper preprocessing to prevent label leakage."""
    data = []
    
    # Load safe samples
    for filepath in tqdm(glob.glob(os.path.join(safe_dir, "*.js")), desc="Loading safe"):
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                raw_code = f.read()
                # CRITICAL: Clean the code to remove label leaks!
                cleaned_code = clean_js_code(raw_code)
                if cleaned_code and len(cleaned_code) > 20:  # Skip if too short after cleaning
                    data.append({'code': cleaned_code, 'label': 0, 'source': 'original'})
        except Exception as e:
            pass
    
    # Load unsafe samples
    for filepath in tqdm(glob.glob(os.path.join(unsafe_dir, "*.js")), desc="Loading unsafe"):
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                raw_code = f.read()
                # CRITICAL: Clean the code to remove label leaks!
                cleaned_code = clean_js_code(raw_code)
                if cleaned_code and len(cleaned_code) > 20:
                    data.append({'code': cleaned_code, 'label': 1, 'source': 'original'})
        except Exception as e:
            pass
    
    df = pd.DataFrame(data)
    df['struct_hash'] = df['code'].apply(lambda x: hashlib.md5(normalize_code(x).encode()).hexdigest())
    return df


cache_path = "./cache/js_training_data_v3.pkl"

if not os.path.exists(cache_path):
    print(f"ERROR: JS dataset not found at {cache_path}")
    print("Please run: python code/step1_combine_js_data.py")
    raise FileNotFoundError(f"Required dataset not found: {cache_path}")

print(f"Loading JS dataset from: {cache_path}")
df = pd.read_pickle(cache_path)

if 'label_name' not in df.columns:
    df['label_name'] = df['label'].map({0: 'safe', 1: 'vulnerable'})

print(f"\nDataset: {len(df)} | Safe: {len(df[df['label']==0])} | Unsafe: {len(df[df['label']==1])}")


Loading JS dataset from: ./cache/js_training_data_v3.pkl

Dataset: 14031 | Safe: 6957 | Unsafe: 7074


## Train/Val/Test

In [7]:
print(f"Dataset:")
print(f"  Total: {len(df)}")
print(f"  Safe: {len(df[df['label']==0])}")
print(f"  Vulnerable: {len(df[df['label']==1])}")
print(f"\nBy source:")
print(df['source'].value_counts())

# Proper 3-way split
train_val_df, test_df = train_test_split(df, test_size=config.TEST_SIZE, random_state=config.SEED, stratify=df['label'])
train_df, val_df = train_test_split(train_val_df, test_size=config.VAL_SIZE, random_state=config.SEED, stratify=train_val_df['label'])

print(f"\nSplits:")
print(f"  Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

Dataset:
  Total: 14031
  Safe: 6957
  Vulnerable: 7074

By source:
source
original_cwe79         10920
safe_dompurify           221
vuln_dom_xss             200
safe_react               200
safe_dom                 199
vuln_dom_innerhtml       192
safe_textcontent         150
safe_encoding            144
vuln_express_xss         140
vuln_express_query       138
safe_validation          127
vuln_express_body        122
vuln_eval_xss            120
safe_html_escape         118
vuln_url_xss             116
vuln_eval                100
vuln_ajax_xss             99
vuln_attr_xss             99
vuln_location             83
vuln_express_full         79
safe_url_encode           75
vuln_express_params       74
vuln_react_xss            74
vuln_react                71
vuln_template_xss         66
safe_template             55
safe_json                 40
vuln_realistic             9
Name: count, dtype: int64

Splits:
  Train: 10137 | Val: 1789 | Test: 2105


In [8]:
train_val_df, test_df = train_test_split(df, test_size=config.TEST_SIZE, random_state=config.SEED, stratify=df['label'])
train_df, val_df = train_test_split(train_val_df, test_size=config.VAL_SIZE, random_state=config.SEED, stratify=train_val_df['label'])

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

Train: 10137 | Val: 1789 | Test: 2105


## Dataset Class and Tokenizer

In [9]:
tokenizer = RobertaTokenizer.from_pretrained(config.MODEL_NAME)

class XSSDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        code = str(self.data.loc[idx, 'code'])
        label = int(self.data.loc[idx, 'label'])
        enc = self.tokenizer(code, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        return {'input_ids': enc['input_ids'].squeeze(), 'attention_mask': enc['attention_mask'].squeeze(), 'labels': torch.tensor(label)}

train_dataset = XSSDataset(train_df, tokenizer, config.MAX_LENGTH)
val_dataset = XSSDataset(val_df, tokenizer, config.MAX_LENGTH)
test_dataset = XSSDataset(test_df, tokenizer, config.MAX_LENGTH)

print(f"Datasets: Train={len(train_dataset)}, Val={len(val_dataset)}, Test={len(test_dataset)}")

Datasets: Train=10137, Val=1789, Test=2105


## Model with Layer Freezing

In [10]:
model = RobertaForSequenceClassification.from_pretrained(config.MODEL_NAME, num_labels=2)

# Freeze all except last 2 layers + classifier
for param in model.roberta.embeddings.parameters():
    param.requires_grad = False
for layer in model.roberta.encoder.layer[:-2]:
    for param in layer.parameters():
        param.requires_grad = False

model = model.to(device)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable: {trainable:,} / {total:,} ({trainable/total*100:.1f}%)")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable: 14,767,874 / 124,647,170 (11.8%)


## Weighted Trainer for class imbalance

In [11]:
class_counts = train_df['label'].value_counts().sort_index().values
class_weights = torch.tensor([len(train_df) / (2 * c) for c in class_counts], dtype=torch.float).to(device)
print(f"Class weights: {class_weights.tolist()}")

class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
    
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = torch.nn.CrossEntropyLoss(weight=self.class_weights)(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    preds = np.argmax(eval_pred.predictions, axis=1)
    labels = eval_pred.label_ids
    return {'accuracy': accuracy_score(labels, preds), 'precision': precision_score(labels, preds), 
            'recall': recall_score(labels, preds), 'f1': f1_score(labels, preds)}

Class weights: [1.0084559917449951, 0.99168461561203]


## Training Arguments

In [12]:
training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    num_train_epochs=config.EPOCHS,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE * 2,
    gradient_accumulation_steps=2,
    learning_rate=config.LEARNING_RATE,
    warmup_ratio=0.1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,
    save_total_limit=3,
    seed=config.SEED,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Train

In [13]:
trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("Starting training...")
trainer.train()

# Save model and tokenizer
trainer.save_model(f"{config.OUTPUT_DIR}/best_model")
tokenizer.save_pretrained(f"{config.OUTPUT_DIR}/best_model")
print(f"Model saved to {config.OUTPUT_DIR}/best_model")

Starting training...


  0%|          | 0/1585 [00:00<?, ?it/s]

{'loss': 0.6953, 'learning_rate': 6.289308176100629e-06, 'epoch': 0.16}
{'loss': 0.6968, 'learning_rate': 1.2578616352201259e-05, 'epoch': 0.32}
{'loss': 0.6812, 'learning_rate': 1.8867924528301888e-05, 'epoch': 0.47}
{'loss': 0.4466, 'learning_rate': 1.9424964936886398e-05, 'epoch': 0.63}
{'loss': 0.3033, 'learning_rate': 1.8723702664796635e-05, 'epoch': 0.79}
{'loss': 0.2881, 'learning_rate': 1.8022440392706875e-05, 'epoch': 0.95}


  0%|          | 0/56 [00:00<?, ?it/s]

{'eval_loss': 0.3021243214607239, 'eval_accuracy': 0.8714365567356065, 'eval_precision': 0.9772727272727273, 'eval_recall': 0.7627494456762749, 'eval_f1': 0.8567870485678705, 'eval_runtime': 48.3044, 'eval_samples_per_second': 37.036, 'eval_steps_per_second': 1.159, 'epoch': 1.0}
{'loss': 0.2565, 'learning_rate': 1.732117812061711e-05, 'epoch': 1.1}
{'loss': 0.24, 'learning_rate': 1.661991584852735e-05, 'epoch': 1.26}
{'loss': 0.212, 'learning_rate': 1.5918653576437588e-05, 'epoch': 1.42}
{'loss': 0.1922, 'learning_rate': 1.5231416549789621e-05, 'epoch': 1.58}
{'loss': 0.193, 'learning_rate': 1.4530154277699861e-05, 'epoch': 1.74}
{'loss': 0.1525, 'learning_rate': 1.38288920056101e-05, 'epoch': 1.89}


  0%|          | 0/56 [00:00<?, ?it/s]

{'eval_loss': 0.12390203028917313, 'eval_accuracy': 0.9530463946338736, 'eval_precision': 0.9845971563981043, 'eval_recall': 0.9212860310421286, 'eval_f1': 0.9518900343642611, 'eval_runtime': 31.6476, 'eval_samples_per_second': 56.529, 'eval_steps_per_second': 1.769, 'epoch': 2.0}
{'loss': 0.14, 'learning_rate': 1.3127629733520338e-05, 'epoch': 2.05}
{'loss': 0.1464, 'learning_rate': 1.2426367461430575e-05, 'epoch': 2.21}
{'loss': 0.1499, 'learning_rate': 1.1725105189340815e-05, 'epoch': 2.37}
{'loss': 0.1212, 'learning_rate': 1.1023842917251053e-05, 'epoch': 2.52}
{'loss': 0.1438, 'learning_rate': 1.0322580645161291e-05, 'epoch': 2.68}
{'loss': 0.1253, 'learning_rate': 9.62131837307153e-06, 'epoch': 2.84}
{'loss': 0.1444, 'learning_rate': 8.920056100981768e-06, 'epoch': 3.0}


  0%|          | 0/56 [00:00<?, ?it/s]

{'eval_loss': 0.09398985654115677, 'eval_accuracy': 0.9591950810508664, 'eval_precision': 0.9726339794754846, 'eval_recall': 0.9456762749445676, 'eval_f1': 0.9589657110736369, 'eval_runtime': 32.1697, 'eval_samples_per_second': 55.611, 'eval_steps_per_second': 1.741, 'epoch': 3.0}
{'loss': 0.1212, 'learning_rate': 8.218793828892006e-06, 'epoch': 3.15}
{'loss': 0.1439, 'learning_rate': 7.5175315568022445e-06, 'epoch': 3.31}
{'loss': 0.106, 'learning_rate': 6.816269284712484e-06, 'epoch': 3.47}
{'loss': 0.1041, 'learning_rate': 6.115007012622721e-06, 'epoch': 3.63}
{'loss': 0.1233, 'learning_rate': 5.41374474053296e-06, 'epoch': 3.79}
{'loss': 0.119, 'learning_rate': 4.712482468443198e-06, 'epoch': 3.94}


  0%|          | 0/56 [00:00<?, ?it/s]

{'eval_loss': 0.09985590726137161, 'eval_accuracy': 0.9580771380659586, 'eval_precision': 0.9858989424206815, 'eval_recall': 0.9301552106430155, 'eval_f1': 0.9572162007986309, 'eval_runtime': 32.5863, 'eval_samples_per_second': 54.9, 'eval_steps_per_second': 1.719, 'epoch': 4.0}
{'loss': 0.1182, 'learning_rate': 4.011220196353436e-06, 'epoch': 4.1}
{'loss': 0.1245, 'learning_rate': 3.309957924263675e-06, 'epoch': 4.26}
{'loss': 0.1099, 'learning_rate': 2.6086956521739132e-06, 'epoch': 4.42}
{'loss': 0.1029, 'learning_rate': 1.9074333800841516e-06, 'epoch': 4.57}
{'loss': 0.1209, 'learning_rate': 1.2061711079943899e-06, 'epoch': 4.73}
{'loss': 0.111, 'learning_rate': 5.049088359046283e-07, 'epoch': 4.89}


  0%|          | 0/56 [00:00<?, ?it/s]

{'eval_loss': 0.08771800994873047, 'eval_accuracy': 0.961430967020682, 'eval_precision': 0.983739837398374, 'eval_recall': 0.9390243902439024, 'eval_f1': 0.9608621667612025, 'eval_runtime': 32.6596, 'eval_samples_per_second': 54.777, 'eval_steps_per_second': 1.715, 'epoch': 5.0}
{'train_runtime': 1486.5615, 'train_samples_per_second': 34.095, 'train_steps_per_second': 1.066, 'train_loss': 0.21472310433252376, 'epoch': 5.0}
Model saved to ./js_xss_codebert_output/best_model


In [14]:
print("Evaluating on TEST set...")
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = test_df['label'].values

print(f"\nAccuracy:  {accuracy_score(labels, preds):.4f}")
print(f"Precision: {precision_score(labels, preds):.4f}")
print(f"Recall:    {recall_score(labels, preds):.4f}")
print(f"F1-Score:  {f1_score(labels, preds):.4f}")
print("\nClassification Report:")
print(classification_report(labels, preds, target_names=['Safe', 'Vulnerable']))
print("\nConfusion Matrix:")
print(confusion_matrix(labels, preds))

Evaluating on TEST set...


  0%|          | 0/66 [00:00<?, ?it/s]


Accuracy:  0.9468
Precision: 0.9731
Recall:    0.9199
F1-Score:  0.9457

Classification Report:
              precision    recall  f1-score   support

        Safe       0.92      0.97      0.95      1044
  Vulnerable       0.97      0.92      0.95      1061

    accuracy                           0.95      2105
   macro avg       0.95      0.95      0.95      2105
weighted avg       0.95      0.95      0.95      2105


Confusion Matrix:
[[1017   27]
 [  85  976]]
