**Install libraries**

In [1]:
%%capture
!pip install transformers datasets

**Import libraries**

In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import torch

**Import dataset**

In [16]:
os.environ["WANDB_DISABLED"] = "true"

df = pd.read_csv("/content/balanced_urls.csv")
df = df.dropna()

benign_count = df[df['result'] == 0].shape[0]
malicious_count = df[df['result'] == 1].shape[0]
print(f"Numero di url sicure: {benign_count}")
print(f"Numero di url non sicure: {malicious_count}")

Numero di url sicure: 316254
Numero di url non sicure: 316254


**Test & training set**

In [17]:
sample_size = 2000

df_benign = df[df['result'] == 0].sample(n=sample_size // 2, random_state=42)
df_malicious = df[df['result'] == 1].sample(n=sample_size // 2, random_state=42)
df_sampled = pd.concat([df_benign, df_malicious]).sample(frac=1, random_state=42)

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_sampled['url'], df_sampled['result'], test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({'text': train_texts.tolist(), 'label': train_labels.tolist()})
test_dataset = Dataset.from_dict({'text': test_texts.tolist(), 'label': test_labels.tolist()})
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

**Tokenizer**

In [18]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

**Training model**

In [19]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.214302,0.97,1.0,0.940887,0.969543
2,No log,0.035261,0.995,0.995074,0.995074,0.995074
3,0.092000,0.031781,0.995,0.995074,0.995074,0.995074
4,0.092000,0.050786,0.99,1.0,0.980296,0.99005
5,0.038800,0.018264,0.9975,1.0,0.995074,0.997531
6,0.038800,0.01689,0.9975,1.0,0.995074,0.997531
7,0.038800,0.016754,0.9975,1.0,0.995074,0.997531
8,0.019600,0.02093,0.9975,1.0,0.995074,0.997531
9,0.019600,0.021938,0.9975,1.0,0.995074,0.997531
10,0.001800,0.021828,0.9975,1.0,0.995074,0.997531


TrainOutput(global_step=2000, training_loss=0.03806581813097, metrics={'train_runtime': 1772.8674, 'train_samples_per_second': 9.025, 'train_steps_per_second': 1.128, 'total_flos': 4209776885760000.0, 'train_loss': 0.03806581813097, 'epoch': 10.0})

**Evaluating model**

In [20]:
results = trainer.evaluate()
print("Valutazione del modello:", results)

Valutazione del modello: {'eval_loss': 0.02182803861796856, 'eval_accuracy': 0.9975, 'eval_precision': 1.0, 'eval_recall': 0.9950738916256158, 'eval_f1': 0.9975308641975309, 'eval_runtime': 12.2543, 'eval_samples_per_second': 32.642, 'eval_steps_per_second': 4.08, 'epoch': 10.0}
