In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling,AutoModelForSequenceClassification,DataCollatorWithPadding
from datasets import load_dataset, Dataset, DatasetDict
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

# Read the training parquet file into a DataFrame and preview the first rows.
df = pd.read_parquet('data/train-00000-of-00001.parquet')
print(df.head())

# Build the Hugging Face Dataset from the DataFrame, renaming the target
# column from 'label' to 'labels' in the same expression.
dataset = Dataset.from_pandas(df).rename_column('label', 'labels')


                                                text  label
0  while i was busy rejuvenating this old beauty ...      0
1  while baking a cake, always consider the rebuf...      1
2  for(let sportsanalysis = () => { let commencem...      1
3  the former agent, cloistered in shadows, refus...      0
4  oh, look past the unavowed secrecy, let us, wi...      0


In [3]:
# The filter that would drop label 2 is intentionally left disabled, so all
# label values present in the DataFrame are kept:
# df = df[df['label'] != 2]

# Show how many rows carry each label value.
label_counts = df['label'].value_counts()
print(label_counts)

label
0    23414
1    22586
2     4000
Name: count, dtype: int64


In [4]:
# Hold out 10% of the rows (fixed seed for a repeatable split) before any
# tokenization happens.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)

# Re-key the split so the held-out part is called "validation" instead of "test".
dataset_dict = DatasetDict(
    train=train_test_split["train"],
    validation=train_test_split["test"],
)

In [5]:
from transformers import DistilBertTokenizer, DistilBertModel


# Only the tokenizer is loaded here. The bare DistilBertModel that used to be
# loaded into `model` in this cell was never used: the model-setup cell below
# rebinds `model` to the sequence-classification model before anything reads
# it, so loading the headless encoder only cost download time and memory.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [6]:
# The train/validation split (test_size=0.1, seed=42) is already built in the
# earlier split cell; repeating the identical call here only recomputed the
# same `train_test_split` / `dataset_dict`. Instead, verify that the existing
# split accounts for every row of `dataset` exactly once.
n_train_rows = len(dataset_dict["train"])
n_validation_rows = len(dataset_dict["validation"])
assert n_train_rows + n_validation_rows == len(dataset), (
    "train/validation split does not cover the full dataset"
)

In [7]:
# Label vocabulary: the id -> name map is written out once and the reverse
# name -> id map is derived from it, so the two cannot drift apart.
idtolabel = {0: 'benign', 1: 'malicious', 2: 'unknown'}
labeltoid = {name: idx for idx, name in idtolabel.items()}
num_labels = len(idtolabel)

# Tokenizer for the DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Sequence-classification model on the same checkpoint, configured with one
# output per label and with both label maps.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    id2label=idtolabel,
    label2id=labeltoid,
    num_labels=num_labels,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Collator used for classification batches; padding is left to it rather than
# being applied at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Sequences are truncated to at most 512 tokens. No padding argument is
    passed here; padding is handled later by the data collator.
    """
    encoded = tokenizer(examples['text'], truncation=True, max_length=512)
    return encoded


# Apply the tokenizer to both splits, processing examples in batches.
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

Map: 100%|██████████| 45000/45000 [00:22<00:00, 2005.26 examples/s]
Map: 100%|██████████| 5000/5000 [00:02<00:00, 2056.65 examples/s]


In [9]:
# Metrics
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
# The labels take three values (0, 1, 2) and the model has three outputs, so
# the default (binary) ROC AUC configuration cannot score them. Load the
# multiclass configuration, which accepts one score per class per example.
auc_score = evaluate.load("roc_auc", "multiclass")


def compute_metrics(eval_pred):
    """Compute accuracy, weighted precision/recall and one-vs-rest ROC AUC.

    Args:
        eval_pred: a (logits, labels) pair as passed by the Trainer. `logits`
            is indexed as (example, class); `labels` holds the integer class
            id of each example.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'roc_auc'.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits, dtype=np.float64)
    predictions = np.argmax(logits, axis=-1)

    # Multiclass ROC AUC needs per-class probabilities that sum to 1 for each
    # example, so turn the raw logits into a softmax distribution. Subtracting
    # the row maximum first keeps np.exp from overflowing.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(axis=-1, keepdims=True)

    acc = accuracy.compute(predictions=predictions, references=labels)
    prec = precision.compute(predictions=predictions, references=labels, average='weighted')
    rec = recall.compute(predictions=predictions, references=labels, average='weighted')
    # One-vs-rest scoring over all classes, instead of using only the
    # class-1 logit column, which is only meaningful for a two-class problem.
    auc = auc_score.compute(
        prediction_scores=probabilities,
        references=labels,
        multi_class='ovr',
    )
    return {
        'accuracy': acc['accuracy'],
        'precision': prec['precision'],
        'recall': rec['recall'],
        'roc_auc': auc['roc_auc']
    }

In [None]:
# Hyperparameters for the fine-tuning run.
lr = 5e-5
batch_size = 8
num_epochs = 3

training_args = TrainingArguments(
    # Where checkpoints and results are written.
    output_dir="./results",
    # Optimisation settings.
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    # The same batch size is used for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # selected by the "accuracy" metric when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
)

  return torch._C._cuda_getDeviceCount() > 0


: 

In [None]:
# Gather everything the Trainer needs: the model and its training arguments,
# the tokenized train/validation splits, the padding collator and the metric
# callback.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning; the call's return value is left as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate