In [1]:
import transformers
import torch
import numpy as np
import pandas as pd
import random
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from datasets import load_dataset, load_metric
from transformers import Trainer,TrainingArguments,AutoTokenizer,AutoModelForSequenceClassification,DataCollatorForTokenClassification,EarlyStoppingCallback
from datasets import Dataset
import time

In [2]:
model_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
batch_size = 32
wd = 0.01
lr = 2e-5
# epochs = 3
epochs = 1

task = "binary_classification"
label_list = [0,1]
num_labels = 2

In [3]:
def seed_everything(seed:42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
    
# fix seed
seed_everything(42)

In [4]:
df = pd.read_csv('PAYLOAD DATASET !!!!!')
df.head()

Unnamed: 0,payload,label
0,OPTIONS sip:100@27.101.32.15 SIP/2.0Via: SIP/2...,O
1,POST /statistics?clienttype=8&devuid=BDIMXV2%2...,O
2,GET /ad/p/in/v1_0/imp?slotid=stw_incruit11_134...,X
3,d1:rd2:id20:\n..xC...kw....).!e1:t2:$1:y1:re,O
4,d1:rd2:id20: -^.A.8 '.g%c.U$P2:ip4:e.e1:t2:31:...,O


In [5]:
df.isna().sum()

payload    23080
label          0
dtype: int64

In [6]:
df['payload'] = df['payload'].fillna('')

In [7]:
df['label'] = np.where(df['label'] == 'O', 1, 0)
df['label'].value_counts()

1    147613
0     52387
Name: label, dtype: int64

In [9]:
df = df.sample(20000)
df.shape

(20000, 2)

In [10]:
train_data, val_data = train_test_split(df, random_state=42,test_size=.2)

In [11]:
df_train = pd.DataFrame({"payload":train_data["payload"],'label':train_data["label"]})
dataset_train = Dataset.from_pandas(df_train)

In [12]:
df_val = pd.DataFrame({"payload": val_data["payload"],'label':val_data["label"]})
dataset_val = Dataset.from_pandas(df_val)

In [13]:
try:
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
except:
    time.sleep(5)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [14]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [15]:
def tokenize_function(examples):
    return tokenizer(examples["payload"], padding="max_length", max_length = 50,truncation=True)

In [16]:
tokenized_train_datasets = dataset_train.map(tokenize_function, batched=True)
tokenized_val_datasets = dataset_val.map(tokenize_function, batched=True)

100%|██████████| 16/16 [00:01<00:00, 13.14ba/s]
100%|██████████| 4/4 [00:00<00:00, 10.20ba/s]


In [17]:
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate= lr,#2e-5
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    seed = 42,
    weight_decay=wd,load_best_model_at_end=True,)

In [18]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [19]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],   
)

In [20]:
# 자동 GPU 또는 MPS 학습 시작
trainer.train()
# trainer.train(accelerator='mps')

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: payload, __index_level_0__. If payload, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16000
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 500
100%|██████████| 500/500 [06:14<00:00,  1.32it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: payload, __index_level_0__. If payload, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 32


{'loss': 0.0948, 'learning_rate': 0.0, 'epoch': 1.0}


                                                 
100%|██████████| 500/500 [06:44<00:00,  1.32it/s]Saving model checkpoint to test-binary_classification/checkpoint-500
Configuration saved in test-binary_classification/checkpoint-500/config.json


{'eval_loss': 0.0395999476313591, 'eval_accuracy': 0.988, 'eval_f1': 0.9844365596885496, 'eval_precision': 0.9890613298699666, 'eval_recall': 0.9800394263897224, 'eval_runtime': 29.8561, 'eval_samples_per_second': 133.976, 'eval_steps_per_second': 4.187, 'epoch': 1.0}


Model weights saved in test-binary_classification/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-binary_classification/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-binary_classification/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from test-binary_classification/checkpoint-500 (score: 0.0395999476313591).
100%|██████████| 500/500 [06:45<00:00,  1.23it/s]

{'train_runtime': 405.807, 'train_samples_per_second': 39.428, 'train_steps_per_second': 1.232, 'train_loss': 0.09478414916992188, 'epoch': 1.0}





TrainOutput(global_step=500, training_loss=0.09478414916992188, metrics={'train_runtime': 405.807, 'train_samples_per_second': 39.428, 'train_steps_per_second': 1.232, 'train_loss': 0.09478414916992188, 'epoch': 1.0})

In [21]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: payload, __index_level_0__. If payload, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 32
100%|██████████| 125/125 [00:29<00:00,  4.20it/s]


{'eval_loss': 0.0395999476313591,
 'eval_accuracy': 0.988,
 'eval_f1': 0.9844365596885496,
 'eval_precision': 0.9890613298699666,
 'eval_recall': 0.9800394263897224,
 'eval_runtime': 30.0723,
 'eval_samples_per_second': 133.013,
 'eval_steps_per_second': 4.157,
 'epoch': 1.0}

In [24]:
import os
model_save_path = 'MODEL SAVE DIR !!!'
model_dir = os.path.join(model_save_path, 'BERT_transfer_model.pt')
# torch.save(model, model_dir)
# orch.save(model.state_dict(), model_dir)

In [25]:
model_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
num_labels = 2

# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/config.json from cache at /Users/choiwb/.cache/huggingface/transformers/4e60bb8efad3d4b7dc9969bf204947c185166a0a3cf37ddb6f481a876a3777b5.9f8326d0b7697c7fd57366cdde57032f46bc10e37ae81cb7eb564d66d23ec96b
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie

In [26]:
# pytorch 모델 호출
load_model = torch.load(model_dir)
# model.load_state_dict(torch.load(model_dir))
# model.eval()
load_model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [27]:
device = torch.device('mps')
device

device(type='mps')

In [28]:
load_model = load_model.to(device)
load_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [29]:
import scipy as sp

# define a prediction function
def bert_predict(x):
    tv = torch.tensor([tokenizer.encode(v, padding='max_length', max_length=250, truncation=True) for v in x]).to(device)
    outputs = load_model(tv)[0].detach().cpu().numpy()
    scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T
    val = sp.special.logit(scores[:,1]) # use one vs rest logit units

    return val

In [30]:
import shap
pytorch_bert_explainer = shap.Explainer(bert_predict, tokenizer)

In [33]:
sample_df = df.sample(1)
sample_df

Unnamed: 0,payload,label
115588,INVITE sip:100@27.101.31.129 SIP/2.0Via: SIP/2...,1


In [34]:
bert_shap_values = pytorch_bert_explainer(sample_df['payload'], fixed_context=1, batch_size=1)
bert_shap_values

Partition explainer: 2it [00:11, 11.06s/it]               


.values =
array([[-4.73997567e-01,  1.26163151e+00,  7.31256869e-01,
         3.73876240e-01,  1.30341258e-01,  5.40229314e-03,
         1.82537522e-01,  1.29210439e-01,  8.89387227e-02,
         7.51967526e-02,  3.58395673e-02,  4.46739293e-02,
         4.39163215e-03, -1.00243606e-01, -1.21181392e-02,
         6.22927762e-02,  1.12703643e-01,  8.41844386e-02,
        -1.37664968e-02,  6.26426047e-02, -5.81021005e-02,
         5.19759959e-02,  5.36339587e-02,  3.83534736e-02,
         7.93944024e-02,  5.44877672e-02, -2.47699595e-02,
        -2.23929263e-02,  2.77445936e-02,  1.28790488e-01,
         6.61533180e-02,  5.47993485e-02,  6.01020161e-02,
         4.04286686e-02,  5.00111169e-02,  3.72104234e-02,
         1.06747586e-01, -4.74504762e-03,  3.06630678e-02,
         5.22037096e-02,  3.81498999e-03,  2.66206808e-02,
         5.17342634e-02,  3.13011395e-02,  6.01731526e-02,
         8.11338014e-02,  4.04067702e-03,  4.15806360e-02,
         3.48614441e-02,  1.52271973e-02,  4.6

In [35]:
shap.text_plot(bert_shap_values, display = True)