In [1]:
import json
import os
import sys
from datetime import datetime
import pandas as pd
import torch
from torch import BoolTensor
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, EarlyStoppingCallback
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



DEVICE_DEFAULT = 'cuda'


def get_ts():
    return datetime.utcnow().replace(microsecond=0).isoformat()
# end


class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)
    # end


# end

def read_passages(path_data, path_label, test_size=0):
    df = pd.read_csv(path_data)

    documents = df['processed'].to_list()
    labels_str = df['target'].to_list()

    samples = documents

    with open(path_label, 'r') as file:
        labels_list = sorted(json.load(file))
    # end

    labels_all = {l: idx for idx, l in enumerate(labels_list)}

    labels = [labels_all[label_str] for label_str in labels_str]

    if test_size > 0:
        return train_test_split(samples, labels, test_size=test_size, stratify=labels, random_state=234), labels_list
    else:
        return (samples, samples, labels, labels), labels_list
    # end


# end


def compute_metrics(pred):
    labels = pred.label_ids.reshape(-1)
    preds = pred.predictions.argmax(-1).reshape(-1)

    accuracy = accuracy_score(y_true=labels, y_pred=preds)
    precision = precision_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')
    recall = recall_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')
    f1 = f1_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


# end

def predict_plus(input_tokenized, model):
    masks_sample = input_tokenized.attention_mask

    indicates_sample = BoolTensor(masks_sample == 1)
    indicates_sample = indicates_sample.to(DEVICE_DEFAULT)

    out = model(**input_tokenized.to(DEVICE_DEFAULT), output_attentions=True)

    logits = out.logits.cpu()
    attentions = out.attentions[-1].cpu()

    attentions_sum = torch.masked_select((torch.sum(attentions[:, :, 0, :], 1) / attentions.shape[1])[0],
                                         indicates_sample).tolist()[1:-1]

    return logits, attentions_sum


# end

def main_train(path_train, path_label, model_name, max_length, output_dir):
    print('[{}] start main_train_and_evaluate with {}'.format(get_ts(), path_train))

    (train_samples, valid_samples, train_labels, valid_labels), target_names = read_passages(path_train, path_label, 0.1)

    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
    train_encodings = tokenizer.batch_encode_plus(train_samples, truncation=True, padding=True, max_length=max_length,
                                                  return_tensors='pt')
    valid_encodings = tokenizer.batch_encode_plus(valid_samples, truncation=True, padding=True, max_length=max_length,
                                                  return_tensors='pt')

    train_dataset = SimpleDataset(train_encodings, train_labels)
    valid_dataset = SimpleDataset(valid_encodings, valid_labels)

    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))

    training_args = TrainingArguments(
        output_dir=output_dir,  # output directory
        num_train_epochs=20,  # total number of training epochs
        per_device_train_batch_size=8,  # batch size per device during training
        per_device_eval_batch_size=8,  # batch size for evaluation
        warmup_steps=0,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        load_best_model_at_end=True,
        # load the best model when finished training (default metric is loss)    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
        logging_steps=1,  # log & save weights each logging_steps
        evaluation_strategy="epoch",  # evaluate each `logging_steps`
        learning_rate=2e-5,
        save_strategy='epoch',
        save_total_limit=6,
        metric_for_best_model='f1'
    )

    trainer = Trainer(
        model=model,  # the instantiated Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=valid_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,  # the callback that computes metrics of interest
        callbacks=[EarlyStoppingCallback(early_stopping_patience=6)]
    )

    print('[{}] start training...'.format(get_ts()))
    trainer.train()

    return model, tokenizer
# end

model_name = 'distilbert-base-uncased'
max_length = 512

folder_data = 'data'
filename_data = '202205240000.csv'
filename_label = 'labels.json'
output_dir = 'results'

path_file_data = os.path.join(folder_data, filename_data)
path_file_label = os.path.join(folder_data, filename_label)

model, tokenizer = main_train(path_file_data, path_file_label, model_name, max_length, output_dir)

[2023-07-26T09:56:07] start main_train_and_evaluate with data/202205240000.csv


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

[2023-07-26T09:56:15] start training...


  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2485,0.579775,0.854962,0.886217,0.856725,0.833816
2,0.0677,0.215666,0.946565,0.946578,0.947368,0.945013
3,0.0143,0.152964,0.969466,0.970635,0.969925,0.969299
4,0.0065,0.161289,0.969466,0.970635,0.969925,0.969299
5,0.0035,0.099191,0.984733,0.985714,0.984962,0.984759
6,0.0024,0.140543,0.969466,0.970635,0.969925,0.969299
7,0.0011,0.108569,0.977099,0.978571,0.977444,0.977021
8,0.0018,0.153244,0.977099,0.978195,0.977444,0.977026
9,0.0019,0.148472,0.969466,0.969841,0.969925,0.969315
10,0.0013,0.134805,0.977099,0.978195,0.977444,0.977026


***** Running Evaluation *****
  Num examples = 131
  Batch size = 8
Saving model checkpoint to results/checkpoint-147
Configuration saved in results/checkpoint-147/config.json
Model weights saved in results/checkpoint-147/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-735] due to args.save_total_limit
  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 131
  Batch size = 8
Saving model checkpoint to results/checkpoint-294
Configuration saved in results/checkpoint-294/config.json
Model weights saved in results/checkpoint-294/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1029] due to args.save_total_limit
  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 131
  Batch size = 8
Saving model checkpoint to results/checkpoint-441
Configuration saved in results/checkpoint-441/config.json
Model weights saved in results/checkpoin

In [2]:
import torch
from typing import Optional, Tuple
from types import MethodType
import math
from torch import nn
from torch import BoolTensor


def replace_model_function(model):

    def forward_version_jinyuj(
            self,
            query: torch.Tensor,
            key: torch.Tensor,
            value: torch.Tensor,
            mask: torch.Tensor,
            head_mask: Optional[torch.Tensor] = None,
            output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, ...]:
        """
        Parameters:
            query: torch.tensor(bs, seq_length, dim)
            key: torch.tensor(bs, seq_length, dim)
            value: torch.tensor(bs, seq_length, dim)
            mask: torch.tensor(bs, seq_length)

        Returns:
            weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs,
            seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
        """

        # print('[ jinyuj] run jinyuj version of forward')
        bs, q_length, dim = query.size()
        k_length = key.size(1)
        # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured'
        # assert key.size() == value.size()

        dim_per_head = self.dim // self.n_heads

        mask_reshp = (bs, 1, 1, k_length)

        def shape(x: torch.Tensor) -> torch.Tensor:
            """separate heads"""
            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)

        def unshape(x: torch.Tensor) -> torch.Tensor:
            """group heads"""
            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)

        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)

        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, q_length, k_length)
        mask = (mask == 0).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)
        scores = scores.masked_fill(
            mask, torch.tensor(torch.finfo(scores.dtype).min)
        )  # (bs, n_heads, q_length, k_length)

        weights = nn.functional.softmax(scores, dim=-1)  # (bs, n_heads, q_length, k_length)
        # weights = self.dropout(weights)  # (bs, n_heads, q_length, k_length)

        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask

        context = torch.matmul(weights, v)  # (bs, n_heads, q_length, dim_per_head)
        context = unshape(context)  # (bs, q_length, dim)

        context = self.out_lin(context)  # (bs, q_length, dim)
        
        self.my_param = {'w_pt': weights, 'v_pt': v, 'n_heads': self.n_heads, 'dim_hidden': dim_per_head, 'bs': bs, 'x_pt': query, 'out_pt': context}
        

        if output_attentions:
            return (context, weights)
        else:
            return (context,)
        # end
    # end

    target = model.distilbert.transformer.layer[-1].attention
    target.forward = MethodType(forward_version_jinyuj, target)
    return target
# end

def predict_plus(sample_test, tokenizer, model, target, max_length=512):
    input_tokenized = tokenizer.encode_plus(sample_test, padding=True, truncation=True, max_length=max_length,
                                        return_tensors='pt')

    masks_sample = input_tokenized.attention_mask
    indicates_sample = BoolTensor(masks_sample == 1)
    indicates_sample = indicates_sample.to(DEVICE_DEFAULT)

    with torch.no_grad():
        out = model(**input_tokenized.to(DEVICE_DEFAULT), output_attentions=True)
    # end

    logits = out.logits.cpu()
    attentions = out.attentions[-1].cpu()

    attentions_sum = torch.masked_select((torch.sum(attentions[:, :, 0, :], 1) / attentions.shape[1])[0],
                        indicates_sample).tolist()

    my_param = target.my_param

    return my_param, logits, attentions_sum()
# end

In [3]:
import numpy as np
from numpy.linalg import norm

def cosine(v_1, v_2):
    return np.dot(v_1, v_2) / (norm(v_1) * norm(v_2))
# end

def unshape(x, n_heads, dim_hidden, bs):
    return x.transpose(0, 2, 1, 3).reshape(bs, -1, n_heads * dim_hidden)
# end

def unshape_pt(x, n_heads, dim_hidden, bs):
    return x.transpose(1, 2).contiguous().view(bs, -1, n_heads * dim_hidden)
# end

def investigation(w_pt=None, v_pt=None, n_heads=None, dim_hidden=None, bs=None, **kwargs):    # w=(bs, n_heads, seq_q, seq_k), v=(bs, n_heads, seq_k, dim_hidden)
    
    w = w_pt.detach().clone().cpu().numpy()
    v = v_pt.detach().clone().cpu().numpy()
    
    embedding_cls = np.matmul(w[:,:,0,:], v)    # (bs, n_head, 1, dim_hidden)

    embedding_cls_unshape = unshape(embedding_cls, n_heads, dim_hidden, bs) # (bs, 1, dim)
    v_unshape = unshape(v, n_heads, dim_hidden, bs) # (bs, seq_k, dim*n_heads)

    # calculate original cosine
    list_cosine_cls_v = []
    for i_bs in range(bs):
        bs_cosine_cls_v = []

        for i in range(v_unshape.shape[1]):
            bs_cosine_cls_v.append(cosine(embedding_cls_unshape[i_bs, 0, :], v_unshape[i_bs, i, :]))
        # end

        list_cosine_cls_v.append(bs_cosine_cls_v)
    # end


    # calculate impact cosine
    list_cosine_cls_cls_delta = []
    list_cls_delta_unshape = []

    for i in range(v.shape[2]):
        v_current = v.copy()
        v_current[:,:,i,:] = 0
        embedding_cls_delta = np.matmul(w[:, :, 0, :], v_current)    # (bs, n_heads, 1, seq_k) x (bs, n_head, seq_k, dim_hidden)
        embedding_cls_delta_unshape = unshape(embedding_cls_delta, n_heads, dim_hidden, bs)    # (bs, 1, dim*n_heads)

        bs_cosine_cls_cls_delta = []
        for i_bs in range(bs):
            bs_cosine_cls_cls_delta.append(cosine(embedding_cls_unshape[i_bs, 0, :], embedding_cls_delta_unshape[i_bs, 0, :]))
            list_cls_delta_unshape.append(embedding_cls_delta_unshape[i_bs, 0, :])
        # end

        list_cosine_cls_cls_delta.append(bs_cosine_cls_cls_delta)
    # end

    return list_cosine_cls_v, list_cosine_cls_cls_delta, embedding_cls_unshape[i_bs, 0, :], list_cls_delta_unshape
# end

In [98]:
from scipy.special import softmax
labels = ["product", "testcase", "testbed", "usererror", "targetvm", "nimbus", "infra"]
# sample_test = 'timestamp failed at play check os full name timestamp task verify guest full name in guest info is expected fatal localhost failed guest full name in guest info microsoft windows server number number b it is not the same as expected one'
sample_test = 'failed guest full name'
sample_tokenized = tokenizer.encode_plus(sample_test, padding=True, truncation=True, max_length=max_length,
                                        return_tensors='pt')

target = replace_model_function(model)
out = model(**sample_tokenized.to(DEVICE_DEFAULT))
preds = softmax(out.logits.detach().cpu().numpy().squeeze(0))
my_param = target.my_param

In [99]:
list_cosine_cls_v, list_cosine_cls_cls_delta, embedding_cls_unshape, list_cls_delta_unshape = investigation(**my_param)

In [100]:
tokens = [tokenizer.decode(i) for i in sample_tokenized.input_ids[0]]
attention = my_param['w_pt'].clone().detach().cpu().numpy().mean(1).squeeze(0)[0,:].tolist()

In [101]:
[(a,b) for a,b in zip(labels, preds)]

[('product', 0.02952733),
 ('testcase', 0.04065845),
 ('testbed', 0.09369221),
 ('usererror', 0.12102889),
 ('targetvm', 0.024749966),
 ('nimbus', 0.6562893),
 ('infra', 0.034053694)]

In [102]:
# sorted([(a, b[0], c,d) for a, b, c,d in zip(tokens, list_cosine_cls_cls_delta, attention, list_cosine_cls_v[0])], key=lambda item: -item[1])
# [(a, b[0], c, d, c * d) for a, b, c,d in zip(tokens, list_cosine_cls_cls_delta, attention, list_cosine_cls_v[0])]
[(a, b[0], c, d) for a, b, c,d in zip(tokens, list_cosine_cls_cls_delta, attention, list_cosine_cls_v[0])]

[('[CLS]', 0.97704524, 0.05038498342037201, 0.64797926),
 ('failed', 0.9008743, 0.30575433373451233, 0.85002106),
 ('guest', 0.98406416, 0.09968140721321106, 0.8043557),
 ('full', 0.9987719, 0.04141746088862419, 0.69122773),
 ('name', 0.9926224, 0.09466409683227539, 0.7638469),
 ('[SEP]', 0.99575186, 0.4080977439880371, 0.39237162)]

In [103]:
# embedding_cls_unshape - list_cls_delta_unshape[10]

In [104]:
# embedding_cls_unshape - embedding_cls_unshape

In [105]:
def myforward(model, x_pt, out_pt):
    x_pt = x_pt.detach().clone()
    out_pt = out_pt.detach().clone()
    
    a = model.distilbert.transformer.layer[-1]
    a_attention = a.attention
    a_norm = a.sa_layer_norm
    a_ffn = a.ffn
    a_out = a.output_layer_norm

    b = model.pre_classifier
    c = model.dropout
    d = model.classifier

    sa_output = a_norm(out_pt + x_pt)
    ffn_output = a_ffn(sa_output)
    ffn_output = a_out(ffn_output + sa_output)
    out_a = ffn_output
    pooled_output = out_a[:,0]
    logits_jinyuj = d(c(nn.ReLU()(b(pooled_output))))
    preds_jinyuj = softmax(logits_jinyuj.detach().cpu().numpy().squeeze(0))

    return preds_jinyuj
# end

In [106]:
def my_outpt(target, w_pt, v_pt, n_heads, dim_hidden, bs, index_v_tuning=None):
    w_pt = w_pt.detach().clone()    # (bs, n_heads, q_length, k_length)
    v_pt = v_pt.detach().clone()    # (bs, n_hreads, k_length, dim_hidden)

    if index_v_tuning:
        v_pt[:,:,index_v_tuning,:] = 0
    # end

    z_pt = torch.matmul(w_pt, v_pt)  # (bs, n_heads, q_length, dim_hidden)
    z_pt_unshape = unshape_pt(z_pt, n_heads, dim_hidden, bs)  # (bs, q_length, dim_hidden * n_heads)

    out_pt = target.out_lin(z_pt_unshape)  # (bs, q_length, dim)
    return out_pt
# end

In [107]:
impacts = []
diffs = []

# myout_pt = my_outpt(target, my_param['w_pt'], my_param['v_pt'], my_param['n_heads'], my_param['dim_hidden'], my_param['bs'])
for i in range(my_param['v_pt'].shape[2]):
    myout_pt = my_outpt(target, my_param['w_pt'], my_param['v_pt'], my_param['n_heads'], my_param['dim_hidden'], my_param['bs'], i)
    preds_jinyuj = myforward(model, my_param['x_pt'], myout_pt)
    impacts.append(np.mean(abs(preds_jinyuj - preds)))
    diffs.append((preds - preds_jinyuj).tolist())
# end

In [108]:
labels

['product', 'testcase', 'testbed', 'usererror', 'targetvm', 'nimbus', 'infra']

In [109]:
# [('id', 'token', 'delta', 'impact', 'attention', 'cosine')] + [(i, a, b[0], c, d, e) for i, a, b, c,d, e in zip(range(len(tokens)), tokens, list_cosine_cls_cls_delta, impacts, attention, list_cosine_cls_v[0])]
# sorted([(i, a, b[0], c, d, e) for i, a, b, c,d, e in zip(range(len(tokens)), tokens, list_cosine_cls_cls_delta, impacts, attention, list_cosine_cls_v[0])], key=lambda item: -item[3])

diffs_str = []
for diff in diffs:
    diffs_str.append(['{:.8f}'.format(i) for i in diff])
# end

['#{}: {} <{}>'.format(a,b,','.join(c)) for a,b,c in zip(range(len(tokens)), tokens, diffs_str)]

['#0: [CLS] <0.00000000,0.00000000,0.00000000,0.00000000,0.00000000,0.00000000,0.00000000>',
 '#1: failed <-0.05385577,-0.04783045,-0.07102384,-0.03167168,-0.06064560,0.29308793,-0.02806073>',
 '#2: guest <-0.01016757,-0.01674532,-0.02724136,0.00564633,-0.01159035,0.06952024,-0.00942210>',
 '#3: full <-0.00424204,-0.00539367,-0.01261340,0.00117198,-0.00368822,0.02828062,-0.00351550>',
 '#4: name <-0.00781824,-0.01307989,-0.01972469,-0.00422693,-0.01221539,0.06655574,-0.00949069>',
 '#5: [SEP] <-0.00242660,-0.00338863,-0.00767665,0.00192922,-0.00222145,0.01385921,-0.00007525>']

In [110]:
from copy import copy

def predict_and_investigate_logits(sample_test, tokenizer, model):
    elements_sample_test = sample_test.split()
    preds_all = []
    tokens_removed = []
    
    for i in range(len(elements_sample_test)):
        sample_current = copy(elements_sample_test)
        tokens_removed.append(sample_current.pop(i))
        
        # print(' '.join(sample_current))
        input_tokenized = tokenizer.encode_plus(' '.join(sample_current), padding=True, truncation=True, max_length=max_length,
                                        return_tensors='pt')

        with torch.no_grad():
            out = model(**input_tokenized.to(DEVICE_DEFAULT), output_attentions=True)
        # end

        logits = out.logits.cpu()
        preds = softmax(logits.numpy().squeeze(0))
        preds_all.append(preds)
    # end
    
    return preds_all, tokens_removed
# end

In [111]:
# sample_test = 'timestamp failed at play check quiesce snapshot timestamp task check specified file status until it exists in windows guest fatal localhost ip address failed'
sample_test = 'failed guest full name'
preds_all ,tokens_removed = predict_and_investigate_logits(sample_test, tokenizer, model)
results = [np.argmax(pred_all) for pred_all in preds_all]
confidences = [pred_all[np.argmax(pred_all)] for pred_all in preds_all]
[(a,b,c) for a,b,c in zip(tokens_removed, results, confidences)]

[('failed', 5, 0.42458412),
 ('guest', 5, 0.48809105),
 ('full', 5, 0.58696437),
 ('name', 5, 0.73717934)]

In [73]:
# path_train = 'data/202205240000.csv'
# path_label = 'data/labels.json'
# (train_samples, _, train_labels, _), target_names = read_passages(path_train, path_label)

In [39]:
# samples_weird = []

# for train_sample in train_samples:
#     preds_all, tokens_removed = predict_and_investigate_logits(train_sample, tokenizer, model)
#     results = [np.argmax(pred_all) for pred_all in preds_all]
#     if len(set(results)) > 1:
#         samples_weird.append(train_sample)
#     # end
# # end

In [74]:
samples_weird

['timestamp failed at play check quiesce snapshot timestamp task check specified file status until it exists in windows guest fatal localhost ip address failed',
 'timestamp failed at play para virtual v hba device ops timestamp task run io zone test on new added disk fatal localhost ip address failed non zero return code',
 'timestamp failed at play check os full name timestamp task verify guest full name in guest info is expected fatal localhost failed guest full name in guest info microsoft windows server number number b it is not the same as expected one',
 'timestamp failed at play deploy vm bios nvme timestamp task set vm boot options fatal localhost failed efi secure boot can not be enabled when boot firmware bios vm s boot firmware currently set to bios timestamp task exit testing when exit testing when fail is set true fatal localhost failed failed to run test case deploy vm bios nvme e number e',
 'timestamp failed at play environment setup timestamp task enable guest ip hack