In [1]:
import json
import os
import sys
from datetime import datetime
import pandas as pd
import torch
from torch import BoolTensor
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, EarlyStoppingCallback
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Checkpoint name passed to both the tokenizer and the classifier.
MODEL_NAME = 'distilbert-base-uncased'
# Upper bound on tokens per sample handed to the tokenizer (truncation=True).
MAX_LENGTH = 512

FILENAME_TEST = 'test.csv'
# Passed to TrainingArguments(output_dir=...).
DIR_OUTPUT = 'results'

# Device the inference helpers move their inputs to. Fall back to the CPU when
# no GPU is visible so the notebook does not crash on CPU-only machines.
DEVICE_DEFAULT = 'cuda' if torch.cuda.is_available() else 'cpu'

def get_ts():
    """Return the current UTC time as an ISO-8601 string with second precision.

    The string carries no UTC offset suffix, e.g. ``2023-07-26T05:53:55``.
    """
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp instead and drop tzinfo so the printed format stays unchanged.
    from datetime import timezone

    now_utc = datetime.now(timezone.utc)
    return now_utc.replace(microsecond=0, tzinfo=None).isoformat()
# end


class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to something indexable per
    sample (nested lists or tensors); ``labels`` is an indexable sequence of
    label ids with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor(<tensor>) emits a UserWarning on every call; the
        # recommended copy for an existing tensor is detach().clone().
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        # end
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, with the label under ``"labels"``."""
        item = {key: self._as_tensor(values[idx]) for key, values in self.encodings.items()}
        # The label is wrapped in a list, so it becomes a 1-element tensor.
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)
    # end
# end

def read_passages(path_data, path_label, test_size=0):
    """Load samples and integer label ids from a CSV plus a JSON label list.

    The CSV at ``path_data`` must have a ``processed`` column (the text) and a
    ``target`` column (the label name). The JSON at ``path_label`` holds the
    label names; they are sorted and a label's id is its position in that
    sorted list.

    Returns:
        ``((train_x, valid_x, train_y, valid_y), labels_list)``. With
        ``test_size > 0`` the four parts come from a stratified
        ``train_test_split`` (fixed ``random_state=234``); otherwise the full
        sample list and the full label list are each returned twice.
    """
    frame = pd.read_csv(path_data)
    texts = frame['processed'].to_list()
    names_per_row = frame['target'].to_list()

    with open(path_label, 'r') as handle:
        labels_list = sorted(json.load(handle))
    # end

    id_by_name = dict(zip(labels_list, range(len(labels_list))))
    label_ids = [id_by_name[name] for name in names_per_row]

    if not test_size > 0:
        return (texts, texts, label_ids, label_ids), labels_list
    # end

    parts = train_test_split(texts, label_ids, test_size=test_size, stratify=label_ids, random_state=234)
    return parts, labels_list
# end


def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for a prediction object.

    ``pred`` must expose ``label_ids`` and ``predictions``; the predicted class
    is the argmax over the last axis of ``predictions``. Both arrays are
    flattened before scoring.
    """
    y_true = pred.label_ids.reshape(-1)
    y_pred = pred.predictions.argmax(-1).reshape(-1)

    # Shared keyword arguments of the three macro-averaged scores.
    macro_kwargs = dict(y_true=y_true, y_pred=y_pred, zero_division=1, average='macro')

    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    precision = precision_score(**macro_kwargs)
    recall = recall_score(**macro_kwargs)
    f1 = f1_score(**macro_kwargs)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
# end

def predict_plus(input_tokenized, model):
    """Run ``model`` on a tokenized sample and return its logits and per-token attention.

    Args:
        input_tokenized: tokenizer output exposing ``attention_mask`` and
            ``to(device)``. Only the first sequence of the batch contributes to
            the attention scores.
        model: callable accepting the tokenized fields plus
            ``output_attentions=True`` and returning an object with ``logits``
            and ``attentions``.

    Returns:
        ``(logits, attentions_sum)``. ``logits`` is moved to the CPU.
        ``attentions_sum`` is a list of floats: the last attention layer's
        weights from query position 0 towards every unmasked position, averaged
        over axis 1, with the first and last unmasked positions dropped
        (assumes attentions are laid out as (batch, heads, query, key) and the
        dropped positions are the special tokens — TODO confirm).
    """
    # The mask filters the attention tensor *after* that tensor has been moved
    # to the CPU, so the mask must live on the CPU too (masked_select needs both
    # operands on one device).
    mask_real = (input_tokenized.attention_mask == 1).cpu()

    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        out = model(**input_tokenized.to(DEVICE_DEFAULT), output_attentions=True)
    # end

    logits = out.logits.cpu()
    attentions = out.attentions[-1].cpu()

    # Sum over axis 1 divided by its size == mean over axis 1; [0] takes the first sequence.
    attention_from_first = (torch.sum(attentions[:, :, 0, :], 1) / attentions.shape[1])[0]
    attentions_sum = torch.masked_select(attention_from_first, mask_real).tolist()[1:-1]

    return logits, attentions_sum
# end

def main_train_and_evaluate(name_train, path_train, path_label, path_test, path_output):
    """Fine-tune the sequence classifier on ``path_train`` and return the trained model.

    The data is loaded with ``read_passages`` and split 90/10 into train and
    validation parts. Training evaluates and checkpoints once per epoch, stops
    early after 6 evaluations without improvement, and reloads the best
    checkpoint (selected by ``f1``) at the end.

    ``name_train`` and ``path_output`` are accepted but not used by the body;
    ``path_test`` only appears in the start-up log line.
    """
    print('[{}] start main_train_and_evaluate with {} {}'.format(get_ts(), path_train, path_test))

    splits, target_names = read_passages(path_train, path_label, 0.1)
    train_samples, valid_samples, train_labels, valid_labels = splits

    tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME, do_lower_case=True)

    def encode(samples):
        # Pad to the longest sample of the batch and truncate at MAX_LENGTH tokens.
        return tokenizer.batch_encode_plus(samples, truncation=True, padding=True, max_length=MAX_LENGTH,
                                           return_tensors='pt')

    train_dataset = SimpleDataset(encode(train_samples), train_labels)
    valid_dataset = SimpleDataset(encode(valid_samples), valid_labels)

    model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(target_names))

    training_args = TrainingArguments(
        # where checkpoints and logs go
        output_dir=DIR_OUTPUT,
        logging_dir='./logs',
        logging_steps=1,
        # optimisation schedule
        num_train_epochs=12,
        learning_rate=2e-5,
        warmup_steps=0,
        weight_decay=0.01,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        # evaluate and checkpoint once per epoch, keeping at most 6 checkpoints
        evaluation_strategy="epoch",
        save_strategy='epoch',
        save_total_limit=6,
        # restore the checkpoint with the best f1 once training ends
        load_best_model_at_end=True,
        metric_for_best_model='f1',
    )

    early_stopping = EarlyStoppingCallback(early_stopping_patience=6)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

    print('[{}] start training...'.format(get_ts()))
    trainer.train()

    # Final evaluation pass on the validation set; its result is not used further.
    trainer.evaluate()
    print('[{}] finish training.'.format(get_ts()))

    return model
# end


In [2]:
# Input/output locations used by this and the later cells.
path_folder_train = 'data/training'
path_test = 'data/test_from_train.csv'
path_label = 'data/labels.json'
path_output = 'data/output_noseed_train'

import numpy as np
import random
import torch

# Seed every random number generator in play so runs are repeatable.
seed_val = 234
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

import os
import subprocess

# os.listdir() returns entries in arbitrary order; sort them so that "the first
# training file" is the same file on every run and every machine. Hidden files
# (leading dot) are ignored.
filenames = sorted(name for name in os.listdir(path_folder_train) if not name.startswith('.'))
if not filenames:
    raise FileNotFoundError('no training file found in {}'.format(path_folder_train))
# end
filename = filenames[0]
path_train = os.path.join(path_folder_train, filename)
# Everything before the first dot of the file name.
name_train = filename.split('.')[0]

model = main_train_and_evaluate(name_train, path_train, path_label, path_test, path_output)

[2023-07-26T05:53:55] start main_train_and_evaluate with data/training/202205240000.csv data/test_from_train.csv


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

KeyboardInterrupt: 

In [None]:
# model_name = MODEL_NAME
# max_length = MAX_LENGTH

# (samples_test, _, indexs_label_test, _), target_names = read_passages(path_test, path_label, 0)
# labels_test = [target_names[index_label_test] for index_label_test in indexs_label_test]
# list_corpus_test = list(zip(samples_test, labels_test))

In [None]:
# list_corpus_test_with_index = [[index, corpus] for index, corpus in enumerate(list_corpus_test)]

In [None]:
import copy
def insert_token_to_sentence(token, sentence):
    """Return ``sentence`` plus every variant with ``token`` inserted at one word boundary.

    The first element is ``sentence`` unchanged. It is followed by one string
    per insertion position (before the first word, between each pair of words,
    after the last word). Variants are re-joined with single spaces.
    """
    words = sentence.split()
    variants = [
        ' '.join(words[:position] + [token] + words[position:])
        for position in range(len(words) + 1)
    ]
    return [sentence] + variants
# end

In [None]:
# Set up a single-sentence probe: the names assigned here (model_name,
# max_length, sample_test, tokenizer, samples_test) are read by the cells below.
model_name = MODEL_NAME
max_length = MAX_LENGTH
# Sentence whose prediction and attention are inspected.
sample_test = 'timestamp failed at play mouse driver vm tools timestamp task wait for getting vm test vm ip address on esxi ip address fatal localhost un reachable failed to create temporary directory in some cases you may have been able to authenticate and did not have permissions on the target directory consider changing the remote tmp path in ansible configuration to a path rooted in tmp for more error information use v v v failed command was u mask number mkdir p echo vmfs volumes data store number mkdir echo vmfs volumes data store number ansible tmp hex id number timestamp echo ansible tmp hex id number timestamp echo vmfs volumes data store number ansible tmp hex id number timestamp exited with result number'

# Alternative probe sentences kept for reference:
# sample_test = 'permissions on the target directory consider changing the remote tmp path in ansible configuration to a path rooted in tmp for more error information use v v v failed command was u mask'
# sample_test = 'permissions on the target directory consider changing the remote tmp path in ansible configuration to a rooted in tmp for more error information use v v v failed command was u mask' # nopath, all testcase
# Word that insert_token_to_sentence() places at every word boundary.
token_target = 'number'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)


# sample_test = 'permissions on the target directory consider changing the remote tmp path in ansible configuration to a path rooted in tmp for more error information use v v v failed command was u mask'
# sample_test = 'permissions target directory consider changing remote tmp path  ansible configuration a path rooted in tmp more information use v failed command was mask' # no second path
# sample_test = 'permissions target directory consider changing remote ansible configuration a path rooted in tmp more information use v failed mask' # no second path -> infra
# sample_test = 'permissions on the target directory consider changing the remote tmp path in ansible configuration to a path rooted in tmp for more error information use v v v failed command was u mask'
# sample_test = 'v v v failed command was u'


# Element 0 is the unmodified sentence; each following element has
# token_target inserted at one more word position.
samples_test = insert_token_to_sentence(token_target, sample_test)
# Keep only the unmodified sentence, so the insertion variants are not evaluated below.
samples_test = [samples_test[0]]

# For every probe sentence collect (label, confidence, sentence, [(position, token, attention), ...]).
# NOTE: target_names must already be defined when this cell runs; in this
# notebook it is assigned by the read_passages(...) call in a later cell.
list_out_pred = []
for sample_test in samples_test:
    input_tokenized = tokenizer.encode_plus(sample_test, padding=True, truncation=True, max_length=max_length,
                                        return_tensors='pt')

    masks_sample = input_tokenized.attention_mask
    # The mask filters `attentions` after that tensor has been moved to the CPU,
    # so it is kept on the CPU as well (masked_select needs both on one device).
    indicates_sample = (masks_sample == 1).cpu()

    with torch.no_grad():
        out = model(**input_tokenized.to(DEVICE_DEFAULT), output_attentions=True)
    # end

    logits = out.logits.cpu()
    # First attention layer is inspected here; the last layer is the commented alternative.
    # attentions = out.attentions[-1].cpu()
    attentions = out.attentions[0].cpu()
    probas_evaluate = torch.nn.functional.softmax(logits, dim=-1)[0]

    answer_evaluate = int(probas_evaluate.argmax())
    label_evaluate = target_names[answer_evaluate]
    probas_max = probas_evaluate[answer_evaluate]

    # One decoded string per token id, without the first and last token.
    tokens = [tokenizer.decode(token_id) for token_id in input_tokenized[0].ids][1:-1]

    # Attention from query position 0, averaged over axis 1, restricted to
    # unmasked positions, with the first and last position dropped to line up with `tokens`.
    attentions_sum = torch.masked_select((torch.sum(attentions[:, :, 0, :], 1) / attentions.shape[1])[0], indicates_sample).tolist()[1:-1]
    # attentions_sum = torch.masked_select(attentions[0, 0, 0, :], indicates_sample).tolist()[1:-1] # 1batch(0), 12 heads(n), 1cls(0)

    pairs_token_attention = list(zip(list(range(1, len(tokens)+1)),tokens, attentions_sum))

    list_out_pred.append((label_evaluate, float(probas_max), sample_test, pairs_token_attention))
# end

list_out_pred

In [None]:
# cosine old version, which is not correct
# Weights the per-token attention by a cosine similarity computed between
# columns of the attention tensor, then L2-normalises the product.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
# sample_test = 'permissions on the target directory consider changing the remote tmp path in ansible configuration to a path rooted in tmp for more error information use v v v failed command was u mask'
# sample_test = 'permissions target directory consider changing remote ansible configuration a path rooted in tmp more information use v failed mask' # no second path

input_tokenized = tokenizer.encode_plus(sample_test, padding=True, truncation=True, max_length=max_length,
                                    return_tensors='pt')

masks_sample = input_tokenized.attention_mask
# The mask filters `attentions` after that tensor has been moved to the CPU,
# so it is kept on the CPU as well (masked_select needs both on one device).
indicates_sample = (masks_sample == 1).cpu()

with torch.no_grad():
    out = model(**input_tokenized.to(DEVICE_DEFAULT), output_attentions=True)
# end

logits = out.logits.cpu()
# attentions = out.attentions[-1].cpu()
attentions = out.attentions[0].cpu()
probas_evaluate = torch.nn.functional.softmax(logits, dim=-1)[0]

answer_evaluate = int(probas_evaluate.argmax())
label_evaluate = target_names[answer_evaluate]
probas_max = probas_evaluate[answer_evaluate]

# calculate attention
tokens = [tokenizer.decode(token_id) for token_id in input_tokenized[0].ids][1:-1]
attentions_sum = torch.masked_select((torch.sum(attentions[:, :, 0, :], 1) / attentions.shape[1])[0], indicates_sample).tolist()[1:-1]


cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

# The reference vector does not depend on the loop index, so build it once.
embedding_cls_a = torch.sum(attentions[0, :, :, 0], 1) / attentions.shape[1]

list_cos = []
for i in range(attentions.shape[-1]):
    embedding_cls_b = torch.sum(attentions[0, :, :, i], 1) / attentions.shape[1]
    sim_cos = float(cos(embedding_cls_a, embedding_cls_b))
    list_cos.append(sim_cos)
# end

# Drop the first and last position so the list lines up with attentions_sum.
list_cos = list_cos[1:-1]

list_factor = np.array(attentions_sum) * np.array(list_cos)
list_factor_normed = list_factor / np.linalg.norm(list_factor)

In [None]:
# cosine new version
# Runs the model once with attentions AND hidden states; the following cells
# read `out` and `attentions_sum` from here.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

sample_test = 'timestamp failed at play mouse driver vm tools timestamp task wait for getting vm test vm ip address on esxi ip address fatal localhost un reachable failed to create temporary directory in some cases you may have been able to authenticate and did not have permissions on the target directory consider changing the remote tmp path in ansible configuration to a path rooted in tmp for more error information use v v v failed command was u mask number mkdir p echo vmfs volumes data store number mkdir echo vmfs volumes data store number ansible tmp hex id number timestamp echo ansible tmp hex id number timestamp echo vmfs volumes data store number ansible tmp hex id number timestamp exited with result number'
# sample_test = 'permissions on the target directory consider changing the remote tmp path in ansible configuration to a path rooted in tmp for more error information use v v v failed command was u mask'
# sample_test = 'v v v failed command was u'


input_tokenized = tokenizer.encode_plus(sample_test, padding=True, truncation=True, max_length=max_length,
                                    return_tensors='pt')

masks_sample = input_tokenized.attention_mask
# The mask filters `attentions` after that tensor has been moved to the CPU,
# so it is kept on the CPU as well (masked_select needs both on one device).
indicates_sample = (masks_sample == 1).cpu()

with torch.no_grad():
    out = model(**input_tokenized.to(DEVICE_DEFAULT), output_attentions=True, output_hidden_states=True)
# end

logits = out.logits.cpu()
attentions = out.attentions[-1].cpu()
# attentions = out.attentions[0].cpu()
probas_evaluate = torch.nn.functional.softmax(logits, dim=-1)[0]

answer_evaluate = int(probas_evaluate.argmax())
label_evaluate = target_names[answer_evaluate]
probas_max = probas_evaluate[answer_evaluate]

# calculate attention
tokens = [tokenizer.decode(token_id) for token_id in input_tokenized[0].ids][1:-1]
# Unlike the cells above, the first/last position is NOT dropped here and the
# result stays a tensor: it is multiplied with the hidden states in a later cell.
attentions_sum = torch.masked_select((torch.sum(attentions[:, :, 0, :], 1) / attentions.shape[1])[0], indicates_sample)
attentions_sum_torch = torch.Tensor(attentions_sum)
    

In [None]:
# Cosine similarity taken along dim 0, i.e. between two 1-D vectors.
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

hidden_states = out.hidden_states
# Last entry of hidden_states with axis 0 squeezed away (only if that axis has size 1).
# NOTE(review): the original note gave the shape as [24, 762]; presumably this is
# (num_tokens, hidden_size) — confirm the real values against the model config.
h2 = hidden_states[-1].squeeze(0)
h2 = h2.to('cpu')

In [None]:
# Reference vector: the rows of h2 combined with attentions_sum as weights.
# It does not depend on the loop index, so it is computed once.
# embedding_cls_a = h2[0,:]
embedding_cls_a = torch.matmul(attentions_sum, h2)

# Cosine similarity between the reference vector and every row of h2.
list_cos = []
for i in range(h2.shape[0]):
    embedding_cls_b = h2[i,:]
    sim_cos = float(cos(embedding_cls_a, embedding_cls_b))
    list_cos.append(sim_cos)
# end

# (list_cos[1:-1] / np.linalg.norm(list_cos[1:-1])).tolist()
# Show the similarities without the first and last position.
list_cos[1:-1]

In [None]:
import numpy as np
import scipy
import sklearn
# `import sklearn` alone does not guarantee the submodule is loaded.
import sklearn.preprocessing
# scipy.special.softmax(list_cos[1:-1])
min_max_scaler = sklearn.preprocessing.MinMaxScaler()
# MinMaxScaler works on 2-D (n_samples, n_features) input: present the
# similarities as one column, then flatten the scaled column for display.
min_max_scaler.fit_transform(np.asarray(list_cos[1:-1]).reshape(-1, 1)).ravel()

In [None]:
from torch import BoolTensor
import csv

# Load the test set and (re)create the tokenizer for the n-gram cells below.
model_name = MODEL_NAME
max_length = MAX_LENGTH


# With test_size=0 read_passages returns the full sample list and the full
# label-id list twice each; the duplicates are discarded here.
(samples_test, _, indexs_label_test, _), target_names = read_passages(path_test, path_label, 0)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [None]:
def split_by_n(sentence, n):
    """Return every run of ``n`` consecutive words of ``sentence``, in order.

    Words are whitespace-separated; each window is re-joined with single
    spaces. A sentence with fewer than ``n`` words yields an empty list.
    """
    words = sentence.split()
    windows = []
    for start in range(len(words) - n + 1):
        windows.append(' '.join(words[start:start + n]))
    # end
    return windows
# end

def ngram_investigation(sentence, tokenizer, model, target_names, n_gram_end=1, n_gram_start=0):
    """Classify every word n-gram of ``sentence`` and collect per-token attention.

    For each ``n`` in ``range(n_gram_start, n_gram_end)`` the sentence is cut
    into sliding windows of ``n + 1`` words (see ``split_by_n``); the windows
    of one size are tokenized and run through ``model`` as a single batch.

    Args:
        sentence: whitespace-separated text.
        tokenizer: object providing ``batch_encode_plus`` and ``batch_decode``.
        model: classifier called with the encoded batch and ``output_attentions=True``.
        target_names: sequence mapping a predicted class index to its label.
        n_gram_end: exclusive upper bound of ``n``.
        n_gram_start: inclusive lower bound of ``n``.

    Returns:
        One list per n-gram size, in order. Each list holds one dict per window:
        ``{'input': [(token, attention), ...], 'result': (label, confidence)}``.
        The attention is the last layer's weight from query position 0,
        averaged over axis 1; the first and last unmasked token of every
        window are dropped. A size with no windows yields an empty list.
    """
    list_out = []
    for n in range(n_gram_start, n_gram_end):
        list_corpus = split_by_n(sentence, n + 1)
        if not list_corpus:
            # No window of this size fits into the sentence. Skip the
            # tokenizer/model call (an empty batch is presumably not a valid
            # input for them) but keep the slot so positions in the result
            # still line up with the n-gram sizes.
            list_out.append([])
            continue
        # end

        batch_encoded = tokenizer.batch_encode_plus(list_corpus, padding=True, truncation=True, max_length=512, return_tensors='pt')

        # Take CPU copies of mask and ids before the batch is moved to the model device.
        mask_real = (batch_encoded.attention_mask == 1).to('cpu')
        batch_ids_input = batch_encoded.input_ids.to('cpu')

        with torch.no_grad():
            out = model(**batch_encoded.to(DEVICE_DEFAULT), output_attentions=True)
        # end

        # Sum over axis 1 divided by its size == mean over axis 1, one row per window.
        attention_last = out.attentions[-1]
        scores_attention_all = (torch.sum(attention_last[:, :, 0, :], 1) / attention_last.shape[1]).to('cpu')

        probas_evaluate = torch.nn.functional.softmax(out.logits, dim=-1).to('cpu')
        answers_evaluate = probas_evaluate.argmax(axis=1)
        confidences_evaluate = probas_evaluate.gather(1, answers_evaluate.reshape(-1, 1)).reshape(-1).tolist()

        output_final = []
        rows = zip(batch_ids_input, scores_attention_all, mask_real, answers_evaluate.tolist(), confidences_evaluate)
        for row_ids, row_scores, row_mask, answer, confidence in rows:
            # Select the unmasked positions of this window directly instead of
            # flattening the whole batch and re-slicing it by token counts.
            ids_kept = row_ids[row_mask].tolist()[1:-1]
            scores_kept = row_scores[row_mask].tolist()[1:-1]
            # batch_decode on a flat list of ids returns one string per id.
            tokens_kept = tokenizer.batch_decode(ids_kept)
            output_final.append({'input': list(zip(tokens_kept, scores_kept)),
                                 'result': (target_names[answer], confidence)})
        # end

        list_out.append(output_final)
    # end

    return list_out
# end


def transform_ngram_report(content_origin, n_gram_start):
    """Flatten nested n-gram results into one row dict per token.

    ``content_origin`` is a list (one entry per n-gram size) of lists of
    ``{'input': [(token, attention), ...], 'result': (label, confidence)}``.
    Every token becomes a row with the keys ``ngram``, ``partition``,
    ``token``, ``attention``, ``predict`` and ``confidence``. Only the first
    token row of a window carries ``partition``, ``predict`` and
    ``confidence``; on the following rows of that window they are ``''``.
    ``ngram`` is the entry's position plus ``n_gram_start + 1``.
    """
    rows = []
    for offset, windows in enumerate(content_origin):
        ngram_size = offset + n_gram_start + 1

        for partition, window in enumerate(windows):
            label = window['result'][0]
            confidence = window['result'][1]

            for position, pair in enumerate(window['input']):
                is_first = position == 0
                rows.append({
                    'ngram': ngram_size,
                    'partition': partition if is_first else '',
                    'token': pair[0],
                    'attention': pair[1],
                    'predict': label if is_first else '',
                    'confidence': confidence if is_first else '',
                })
            # end
        # end
    # end
    return rows
# end

def write_csv(data, filename):
    """Write a list of row dicts to ``filename`` as CSV with a header row.

    The column order is the key order of the first row. An empty ``data``
    produces an empty file instead of failing on ``data[0]``.
    """
    # newline='' leaves line endings to the csv module; without it, platforms
    # that translate '\n' end up with an extra blank line after every row.
    with open(filename, 'w', newline='') as outf:
        if not data:
            return
        # end

        writer = csv.DictWriter(outf, data[0].keys())
        writer.writeheader()
        writer.writerows(data)
    # end
# end


In [None]:
# Sentence to analyse window by window.
sentence = 'timestamp failed at play mouse driver vm tools timestamp task wait for getting vm test vm ip address on esxi ip address fatal localhost un reachable failed to create temporary directory in some cases you may have been able to authenticate and did not have permissions on the target directory consider changing the remote tmp path in ansible configuration to a path rooted in tmp for more error information use v v v failed command was u mask number mkdir p echo vmfs volumes data store number mkdir echo vmfs volumes data store number ansible tmp hex id number timestamp echo ansible tmp hex id number timestamp echo vmfs volumes data store number ansible tmp hex id number timestamp exited with result number'
# sentence = 'timestamp failed at play mouse driver vm tools timestamp'
# n_gram_end=35, n_gram_start=28: the function windows the sentence with sizes
# n + 1 for n in range(28, 35), i.e. 29 to 35 words.
content_ngram = ngram_investigation(sentence, tokenizer, model, target_names, 35, 28)

In [None]:
# Flatten to one row per token. The second argument is used to compute each
# row's 'ngram' value, so it has to equal the n_gram_start (28) passed to
# ngram_investigation above.
content_output = transform_ngram_report(content_ngram, 28)

In [None]:
# Rows predicted as 'infra'. transform_ngram_report fills 'predict' only on the
# first token row of each window, so this shows one row per matching window.
[row for row in content_output if row['predict'] == 'infra']
# write_csv(content_output, 'hello.csv')