In [1]:
# from: 20211221+/distilbert_load_eval.ipynb

# you get all pretrained model name here
# https://huggingface.co/transformers/pretrained_models.html
import os
import pandas as pd
import torch
import transformers
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
# from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [2]:
from absl import app
from absl import flags
from absl import logging
from lit_nlp import dev_server
from lit_nlp import server_flags
from lit_nlp.api import model as lit_model
from lit_nlp.api import types as lit_types
# Use the regular GLUE data loaders, because these are very simple already.
from lit_nlp.lib import utils

In [3]:
# the model we gonna train, base uncased BERT
# check text classification models here: https://huggingface.co/models?filter=text-classification
# model_name = "bert-base-uncased"
model_name = "distilbert-base-uncased"
model_dir = 'models-cpqe-label'
# max sequence length for each document/sentence sample
max_length = 512

In [4]:
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [5]:
def read_passages(path_data, test_size=0.1):
    df = pd.read_csv(path_data)
    documents = df['processed'].to_list()
    labels_str = df['label'].to_list()
    labels_list = sorted(list(set(labels_str)))
    labels_all = {l:idx for idx, l in enumerate(labels_list)}
    labels = [labels_all[label_str] for label_str in labels_str]
    return train_test_split(documents, labels, test_size=test_size), labels_list
# end

In [6]:
# call the function
path_folder_data = 'data'
name_data = 'log_content_4.csv'
path_data = os.path.join(path_folder_data, name_data)
(train_texts, valid_texts, train_labels, valid_labels), target_names = read_passages(path_data)

In [7]:
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))
if len(os.listdir(model_dir)) > 1:
    info_model = model_dir
    print('redirect model to use local files {}'.format(info_model))
else:
    info_model = model_name
    print('redirect model to use official {}'.format(info_model))
model = DistilBertForSequenceClassification.from_pretrained(info_model, num_labels=len(target_names))
model = model.cuda()

redirect model to use local files models-cpqe-label


In [9]:
text_valid = valid_texts[0]
input_valid_tokenized = tokenizer(text_valid, padding=True, truncation=True, max_length=max_length, return_tensors='pt').to('cuda')
with torch.no_grad():
    out = model(**input_valid_tokenized, output_hidden_states=True, output_attentions=True)
# end

probas_main = torch.nn.functional.softmax(out.logits, dim=-1)
answer_main = int(probas_main.argmax().cpu())

In [10]:
import functools
from typing import Any, Callable, Iterable, Optional, Sequence
from lit_nlp.components.citrus import helpers
import numpy as np
from sklearn import linear_model
from sklearn import metrics

DEFAULT_KERNEL_WIDTH = 25
DEFAULT_MASK_TOKEN = '[MASK]'
DEFAULT_NUM_SAMPLES = 256
DEFAULT_SOLVER = 'cholesky'

In [11]:
def sample_masks(num_samples: int,
                 num_features: int,
                 seed: Optional[int] = None):

  rng = np.random.RandomState(seed)
  positions = np.tile(np.arange(num_features), (num_samples, 1))
  permutation_fn = np.vectorize(rng.permutation, signature='(n)->(n)')
  permutations = permutation_fn(positions)  # A shuffled range of positions.
  num_disabled_features = rng.randint(1, num_features + 1, (num_samples, 1))
  return permutations >= num_disabled_features

In [12]:
def get_perturbations(tokens: Sequence[str],
                      masks: np.ndarray,
                      mask_token: str = '<unk>') -> Iterable[str]:
  for mask in masks:
    parts = [t if mask[i] else mask_token for i, t in enumerate(tokens)]
    yield ' '.join(parts)

In [13]:
def exponential_kernel(
    distance: float, kernel_width: float = DEFAULT_KERNEL_WIDTH) -> np.ndarray:
  return np.sqrt(np.exp(-(distance**2) / kernel_width**2))

In [38]:
tokenzier_lime = str.split
mask_token = DEFAULT_MASK_TOKEN
num_samples = DEFAULT_NUM_SAMPLES
solver = DEFAULT_SOLVER
seed = 233
alpha = 1.0
distance_fn = functools.partial(metrics.pairwise.pairwise_distances, metric='cosine')
distance_scale = 100.
# intialize stage ends

sentence = text_valid

tokens = tokenzier_lime(sentence)
masks = sample_masks(num_samples + 1, len(tokens), seed=seed)
assert masks.shape[0] == num_samples + 1, 'Expected num_samples + 1 masks.'
all_true_mask = np.ones_like(masks[0], dtype=np.bool)
masks[0] = all_true_mask

perturbations = list(get_perturbations(tokens, masks, mask_token))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  all_true_mask = np.ones_like(masks[0], dtype=np.bool)


In [39]:
distances = distance_fn(all_true_mask.reshape(1, -1), masks).flatten()
distances = distance_scale * distances
distances = kernel(distances)

NameError: name 'kernel' is not defined

In [40]:
outputs = []

for sample_text in perturbations:
    input_valid_tokenized = tokenizer(sample_text, padding=True, truncation=True, max_length=max_length, return_tensors='pt').to('cuda')
    with torch.no_grad():
        out = model(**input_valid_tokenized, output_hidden_states=True, output_attentions=True)
    # end

    probas_main = torch.nn.functional.softmax(out.logits, dim=-1)
    answer_main = int(probas_main.argmax().cpu())
    outputs.append(answer_main)
# end

print(outputs)

[5, 5, 5, 5, 5, 0, 5, 0, 5, 5, 5, 0, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 0]


In [41]:
for i in range(1, len(outputs)):
    if outputs[i] != outputs[0]:
        outputs[i] = -1
    else:
        outputs[i] = 1
    # end
# end

outputs[0] = 1

In [42]:
model = linear_model.Ridge(
  alpha=alpha, solver=solver, random_state=seed).fit(
      masks, outputs, sample_weight=distances)

In [46]:
for word, contribution in zip(tokenzier_lime(perturbations[0]), model.coef_):
    print('{}: {}'.format(word, contribution))

timestamp: -0.023183863459707625
failed: -0.2746099995065513
at: -0.219695197312005
play: 0.054984703794530854
environment: -0.0834374812746224
setup: 0.05597365447866541
timestamp: 0.19986559562192402
task: 0.0777442306292182
check: -0.21456759375613815
vm: -0.041482927614853615
test: 0.03773525268181391
vm: -0.07966582208240552
does: -0.12941320236018386
not: 0.08789889652641769
exist: -0.0076308324193059655
task: 0.18039569330510727
path: 0.01571196899571004
home: -0.14071762399024354
worker: 0.06741362021599813
workspace: -0.018504082777068182
ansible: -0.012719973737949367
regression: -0.057050216851529084
rocky: -0.09209717736368647
linux: 0.06886728244756356
8: 0.03589163450459772
x: -0.07732930080079342
ansible: -0.047796011986324785
vsphere: 0.05030807705640052
gos: 0.16725167888378406
validation: 0.09962579683502076
environment: -0.07236017778519066
setup: -0.1690803137336273
environment: -0.15728506139767678
setup: -0.19634559645407362
yml: 0.005076664894012131
25: -0.059433