In [1]:
import torch
from transformers import DistilBertTokenizerFast, EarlyStoppingCallback, DistilBertForMaskedLM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime


# Default device name. Not referenced anywhere else in the visible code.
# NOTE(review): presumably meant for model/tensor placement -- confirm or remove.
DEVICE_DEFAULT = 'cuda'


def get_ts():
    """Return the current UTC time as an ISO-8601 string with second precision.

    The string carries no UTC offset suffix, e.g. ``2023-08-17T11:46:13``.
    """
    # Local import: the file only imports ``datetime`` from the module.
    from datetime import timezone

    # ``datetime.utcnow()`` is deprecated since Python 3.12. Take an aware UTC
    # timestamp instead and drop tzinfo so the rendered text stays unchanged.
    now_utc = datetime.now(timezone.utc)
    return now_utc.replace(microsecond=0, tzinfo=None).isoformat()
# end

class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name to an indexable collection with one
            row per example (tensors or nested lists both work).
        labels: Indexable collection with one label entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, plus a ``labels`` entry."""
        item = {}
        for key, values in self.encodings.items():
            row = values[idx]
            if isinstance(row, torch.Tensor):
                # ``torch.tensor(tensor)`` emits a UserWarning; clone().detach()
                # is the recommended equivalent and still returns a copy.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        # The label entry is wrapped in a one-element list, so the resulting
        # tensor has a leading dimension of size 1.
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        """Return the number of examples (one per label entry)."""
        return len(self.labels)
    # end
# end


def compute_metrics(pred):
    """Compute token-level accuracy and macro precision/recall/F1.

    Args:
        pred: Object exposing ``label_ids`` (reference ids) and ``predictions``
            (scores whose last axis is arg-maxed to obtain predicted ids).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        All four values are NaN when every position carries the ignore label.
    """
    labels = pred.label_ids.reshape(-1)
    preds = pred.predictions.argmax(-1).reshape(-1)

    # -100 is the ignore index used by the HuggingFace loss; such positions are
    # not real targets and must not be scored as if -100 were a class.
    keep = labels != -100
    labels = labels[keep]
    preds = preds[keep]

    if len(labels) == 0:
        # Nothing left to score; report NaN rather than passing empty arrays on.
        nan = float("nan")
        return {"accuracy": nan, "precision": nan, "recall": nan, "f1": nan}

    accuracy = accuracy_score(y_true=labels, y_pred=preds)
    precision = precision_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')
    recall = recall_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')
    f1 = f1_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
# end


# Checkpoint name, maximum encoded sequence length, and Trainer output directory.
model_name = 'distilbert-base-uncased'
max_length = 512
output_dir = 'results'

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

# Two parallel label lists: position i of ``labels_outdomain`` is paired with
# position i of ``labels_indomain``; the latter words are looked up in the
# tokenizer vocabulary.
labels_outdomain = ['testcase', 'targetvm', 'nimbus', 'testbed', 'usererror', 'product', 'infra']
labels_indomain = ['test', 'machine', 'nimbus', 'environment', 'user', 'product', 'infrastructure']
tokenids_indomain = tokenizer.convert_tokens_to_ids(labels_indomain)

# Lookup tables in both directions between out-domain label and token id.
# NOTE(review): if two in-domain words resolve to the same id, the reverse map
# keeps only the last one -- confirm every word is a single vocabulary token.
dict_labelout_tokenid = dict(zip(labels_outdomain, tokenids_indomain))
dict_tokenid_labelout = dict(zip(tokenids_indomain, labels_outdomain))



# A single failure-log text shared by every sample below. Each entry is a
# two-element list: [log text, prompt]; the prompt holds either the [MASK]
# placeholder (model input) or the filled-in word (label text).
# NOTE(review): the training and validation sets are identical, and the label
# word 'target' is not one of ``labels_indomain`` -- confirm both are intended.
_failure_log = (
    'timestamp failed at play deploy vm efi para virtual vmxnet number '
    'timestamp task wait for message auto install is completed appear in vm log serial '
    'timestamp log task path home worker workspace ansible cycle photon number x update '
    'ansible vsphere gos validation common vm wait log message yml number '
    'fatal localhost failed attempts number censored '
    'the output has been hidden due to the fact that no log true was specified for this result '
    'changed false '
    'timestamp task testing exit due to failure '
    'task path home worker workspace ansible cycle photon number x update '
    'ansible vsphere gos validation common test rescue yml number '
    'fatal localhost failed exit testing when exit testing when fail is set to true '
    'in test case deploy vm efi para virtual vmxnet number'
)

train_samples = [[_failure_log, 'is [MASK] problem']]
valid_samples = [[_failure_log, 'is [MASK] problem']]

train_labels = [[_failure_log, 'is target problem']]
valid_labels = [[_failure_log, 'is target problem']]


# Tokenizer settings shared by all four encoding calls below.
_encode_kwargs = dict(truncation=True, padding=True, max_length=max_length, return_tensors='pt')

# Model inputs: the prompts that contain the [MASK] placeholder.
train_encodings = tokenizer.batch_encode_plus(train_samples, **_encode_kwargs)
valid_encodings = tokenizer.batch_encode_plus(valid_samples, **_encode_kwargs)

# Targets: the same texts with the placeholder replaced by the label word.
# NOTE(review): inputs and targets are encoded in separate calls, so their
# lengths presumably only line up when the label word is one token -- confirm.
train_labels_e = tokenizer.batch_encode_plus(train_labels, **_encode_kwargs)
valid_labels_e = tokenizer.batch_encode_plus(valid_labels, **_encode_kwargs)

# The label side of each dataset is the full list of target token ids.
train_dataset = SimpleDataset(train_encodings, train_labels_e.input_ids.tolist())
valid_dataset = SimpleDataset(valid_encodings, valid_labels_e.input_ids.tolist())

In [4]:
# Notebook inspection: display the token ids of the encoded training labels.
train_labels_e.input_ids

tensor([[  101,  2335, 15464,  2361,  3478,  2012,  2377, 21296,  1058,  2213,
          1041,  8873, 11498,  7484,  1058, 22984,  7159,  2193,  2335, 15464,
          2361,  4708,  3524,  2005,  4471,  8285, 16500,  2003,  2949,  3711,
          1999,  1058,  2213,  8833,  7642,  2335, 15464,  2361,  8833,  4708,
          4130,  2188,  7309,  2573, 15327,  2019, 19307,  5402, 26383,  2193,
          1060, 10651,  2019, 19307,  5443, 27921,  2063,  2175,  2015, 27354,
          2691,  1058,  2213,  3524,  8833,  4471,  1061, 19968,  2193, 10611,
          2334, 15006,  2102,  3478,  4740,  2193,  8292, 29577,  2098,  1996,
          6434,  2038,  2042,  5023,  2349,  2000,  1996,  2755,  2008,  2053,
          8833,  2995,  2001,  9675,  2005,  2023,  2765,  2904,  6270,  2335,
         15464,  2361,  4708,  5604,  6164,  2349,  2000,  4945,  4708,  4130,
          2188,  7309,  2573, 15327,  2019, 19307,  5402, 26383,  2193,  1060,
         10651,  2019, 19307,  5443, 27921,  2063,  

In [3]:
# Load the pretrained masked-language-model checkpoint named by ``model_name``.
model = DistilBertForMaskedLM.from_pretrained(model_name)

training_args = TrainingArguments(
    # Where results and logs are written.
    output_dir=output_dir,
    logging_dir='./logs',
    # One epoch, one example per batch for both training and evaluation.
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Optimizer settings.
    learning_rate=2e-5,
    warmup_steps=0,
    weight_decay=0.01,
    # Log every step, evaluate once per epoch, and write no checkpoints.
    logging_steps=1,
    evaluation_strategy="epoch",
    save_strategy='no',
    # NOTE(review): checkpoints are disabled and ``load_best_model_at_end`` is
    # not passed, so this setting presumably has no effect -- confirm.
    metric_for_best_model='f1',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,  # called to turn predictions into metrics
)

print(f'[{get_ts()}] start training...')
trainer.train()

***** Running training *****
  Num examples = 1
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 1


[2023-08-17T11:46:13] start training...


  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.48,0.396641,0.976471,0.966292,0.966292,0.932584


***** Running Evaluation *****
  Num examples = 1
  Batch size = 1


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1, training_loss=0.4800097346305847, metrics={'train_runtime': 2.0784, 'train_samples_per_second': 0.481, 'train_steps_per_second': 0.481, 'total_flos': 44014446360.0, 'train_loss': 0.4800097346305847, 'epoch': 1.0})