In [1]:
# you get all pretrained model name here
# https://huggingface.co/transformers/pretrained_models.html
import pandas as pd
import os
import torch
import transformers
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
# from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, EarlyStoppingCallback
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [2]:
# the model we gonna train, base uncased BERT
# check text classification models here: https://huggingface.co/models?filter=text-classification
# model_name = "bert-base-uncased"
model_name = "distilbert-base-uncased"
model_dir = 'model_esxdeploy_filter_distilbert_1'
# max sequence length for each document/sentence sample
max_length = 512

In [3]:
# tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [4]:
def read_passages(path_data,test_size=0.1):
    df = pd.read_csv(path_data)
    
    
    anchers = df['ancher'].to_list()
    documents = df['log'].to_list()
    labels_str = df['label'].to_list()
    
#     print(len(anchers))
#     print(len(documents))
    
#     samples = list(zip(anchers, documents))
    samples = [(ancher, document) for ancher, document in zip(anchers,documents)]
    
    labels_list = sorted(list(set(labels_str)))
    labels_all = {l:idx for idx, l in enumerate(labels_list)}
    labels = [labels_all[label_str] for label_str in labels_str]
#     return samples, labels, labels_list
    return train_test_split(samples, labels, test_size=test_size), labels_list
# end

In [5]:
# call the function
dir_data = 'datasource'
name_data_file = 'esxdeploy_20220512_ancher_sample.csv'
path_data_relative = os.path.join(dir_data, name_data_file)
(train_samples, valid_samples, train_labels, valid_labels), target_names = read_passages(path_data_relative)

In [6]:
train_samples[837]

('timestamp number insufficient memory resources fault the available memory resources in the parent resource pool are insufficient for the operation',
 'timestamp number vm is on host wdc number oc vmware com before powering on')

In [7]:
train_encodings = tokenizer.batch_encode_plus(train_samples, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
valid_encodings = tokenizer.batch_encode_plus(valid_samples, truncation=True, padding=True, max_length=max_length, return_tensors='pt')

In [8]:
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
train_dataset = SimpleDataset(train_encodings, train_labels)
valid_dataset = SimpleDataset(valid_encodings, valid_labels)

In [10]:
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))
if os.path.exists(model_dir) and len(os.listdir(model_dir) > 0):
    print('load model from local')
    model_info = model_dir
else:
    print('load model from official')
    model_info = model_name
    
model = DistilBertForSequenceClassification.from_pretrained(model_info, num_labels=len(target_names))

load model from official


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids.reshape(-1)
    # pred = np.argmax(pred, axis=1)
    preds = pred.predictions.argmax(-1).reshape(-1)

    # print('labels: {}'.format(labels))
    # print('pred: {}'.format(preds))
    accuracy = accuracy_score(y_true=labels, y_pred=preds)
    precision = precision_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')
    recall = recall_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')
    f1 = f1_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [12]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=20,              # total number of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    warmup_steps=0,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=1,               # log & save weights each logging_steps
    evaluation_strategy="epoch",     # evaluate each `logging_steps`
    learning_rate=2e-5,
    save_strategy='epoch',
    save_total_limit=5,
    metric_for_best_model='f1'
)

In [13]:
# trainer = Trainer(
#     model=model,                         # the instantiated Transformers model to be trained
#     args=training_args,                  # training arguments, defined above
#     train_dataset=train_dataset,         # training dataset
#     compute_metrics=compute_metrics,     # the callback that computes metrics of interest
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
# )

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [14]:
trainer.train()

***** Running training *****
  Num examples = 5456
  Num Epochs = 20
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 54560
  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0004,0.24732,0.950577,0.925831,0.783222,0.836535
2,0.0002,0.195571,0.958814,0.907762,0.85333,0.878027
3,0.0002,0.17464,0.963756,0.899763,0.899763,0.899763
4,0.0003,0.230664,0.963756,0.933747,0.856077,0.890036
5,0.0,0.205522,0.968699,0.93269,0.887948,0.908747
6,0.0,0.287013,0.963756,0.926788,0.863358,0.891804
7,0.0,0.222094,0.973641,0.922138,0.934381,0.928144
8,0.0004,0.184602,0.976936,0.941853,0.928932,0.935268
9,0.0004,0.193771,0.978583,0.949611,0.929847,0.939442
10,0.0,0.220818,0.971993,0.92008,0.926184,0.923103


***** Running Evaluation *****
  Num examples = 607
  Batch size = 2
Saving model checkpoint to ./results/checkpoint-2728
Configuration saved in ./results/checkpoint-2728/config.json
Model weights saved in ./results/checkpoint-2728/pytorch_model.bin
  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 607
  Batch size = 2
Saving model checkpoint to ./results/checkpoint-5456
Configuration saved in ./results/checkpoint-5456/config.json
Model weights saved in ./results/checkpoint-5456/pytorch_model.bin
  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 607
  Batch size = 2
Saving model checkpoint to ./results/checkpoint-8184
Configuration saved in ./results/checkpoint-8184/config.json
Model weights saved in ./results/checkpoint-8184/pytorch_model.bin
  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
***** Running Evaluation *****
  Num exa

TrainOutput(global_step=38192, training_loss=0.07756829421391961, metrics={'train_runtime': 2074.4933, 'train_samples_per_second': 52.601, 'train_steps_per_second': 26.3, 'total_flos': 1.0118389778939904e+16, 'train_loss': 0.07756829421391961, 'epoch': 14.0})

In [15]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 607
  Batch size = 2
  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


{'eval_loss': 0.19377095997333527,
 'eval_accuracy': 0.9785831960461285,
 'eval_precision': 0.9496105772250487,
 'eval_recall': 0.9298474749294421,
 'eval_f1': 0.9394420782011434,
 'eval_runtime': 4.1776,
 'eval_samples_per_second': 145.297,
 'eval_steps_per_second': 72.768,
 'epoch': 14.0}

In [16]:
model.save_pretrained(model_dir)

Configuration saved in model_esxdeploy_filter_distilbert_1/config.json
Model weights saved in model_esxdeploy_filter_distilbert_1/pytorch_model.bin


In [17]:
import json
with open('model_priority_distilbert_1/labels.json', 'w+') as file:
    file.write(json.dumps(target_names))

FileNotFoundError: [Errno 2] No such file or directory: 'model_priority_distilbert_1/labels.json'