In [4]:
import os, errno
import json
import yaml
import argparse
import sys
import traceback
import time
import pandas as pd
import torch
import torch.utils.data as data_utils
import transformers
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# default constants

VBERT_MODEL = "vBERT-2020-Base"
CLASSIFICATION_LAYER_WIDTH= 768 if "Base" in VBERT_MODEL else 1024
MAX_STRING_LEN = 128

In [5]:
path_data_base = 'data'
name_testset = 'importance_detection_testset_200_12000.csv'
path_testset = os.path.join(path_data_base, name_testset)

In [6]:
path_model_base = 'models-instaml-finetune'
version_model = 'v1216'
name_file_config = '.model.json'
path_model = os.path.join(path_model_base, version_model)
path_config = os.path.join(path_model, name_file_config)

with open(path_config, 'r') as file:
    config_model = json.load(file)
# end

for k, v in config_model.get('bert').items():
    config_model[k] = v
# end

config_model['root'] = path_model

del config_model['allmetrics']
del config_model['metrics']

In [7]:
device = 'cuda'

In [8]:
class BertClassifier(torch.nn.Module):

    def __init__(self, config):
        super(BertClassifier, self).__init__()
        self.config = config
        self.model_dir = config.get("root", ".")
        self.model_name = config.get("name", VBERT_MODEL)
        self.classification_layer_width = config.get("input_size", CLASSIFICATION_LAYER_WIDTH)
        self.classes = config.get("classes", ["0", "1"])
        self.num_classes = config.get("output_size", len(self.classes))
        self.max_string_length = config.get("max_length", MAX_STRING_LEN)
        self.bert_config_file = self.model_dir + "/bert_config.json"
        self.model_file_path = self.model_dir + "/model.pt"
        self.device = config.get("device", "cpu")
#         self.device = 'cuda'
        self.metrics = [ "accuracy", "precision", "recall", "F-score" ]

        # Load the vBERT vocabulary into the tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(self.model_dir)
    # end

    def load_bert_for_training(self):
        print('Loading vBERT model: ' + self.model_name)
        self.l1 = BertModel.from_pretrained(pretrained_model_name_or_path=self.config.bert_model_dir)
        print("Adding {}x{} classification layer".format(self.classification_layer_width, self.num_classes))
        self.classifier = torch.nn.Linear(self.classification_layer_width, self.num_classes)

    def load_bert_for_inference(self):
        print('Loading vBERT config')
        self.l1 = BertModel(BertConfig.from_pretrained(self.bert_config_file))
        print("Adding {}x{} classification layer".format(self.classification_layer_width, self.num_classes))
        self.classifier = torch.nn.Linear(self.classification_layer_width, self.num_classes)

    # Encode the input with vBERT, read output from the last layer of vBERT, and pass to the classification layer
    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
#         print(output_1.keys())
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0,:]
#         pooler = hidden_state[:, 0]
#         print(pooler.numpy()[0][:5])
        output = self.classifier(pooler)
        return output

#     def do_inference(self, data_loader, test_output=None):
#         model = self.eval()
#         predictions = []
#         real_values = []
#         with torch.no_grad():
#             for data in data_loader:
#                 input_ids = data['ids'].to(self.device)
#                 attention_mask = data['mask'].to(self.device)
#                 targets = data['targets'].to(self.device)
#                 outputs = model(input_ids, attention_mask)
#                 _, preds = torch.max(outputs, dim=1)
#                 predictions.extend(preds)
#                 real_values.extend(targets)
#                 if test_output != None:
#                     test_output.extend(outputs)
#         predictions = torch.stack(predictions).cpu()
#         real_values = torch.stack(real_values).cpu()

#         test_accu = 100 * accuracy_score(real_values, predictions)
#         test_precision, test_recall, test_fscore, ignore = precision_recall_fscore_support(real_values, predictions, average='macro')
#         test_precision *= 100
#         test_recall *= 100
#         test_fscore *= 100
#         metrics = [ test_accu, test_precision, test_recall, test_fscore ]
#         return predictions, real_values, metrics
#     # end
# end

In [9]:
class Prepare(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, classes):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.classes = classes
        
        self.index_class_label = {klass:index for index, klass in enumerate(self.classes)}
    # end

    def __getitem__(self, index):
        title = str(self.data['log'][index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        label = self.index_class_label[self.data['pcc'][index]]

#         target = torch.tensor(label, dtype=torch.long)
        target = label

        return {
            'ids': torch.tensor(ids, dtype=torch.long).to(device),
            'mask': torch.tensor(mask, dtype=torch.long).to(device),
            'targets': label
        }

    def __len__(self):
        return self.len
    # end
# end


In [10]:
from tqdm import tqdm

# def classify_text(model, txt, conf=None):
#     tokenizer = model.tokenizer
#     max_string_length = model.max_string_length
    
#     dataset = pd.DataFrame.from_csv(path_testset)
#     prepared_set = Prepare(dataset, tokenizer, max_string_length)
#     params = {'batch_size': 1,
#                 'shuffle': False,
#                 'num_workers': 0
#             }
#     loader = DataLoader(prepared_set, **params)
#     outputs = []
#     predictions, real_values, metrics = self.do_inference(loader, outputs)
#     classes = model.classes
#     result = { 'classes': [ ], 'text': txt, 'top_class': classes[int(predictions[0])], 'top_class_index': int(predictions[0]) }
#     xi = 0
#     for x in classes:
#         result['classes'].append({ 'class_name': x, 'confidence': float(outputs[0][xi]) })
#         xi += 1
#     return result


#     dataset_eval = Dataset(train_texts + valid_texts, train_labels + valid_labels)
#     dataload_eval = torch.utils.data.DataLoader(dataset_eval, batch_size=4, shuffle=True)

#     y_hats_all = []
#     y_true_all = []
#     for _valid_texts, _valid_labels in tqdm(dataload_eval):
#         outputs = predict_minibatch(model, _valid_texts)
#         y_hats = outputs['y_hats'].tolist()
#         y_true = _valid_labels.detach().cpu().numpy().tolist()   
#         y_hats_all += y_hats
#         y_true_all += y_true
#     # end

#     print('precision_score: {}'.format(precision_score(y_true_all, y_hats_all)))
#     print('recall_score: {}'.format(recall_score(y_true_all, y_hats_all)))
#     print('f1_score: {}'.format(f1_score(y_true_all, y_hats_all)))
# # end

In [8]:
# df = pd.read_csv(path_testset)

In [9]:
# df[:3]

Unnamed: 0,pc,pcc,log
0,comment,comment,timestamp 5776059 admission failure in path ho...
1,comment,comment,timestamp cpu 2 2156005 os fs os fs get mount ...
2,comment,comment,timestamp hex id debug id hex id vsan ctl get ...


In [11]:
model = BertClassifier(config_model)
model.load_bert_for_inference()
model.to(device)
model.load_state_dict(torch.load(model.model_file_path, map_location=torch.device(device)))

Loading vBERT config
Adding 768x2 classification layer


<All keys matched successfully>

In [12]:
dataset = pd.read_csv(path_testset)
dataset = dataset[:10]

In [21]:
max_length = model.max_string_length
classes = model.classes
tokenizer = model.tokenizer

prepared_set = Prepare(dataset, tokenizer, max_length, classes)
params_dataloader = {
    'batch_size': 2,
    'shuffle': False,
    'num_workers': 0
}
data_loader = DataLoader(prepared_set, **params_dataloader)
y_hats_all = []
y_true_all = []
ids_top3 = []
outputs_all = []

with torch.no_grad():
    for data in tqdm(data_loader):
#         ids_top3.append(data['ids'].cpu().numpy()[0][:3])
        input_ids = data['ids'].to(device)
        attention_mask = data['mask'].to(device)
        outputs = model(input_ids, attention_mask)
        outputs_all.append(outputs)
#         print(outputs)
        _, preds = torch.max(outputs, dim=1)
        y_hats_all.append(preds.cpu().numpy()[0])
        y_true_all.append(data['targets'][0])
    # end
# end

100%|██████████| 5/5 [00:00<00:00, 26.88it/s]


In [22]:
outputs_all[0]

tensor([[-1.8095,  1.9613],
        [ 2.8513, -2.6571]], device='cuda:0')

In [24]:
outputs_all[0].cpu().numpy()

array([[-1.8094643,  1.9612609],
       [ 2.851346 , -2.6570942]], dtype=float32)

In [26]:
import numpy as np
vectors_all = []
for outputs in outputs_all:
    for vector_predict in outputs.cpu().numpy():
        vectors_all.append(vector_predict.tolist())
    # end
# end
vectors_all

[[-1.8094643354415894, 1.9612609148025513],
 [2.851346015930176, -2.6570942401885986],
 [1.0022659301757812, -0.46435776352882385],
 [0.9951595664024353, -0.7257012128829956],
 [-0.6877875924110413, 0.6506273746490479],
 [1.5641168355941772, -1.6831454038619995],
 [1.7741361856460571, -1.6320698261260986],
 [1.0426920652389526, -0.3897593915462494],
 [0.8489823937416077, -0.6762673258781433],
 [2.16194486618042, -1.6360777616500854]]

In [19]:
prepared_set.classes

['safe', 'comment']

In [30]:
print('accuracy_score: {}'.format(accuracy_score(y_true_all, y_hats_all)))
print('precision_score: {}'.format(precision_score(y_true_all, y_hats_all, zero_division=1)))
print('recall_score: {}'.format(recall_score(y_true_all, y_hats_all, zero_division=1)))
print('f1_score: {}'.format(f1_score(y_true_all, y_hats_all, zero_division=1)))
# print(ids_top3)

100%|██████████| 12546/12546 [04:13<00:00, 49.50it/s]


accuracy_score: 0.9350390562729157
precision_score: 0.07934336525307797
recall_score: 0.29
f1_score: 0.12459720730397421


In [29]:
len(y_hats_all)

1000

In [31]:
with open('y_hats_all.txt', 'w+') as file:
    file.write(json.dumps([int(x) for x in y_hats_all]))

In [32]:
with open('y_true_all.txt', 'w+') as file:
    file.write(json.dumps([int(x) for x in y_true_all]))

In [33]:
print('accuracy_score: {}'.format(accuracy_score(y_true_all, y_hats_all)))
print('precision_score: {}'.format(precision_score(y_true_all, y_hats_all, zero_division=1, average='macro')))
print('recall_score: {}'.format(recall_score(y_true_all, y_hats_all, zero_division=1, average='macro')))
print('f1_score: {}'.format(f1_score(y_true_all, y_hats_all, zero_division=1, average='macro')))

accuracy_score: 0.9350390562729157
precision_score: 0.5336623724276393
recall_score: 0.6177442086505751
f1_score: 0.5454325798946923
