In [1]:
import os, errno
import json
import yaml
import argparse
import sys
import traceback
import time
import pandas as pd
import torch
import torch.utils.data as data_utils
import transformers
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# default constants

VBERT_MODEL = "vBERT-2020-Base"
CLASSIFICATION_LAYER_WIDTH = 768 if "Base" in VBERT_MODEL else 1024
MAX_STRING_LEN = 128

In [2]:
path_data_base = 'data'
name_testset = 'importance_detection_testset_200_12000.csv'
path_testset = os.path.join(path_data_base, name_testset)

In [3]:
path_model_base = 'models-instaml-finetune'
version_model = 'v1216'
name_file_config = '.model.json'
path_model = os.path.join(path_model_base, version_model)
path_config = os.path.join(path_model, name_file_config)

with open(path_config, 'r') as file:
    config_model = json.load(file)
# end

for k, v in config_model.get('bert').items():
    config_model[k] = v
# end

config_model['root'] = path_model

del config_model['allmetrics']
del config_model['metrics']

In [4]:
device = 'cpu'

In [5]:
class BertClassifier(torch.nn.Module):

    def __init__(self, config):
        super(BertClassifier, self).__init__()
        self.config = config
        self.model_dir = config.get("root", ".")
        self.model_name = config.get("name", VBERT_MODEL)
        self.classification_layer_width = config.get("input_size", CLASSIFICATION_LAYER_WIDTH)
        self.classes = config.get("classes", ["0", "1"])
        self.num_classes = config.get("output_size", len(self.classes))
        self.max_string_length = config.get("max_length", MAX_STRING_LEN)
        self.bert_config_file = self.model_dir + "/bert_config.json"
        self.model_file_path = self.model_dir + "/model.pt"
#        self.device = config.get("device", "cpu")
        self.device = 'cpu'
#         self.device = 'cuda'
        self.metrics = [ "accuracy", "precision", "recall", "F-score" ]

        # Load the vBERT vocabulary into the tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(self.model_dir)
    # end

    def load_bert_for_training(self):
        print('Loading vBERT model: ' + self.model_name)
        self.l1 = BertModel.from_pretrained(pretrained_model_name_or_path=self.config.bert_model_dir)
        print("Adding {}x{} classification layer".format(self.classification_layer_width, self.num_classes))
        self.classifier = torch.nn.Linear(self.classification_layer_width, self.num_classes)

    def load_bert_for_inference(self):
        print('Loading vBERT config')
        self.l1 = BertModel(BertConfig.from_pretrained(self.bert_config_file))
        print("Adding {}x{} classification layer".format(self.classification_layer_width, self.num_classes))
        self.classifier = torch.nn.Linear(self.classification_layer_width, self.num_classes)

    # Encode the input with vBERT, read output from the last layer of vBERT, and pass to the classification layer
    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
#         print(output_1.keys())
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0,:]
#         pooler = hidden_state[:, 0]
#         print(pooler.numpy()[0][:5])
        output = self.classifier(pooler)
        print(type(output))
        print(output)
        return output

#     def do_inference(self, data_loader, test_output=None):
#         model = self.eval()
#         predictions = []
#         real_values = []
#         with torch.no_grad():
#             for data in data_loader:
#                 input_ids = data['ids'].to(self.device)
#                 attention_mask = data['mask'].to(self.device)
#                 targets = data['targets'].to(self.device)
#                 outputs = model(input_ids, attention_mask)
#                 _, preds = torch.max(outputs, dim=1)
#                 predictions.extend(preds)
#                 real_values.extend(targets)
#                 if test_output != None:
#                     test_output.extend(outputs)
#         predictions = torch.stack(predictions).cpu()
#         real_values = torch.stack(real_values).cpu()

#         test_accu = 100 * accuracy_score(real_values, predictions)
#         test_precision, test_recall, test_fscore, ignore = precision_recall_fscore_support(real_values, predictions, average='macro')
#         test_precision *= 100
#         test_recall *= 100
#         test_fscore *= 100
#         metrics = [ test_accu, test_precision, test_recall, test_fscore ]
#         return predictions, real_values, metrics
#     # end
# end

In [6]:
class Prepare(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, classes):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.classes = classes
        
        self.index_class_label = {klass:index for index, klass in enumerate(self.classes)}
    # end

    def __getitem__(self, index):
        title = str(self.data['log'][index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        label = self.index_class_label[self.data['pcc'][index]]

#         target = torch.tensor(label, dtype=torch.long)
        target = label

        return {
            'ids': torch.tensor(ids, dtype=torch.long).to(device),
            'mask': torch.tensor(mask, dtype=torch.long).to(device),
            'targets': label
        }

    def __len__(self):
        return self.len
    # end
# end


In [7]:
from tqdm import tqdm

# def classify_text(model, txt, conf=None):
#     tokenizer = model.tokenizer
#     max_string_length = model.max_string_length
    
#     dataset = pd.DataFrame.from_csv(path_testset)
#     prepared_set = Prepare(dataset, tokenizer, max_string_length)
#     params = {'batch_size': 1,
#                 'shuffle': False,
#                 'num_workers': 0
#             }
#     loader = DataLoader(prepared_set, **params)
#     outputs = []
#     predictions, real_values, metrics = self.do_inference(loader, outputs)
#     classes = model.classes
#     result = { 'classes': [ ], 'text': txt, 'top_class': classes[int(predictions[0])], 'top_class_index': int(predictions[0]) }
#     xi = 0
#     for x in classes:
#         result['classes'].append({ 'class_name': x, 'confidence': float(outputs[0][xi]) })
#         xi += 1
#     return result


#     dataset_eval = Dataset(train_texts + valid_texts, train_labels + valid_labels)
#     dataload_eval = torch.utils.data.DataLoader(dataset_eval, batch_size=4, shuffle=True)

#     y_hats_all = []
#     y_true_all = []
#     for _valid_texts, _valid_labels in tqdm(dataload_eval):
#         outputs = predict_minibatch(model, _valid_texts)
#         y_hats = outputs['y_hats'].tolist()
#         y_true = _valid_labels.detach().cpu().numpy().tolist()   
#         y_hats_all += y_hats
#         y_true_all += y_true
#     # end

#     print('precision_score: {}'.format(precision_score(y_true_all, y_hats_all)))
#     print('recall_score: {}'.format(recall_score(y_true_all, y_hats_all)))
#     print('f1_score: {}'.format(f1_score(y_true_all, y_hats_all)))
# # end

In [8]:
df = pd.read_csv(path_testset)

In [9]:
df[:3]

Unnamed: 0,pc,pcc,log
0,comment,comment,timestamp 5776059 admission failure in path ho...
1,comment,comment,timestamp cpu 2 2156005 os fs os fs get mount ...
2,comment,comment,timestamp hex id debug id hex id vsan ctl get ...


In [10]:
model = BertClassifier(config_model)
model.load_bert_for_inference()
model.to(device)
model.load_state_dict(torch.load(model.model_file_path, map_location=torch.device(device)))

file models-instaml-finetune/v1216/config.json not found


Loading vBERT config
Adding 768x2 classification layer


<All keys matched successfully>

In [None]:
dataset = pd.read_csv(path_testset)
max_length = model.max_string_length
classes = model.classes
tokenizer = model.tokenizer

prepared_set = Prepare(dataset, tokenizer, max_length, classes)
params_dataloader = {
    'batch_size': 1,
    'shuffle': False,
    'num_workers': 0
}
data_loader = DataLoader(prepared_set, **params_dataloader)
y_hats_all = []
y_true_all = []
ids_top3 = []

with torch.no_grad():
    for data in tqdm(data_loader):
#         ids_top3.append(data['ids'].cpu().numpy()[0][:3])
        input_ids = data['ids'].to(device)
        attention_mask = data['mask'].to(device)
        outputs = model(input_ids, attention_mask)
#         print(outputs)
        _, preds = torch.max(outputs, dim=1)
        y_hats_all.append(preds.cpu().numpy()[0])
        y_true_all.append(data['targets'][0])
    # end
# end

print('accuracy_score: {}'.format(accuracy_score(y_true_all, y_hats_all)))
print('precision_score: {}'.format(precision_score(y_true_all, y_hats_all, zero_division=1)))
print('recall_score: {}'.format(recall_score(y_true_all, y_hats_all, zero_division=1)))
print('f1_score: {}'.format(f1_score(y_true_all, y_hats_all, zero_division=1)))
# print(ids_top3)


  0%|          | 1/12546 [00:03<10:50:37,  3.11s/it]

<class 'torch.Tensor'>
tensor([[-2.1091,  1.9549]])


  0%|          | 2/12546 [00:06<10:38:25,  3.05s/it]

<class 'torch.Tensor'>
tensor([[ 3.0900, -2.9522]])


  0%|          | 3/12546 [00:08<9:55:05,  2.85s/it] 

<class 'torch.Tensor'>
tensor([[ 1.1257, -0.5034]])


  0%|          | 4/12546 [00:10<8:44:40,  2.51s/it]

<class 'torch.Tensor'>
tensor([[ 1.0434, -0.3458]])


  0%|          | 5/12546 [00:13<9:13:05,  2.65s/it]

<class 'torch.Tensor'>
tensor([[-1.1902,  0.8427]])


  0%|          | 6/12546 [00:18<12:21:25,  3.55s/it]

<class 'torch.Tensor'>
tensor([[ 1.8433, -1.4177]])


  0%|          | 7/12546 [00:20<10:08:38,  2.91s/it]

<class 'torch.Tensor'>
tensor([[ 1.4764, -1.5382]])


  0%|          | 8/12546 [00:21<8:02:49,  2.31s/it] 

<class 'torch.Tensor'>
tensor([[ 1.2600, -0.7214]])


  0%|          | 9/12546 [00:22<6:55:08,  1.99s/it]

<class 'torch.Tensor'>
tensor([[ 0.6116, -0.1456]])


  0%|          | 10/12546 [00:24<6:10:51,  1.78s/it]

<class 'torch.Tensor'>
tensor([[ 2.1974, -2.0701]])


  0%|          | 11/12546 [00:25<5:34:49,  1.60s/it]

<class 'torch.Tensor'>
tensor([[ 2.3078, -2.4600]])


  0%|          | 12/12546 [00:26<5:14:46,  1.51s/it]

<class 'torch.Tensor'>
tensor([[ 0.9369, -0.9817]])


  0%|          | 13/12546 [00:27<4:42:52,  1.35s/it]

<class 'torch.Tensor'>
tensor([[ 1.3688, -0.8425]])


  0%|          | 14/12546 [00:28<4:26:53,  1.28s/it]

<class 'torch.Tensor'>
tensor([[ 2.1483, -2.1280]])


  0%|          | 15/12546 [00:29<4:15:59,  1.23s/it]

<class 'torch.Tensor'>
tensor([[ 3.7258, -3.8336]])


  0%|          | 16/12546 [00:31<4:14:06,  1.22s/it]

<class 'torch.Tensor'>
tensor([[ 0.6277, -0.4663]])


  0%|          | 17/12546 [00:32<4:06:38,  1.18s/it]

<class 'torch.Tensor'>
tensor([[-0.3886,  0.6661]])


  0%|          | 18/12546 [00:33<4:26:15,  1.28s/it]

<class 'torch.Tensor'>
tensor([[-0.7426,  0.6735]])


  0%|          | 19/12546 [00:34<4:10:00,  1.20s/it]

<class 'torch.Tensor'>
tensor([[ 1.0155, -0.1980]])


  0%|          | 20/12546 [00:36<4:41:09,  1.35s/it]

<class 'torch.Tensor'>
tensor([[-1.6190,  1.6832]])


  0%|          | 21/12546 [00:37<4:43:59,  1.36s/it]

<class 'torch.Tensor'>
tensor([[ 1.5935, -1.5733]])


  0%|          | 22/12546 [00:38<4:34:47,  1.32s/it]

<class 'torch.Tensor'>
tensor([[ 4.4069, -4.2766]])


  0%|          | 23/12546 [00:40<4:26:49,  1.28s/it]

<class 'torch.Tensor'>
tensor([[ 1.2966, -1.0754]])


  0%|          | 24/12546 [00:41<4:22:33,  1.26s/it]

<class 'torch.Tensor'>
tensor([[-0.1084,  0.4031]])


  0%|          | 25/12546 [00:42<4:18:43,  1.24s/it]

<class 'torch.Tensor'>
tensor([[ 3.0551, -2.9220]])


  0%|          | 26/12546 [00:43<4:22:28,  1.26s/it]

<class 'torch.Tensor'>
tensor([[ 1.6238, -1.7210]])


  0%|          | 27/12546 [00:45<4:24:48,  1.27s/it]

<class 'torch.Tensor'>
tensor([[ 2.4421, -2.6440]])


  0%|          | 28/12546 [00:46<4:38:31,  1.34s/it]

<class 'torch.Tensor'>
tensor([[0.0844, 0.1283]])


  0%|          | 29/12546 [00:47<4:17:44,  1.24s/it]

<class 'torch.Tensor'>
tensor([[ 1.0735, -1.2047]])


  0%|          | 30/12546 [00:48<4:21:56,  1.26s/it]

<class 'torch.Tensor'>
tensor([[ 0.8101, -1.0646]])


  0%|          | 31/12546 [00:50<4:18:28,  1.24s/it]

<class 'torch.Tensor'>
tensor([[ 4.1417, -3.6845]])


  0%|          | 32/12546 [00:52<5:25:07,  1.56s/it]

<class 'torch.Tensor'>
tensor([[ 1.1431, -1.3727]])


  0%|          | 33/12546 [00:53<4:56:10,  1.42s/it]

<class 'torch.Tensor'>
tensor([[ 2.3934, -2.2555]])


  0%|          | 34/12546 [00:54<4:29:47,  1.29s/it]

<class 'torch.Tensor'>
tensor([[ 1.3741, -1.4557]])


  0%|          | 35/12546 [00:55<4:36:49,  1.33s/it]

<class 'torch.Tensor'>
tensor([[-1.6378,  1.9177]])


  0%|          | 36/12546 [00:57<4:59:25,  1.44s/it]

<class 'torch.Tensor'>
tensor([[ 1.8334, -2.4273]])


  0%|          | 37/12546 [00:58<4:32:39,  1.31s/it]

<class 'torch.Tensor'>
tensor([[ 0.8727, -0.5921]])


  0%|          | 38/12546 [01:00<5:16:24,  1.52s/it]

<class 'torch.Tensor'>
tensor([[ 2.9368, -3.1556]])


  0%|          | 39/12546 [01:02<5:08:24,  1.48s/it]

<class 'torch.Tensor'>
tensor([[ 2.3122, -1.8568]])


  0%|          | 40/12546 [01:03<4:44:35,  1.37s/it]

<class 'torch.Tensor'>
tensor([[-0.2615,  0.3970]])


  0%|          | 41/12546 [01:04<4:34:30,  1.32s/it]

<class 'torch.Tensor'>
tensor([[ 1.4970, -1.2929]])


  0%|          | 42/12546 [01:06<5:41:25,  1.64s/it]

<class 'torch.Tensor'>
tensor([[ 1.7709, -1.6454]])


  0%|          | 43/12546 [01:08<5:45:15,  1.66s/it]

<class 'torch.Tensor'>
tensor([[ 2.2464, -1.9900]])


  0%|          | 44/12546 [01:13<9:01:37,  2.60s/it]

<class 'torch.Tensor'>
tensor([[-1.0964,  0.9258]])


  0%|          | 45/12546 [01:16<10:11:07,  2.93s/it]

<class 'torch.Tensor'>
tensor([[-1.6309,  1.7456]])


  0%|          | 45/12546 [01:17<5:58:22,  1.72s/it] 


In [None]:
len(y_hats_all)

In [None]:
# with open('y_hats_all.txt', 'w+') as file:
#     file.write(json.dumps([int(x) for x in y_hats_all]))

In [None]:
# with open('y_true_all.txt', 'w+') as file:
#     file.write(json.dumps([int(x) for x in y_true_all]))

In [None]:
# print('accuracy_score: {}'.format(accuracy_score(y_true_all, y_hats_all)))
# print('precision_score: {}'.format(precision_score(y_true_all, y_hats_all, zero_division=1, average='macro')))
# print('recall_score: {}'.format(recall_score(y_true_all, y_hats_all, zero_division=1, average='macro')))
# print('f1_score: {}'.format(f1_score(y_true_all, y_hats_all, zero_division=1, average='macro')))