In [1]:
import os
import torch
import transformers
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
# from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Checkpoint identifier passed to `from_pretrained` for the tokenizer; also the
# fallback model source when `model_dir` holds no files.
model_name = 'models-vbert-finetune'
# Local directory inspected for saved model files before loading the model.
model_dir = 'models-vbert-finetune'
# Token cap per document/sentence sample; the tokenizer truncates to this length.
max_length = 512

In [3]:
# Fast BERT tokenizer loaded from `model_name`, lower-casing the input text.
# (DistilBERT alternative: DistilBertTokenizerFast.from_pretrained with the same arguments.)
tokenizer = BertTokenizerFast.from_pretrained(
    model_name,
    do_lower_case=True,
)

In [4]:
def read_passages(path_data, test_size=0.2, max_rows=1000, random_state=None):
    """Load a labelled CSV and split it into train and validation parts.

    The ``log`` column supplies the documents and the ``pcc`` column the
    string labels. Each distinct label value is mapped to an integer id,
    ids being assigned in the sorted order of the label values. The number
    of documents and labels is printed before splitting.

    Args:
        path_data: Path of the CSV file to read.
        test_size: Forwarded to ``train_test_split``.
        max_rows: Keep only the first ``max_rows`` rows of the file; ``None``
            keeps every row. The default of 1000 matches the cap that used to
            be hard-coded here.
        random_state: Forwarded to ``train_test_split``. ``None`` (default)
            keeps the previous unseeded behaviour; pass an int to get the
            same split on every run.

    Returns:
        A pair ``(split, labels_list)``. ``split`` is the list returned by
        ``train_test_split``: train documents, validation documents, train
        label ids, validation label ids. ``labels_list`` holds the sorted
        distinct label values, so ``labels_list[i]`` is the label with id ``i``.
    """
    df = pd.read_csv(path_data)[:max_rows]
    documents = df['log'].to_list()
    labels_str = df['pcc'].to_list()

    # Sorting makes the label -> id mapping independent of row order.
    labels_list = sorted(set(labels_str))
    label_to_id = {label: idx for idx, label in enumerate(labels_list)}
    labels = [label_to_id[label_str] for label_str in labels_str]

    print(len(documents))
    print(len(labels))

    split = train_test_split(
        documents,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    return split, labels_list
# end

In [5]:
# Location of the labelled CSV, relative to the working directory.
dir_data = 'data'
name_data_file = 'importance_detection_testset_200_12000.csv'
path_data_relative = os.path.join(dir_data, name_data_file)

# Load and split the data; `target_names` receives the second element
# returned by `read_passages`.
(
    (train_texts, valid_texts, train_labels, valid_labels),
    target_names,
) = read_passages(path_data_relative)

1000
1000


In [6]:
# Use the local checkpoint directory only when it exists and contains files.
# The isdir guard lets a missing directory take the fallback branch instead of
# raising FileNotFoundError from os.listdir.
if os.path.isdir(model_dir) and len(os.listdir(model_dir)) > 0:
    print('load model from local')
    model_info = model_dir
else:
    print('load model from official')
    model_info = model_name

# model = DistilBertForSequenceClassification.from_pretrained(model_info)
model = BertForSequenceClassification.from_pretrained(model_info)
# Run on the GPU when one is available.
if torch.cuda.is_available():
    model = model.cuda()

load model from local


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [8]:
def predict_minibatch(model, lines_inputs):
    """Classify one batch of raw texts and return the results as NumPy arrays.

    Tokenization uses the notebook-level ``tokenizer`` and ``max_length``;
    inputs are padded to the longest sample in the batch and truncated to
    ``max_length`` tokens.

    Args:
        model: Model invoked as ``model(**inputs, output_hidden_states=True)``;
            its output must expose ``logits`` and ``hidden_states``.
        lines_inputs: Iterable of input strings forming the batch.

    Returns:
        dict mapping names to NumPy arrays, one row per input text:
            ``y_preds``   softmax of the logits over the last axis,
            ``y_hats``    argmax of the logits over the last axis,
            ``input_ids`` the padded token ids fed to the model,
            ``ntok``      per-row sum of the attention mask,
            ``cls_emb``   last entry of ``hidden_states`` at the first token position.
    """
    inputs = tokenizer.batch_encode_plus(
        list(lines_inputs),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Follow the model's own device rather than CUDA availability, so a model
    # kept on the CPU still works on a CUDA-capable machine.
    device = next(model.parameters()).device
    for key in inputs:
        inputs[key] = inputs[key].to(device)
    # end

    with torch.no_grad():  # remove this if you need gradients.
        # Attention maps are not part of the returned dict, so they are not requested.
        out = model(**inputs, output_hidden_states=True)
    # end

    batched_outputs = {
        "y_preds": torch.nn.functional.softmax(out.logits, dim=-1),
        "y_hats": torch.argmax(out.logits, dim=-1),
        "input_ids": inputs["input_ids"],
        "ntok": torch.sum(inputs["attention_mask"], dim=1),
        "cls_emb": out.hidden_states[-1][:, 0],  # last layer, first token
    }

    # Return as NumPy for further processing.
    return {k: v.cpu().numpy() for k, v in batched_outputs.items()}
# end

In [9]:
class Dataset(torch.utils.data.Dataset):
    """Index-addressable pairing of texts with their labels."""

    def __init__(self, texts, labels):
        """Keep references to the parallel ``texts`` and ``labels`` sequences."""
        self.labels = labels
        self.texts = texts
    # end

    def __len__(self):
        """Number of samples, taken from the length of ``labels``."""
        return len(self.labels)
    # end

    def __getitem__(self, idx):
        """Return ``(text, label)`` for ``idx``; the label is a NumPy array."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)
    # end

    def classes(self):
        """Return the stored labels object unchanged."""
        return self.labels
    # end

    def get_batch_texts(self, idx):
        """Return whatever ``texts[idx]`` yields."""
        return self.texts[idx]
    # end

    def get_batch_labels(self, idx):
        """Return ``labels[idx]`` wrapped in ``np.array``."""
        return np.array(self.labels[idx])
    # end

In [10]:
# Sanity check: number of labels in the validation split.
print(len(valid_labels))

200


In [11]:
from tqdm import tqdm

# NOTE(review): the model is scored on train + valid combined, not on the
# held-out split alone — confirm this is intended before reporting the numbers.
dataset_eval = Dataset(train_texts + valid_texts, train_labels + valid_labels)
# Fixed order: the metrics compare predictions and labels position by position,
# so shuffling added nothing and only made the printed lists differ between runs.
dataload_eval = torch.utils.data.DataLoader(dataset_eval, batch_size=4, shuffle=False)

y_hats_all = []
y_true_all = []
for batch_texts, batch_labels in tqdm(dataload_eval):
    outputs = predict_minibatch(model, batch_texts)
    y_hats_all.extend(outputs['y_hats'].tolist())
    y_true_all.extend(batch_labels.tolist())
# end

print(y_hats_all)
print(y_true_all)
# precision/recall/f1 use scikit-learn's default averaging; zero_division=1
# scores an undefined ratio as 1 instead of warning.
print('accuracy_score: {}'.format(accuracy_score(y_true_all, y_hats_all)))
print('precision_score: {}'.format(precision_score(y_true_all, y_hats_all, zero_division=1)))
print('recall_score: {}'.format(recall_score(y_true_all, y_hats_all, zero_division=1)))
print('f1_score: {}'.format(f1_score(y_true_all, y_hats_all, zero_division=1)))

100%|██████████| 250/250 [00:10<00:00, 24.36it/s]

[1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 


