In [1]:
# All pretrained model names are listed here:
# https://huggingface.co/transformers/pretrained_models.html
import json
import pandas as pd
import os
import torch
import transformers
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
# from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, EarlyStoppingCallback
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [2]:
# Evaluation settings for the DistilBERT sequence classifier.
# Text-classification checkpoints: https://huggingface.co/models?filter=text-classification

# Upper bound on tokens per document handed to the tokenizer.
max_length = 512

# CSV with the documents to evaluate (earlier dataset kept for reference).
# path_data = 'datasource/goscv_84_202206171000.csv'
path_data = 'datasource/goscv_115_202206171000+.csv'

# Directory checked for a locally saved model and its labels.json.
model_dir = 'model_10'

# Hub checkpoint name (earlier choice kept for reference).
# model_name = "bert-base-uncased"
model_name = "distilbert-base-uncased"

In [3]:
# Fast tokenizer for the configured checkpoint name; input text is lower-cased.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    model_name,
    do_lower_case=True,
)

In [4]:
def load_content(path_data, test_size=0):
    """Load documents and labels from a CSV file, optionally splitting them.

    Args:
        path_data: Path of a CSV file with a ``processed`` column (document
            text) and a ``target`` column (label).
        test_size: Forwarded to ``train_test_split`` as ``test_size`` when
            truthy. A falsy value (the default ``0``) disables splitting.

    Returns:
        A 4-element sequence ``(train_samples, test_samples, train_labels,
        test_labels)``. Without a split, the full document list fills both
        sample slots and the full label list fills both label slots.
    """
    df = pd.read_csv(path_data)

    samples = df['processed'].to_list()
    labels_str = df['target'].to_list()

    if not test_size:
        return samples, samples, labels_str, labels_str
    # end

    # test_size is keyword-only in scikit-learn; passed positionally it is
    # treated as a third array to split and the call fails.
    return train_test_split(samples, labels_str, test_size=test_size)
# end

In [5]:
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))
# Pick the checkpoint to load and the label names that size its classification head.
if os.path.exists(model_dir) and len(os.listdir(model_dir)) > 0:
    print('load model from local')
    model_info = model_dir
    # Label names stored next to the local model; the predicted class index
    # is used to look up a name in this list.
    with open(os.path.join(model_dir, 'labels.json'), 'r') as file:
        target_names = json.load(file)
    # end
else:
    print('load model from official')
    model_info = model_name
    # Without a local labels.json, target_names was previously undefined here
    # and the from_pretrained call below raised NameError on a fresh kernel.
    # NOTE(review): sorted unique dataset targets is an assumed label order --
    # confirm against the training notebook. The hub checkpoint's
    # classification head is not fine-tuned, so predictions from this branch
    # are only meaningful after training.
    target_names = sorted(pd.read_csv(path_data)['target'].dropna().unique().tolist())
# end

# Use the GPU when one is available instead of failing on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = DistilBertForSequenceClassification.from_pretrained(model_info, num_labels=len(target_names))
model = model.to(device)

load model from local


In [6]:
# Evaluate on the whole file: no hold-out split is requested, so the two
# duplicated return slots are discarded.
(samples, _, labels, _) = load_content(path_data, test_size=0)

In [7]:
# Classify every sample one at a time, recording the predicted label name and
# the softmax probability assigned to it.
list_conf_output = []
list_label_output = []

# Send inputs to whichever device already holds the model instead of assuming CUDA.
device = model.device
# Inference only: make sure dropout is disabled even if the kernel state changed.
model.eval()

for sample in samples:
    input_tokenized = tokenizer(sample, truncation=True, max_length=max_length, return_tensors='pt').to(device)
    with torch.no_grad():
        # Only the logits are read below, so hidden states and attentions
        # are not requested.
        out = model(**input_tokenized)
    # end

    probas_evaluate = torch.nn.functional.softmax(out.logits, dim=-1)
    # Single-sample batch: row 0 holds the class probabilities.
    conf_evaluate, answer_evaluate = probas_evaluate[0].max(dim=-1)

    list_conf_output.append(conf_evaluate.item())
    list_label_output.append(target_names[answer_evaluate.item()])
# end

In [8]:
# Share of predictions equal to the dataset label, as a whole-number
# percentage (int() truncates the fractional part).
int(sum(expected == predicted for expected, predicted in zip(labels, list_label_output)) / len(labels) * 100)

20

In [9]:
# Mean softmax probability of the predicted class over all samples.
sum(list_conf_output) / len(list_conf_output)

0.9957655082578244

In [10]:
# Fraction of predictions whose confidence is above 0.9.
sum(1 for conf in list_conf_output if conf > 0.9) / len(list_conf_output)

0.9826086956521739