In [1]:
# The full list of pretrained model names is available here:
# https://huggingface.co/transformers/pretrained_models.html
import json
import pandas as pd
import os
import torch
import transformers
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
# from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, EarlyStoppingCallback
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [2]:
# Inference configuration.
#
# Checkpoint name: DistilBERT, base size, uncased ("bert-base-uncased" is the BERT alternative).
# Other text-classification checkpoints: https://huggingface.co/models?filter=text-classification
model_name = "distilbert-base-uncased"

# Directory checked for a locally saved model before falling back to model_name
# (an earlier variant used 'model_esxdeploy_filter_distilbert_1').
model_dir = "model_esxdeploy_filter_distilbert__nonumber_1"

# JSON file holding the entries to classify.
path_data = "datasource/esxdeploy_20220512_ancher_ri_lines_all_nonumber.json"

# Upper bound on the tokenized length passed to the tokenizer.
max_length = 512

# Class names indexed by predicted class id: 0 -> 'irrelevant', 1 -> 'relevant'.
target_names = ["irrelevant", "relevant"]

In [3]:
# Build the DistilBERT fast tokenizer from the checkpoint named by model_name,
# explicitly requesting lower-casing of the input text.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    model_name,
    do_lower_case=True,
)

In [4]:
def load_content(path_data):
    """Read a JSON file and return its parsed content.

    Args:
        path_data: Path of the JSON file to read.

    Returns:
        The object decoded by ``json.load``. The rest of the notebook calls
        ``.values()`` on the result, so it expects a dict at the top level.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8 by convention; pin the encoding so the result does not
    # depend on the platform's default locale encoding.
    with open(path_data, 'r', encoding='utf-8') as file:
        dict_filename_content = json.load(file)
    # end

    return dict_filename_content
# end

In [5]:
# Choose where the weights come from: the local directory model_dir when it is a
# non-empty directory, otherwise the pretrained checkpoint named by model_name.
# os.path.isdir (rather than os.path.exists) keeps os.listdir from raising when
# model_dir happens to be a regular file.
if os.path.isdir(model_dir) and len(os.listdir(model_dir)) > 0:
    print('load model from local')
    model_info = model_dir
else:
    print('load model from official')
    model_info = model_name
# end

# One output logit per entry of target_names.
model = DistilBertForSequenceClassification.from_pretrained(model_info, num_labels=len(target_names))

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

load model from official


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

In [6]:
# Parse the JSON input file referenced by path_data.
dict_filename_content = load_content(path_data=path_data)

In [7]:
# Sanity check: number of top-level entries in the loaded JSON content.
print(len(dict_filename_content))

103


In [8]:
# Classify every (anchor, document) pair and collect the annotated entries.
#
# For each entry the last element of 'lines_target' is taken as the anchor and the
# preceding elements are the documents scored against it. Each output entry gets:
#   'lines_conf_raw'  - per-document list of softmax probabilities,
#   'lines_label'     - per-document predicted name from target_names,
#   'ancher_from_doc' - the anchor line,
#   'lines_target'    - the documents without the anchor.
contents_predicted = []

# Send inputs to the device the model's weights are on instead of hard-coding 'cuda'.
device = next(model.parameters()).device

for content_raw in dict_filename_content.values():
    # Annotate a shallow copy so dict_filename_content is left untouched. Mutating
    # the loaded entry in place would strip one more trailing line from
    # 'lines_target' (and pick a wrong anchor) every time this cell is re-run.
    content = dict(content_raw)
    content['lines_conf_raw'] = []
    content['lines_label'] = []

    lines_all = content_raw['lines_target']
    ancher = lines_all[-1]
    documents = lines_all[:-1]
    content['ancher_from_doc'] = ancher
    content['lines_target'] = documents

    for document in documents:
        pair_sample = (ancher, document)

        # NOTE(review): the pair is handed over as one tuple rather than as separate
        # text / text_pair arguments; confirm this matches how the training inputs
        # were tokenized before changing it.
        input_tokenized = tokenizer.encode_plus(pair_sample, padding=True, truncation=True, max_length=max_length, return_tensors='pt').to(device)
        with torch.no_grad():
            # Only the logits are read below, so hidden states and attention maps
            # are not requested.
            out = model(**input_tokenized)
        # end

        # Probabilities for the single sample in the batch.
        probas_evaluate = torch.nn.functional.softmax(out.logits, dim=-1)[0]
        answer_evaluate = int(probas_evaluate.argmax())

        content['lines_conf_raw'].append(probas_evaluate.tolist())
        content['lines_label'].append(target_names[answer_evaluate])
    # end

    contents_predicted.append(content)
# end

In [9]:
# Persist the annotated entries as JSON indented by 4 spaces, serializing
# straight into the open file handle.
with open('datasource/esxdeploy_20220512_ancher_ri_lines_all_nonumber_predicted_20220707.json', "w+") as file_out:
    json.dump(contents_predicted, file_out, indent=4)
# end