In [1]:
import os
import pickle

In [4]:
CACHE_PATH = "./cache"

TRAIN_CACHE_FN = "train-ner-koelectra-data.cache"
VALID_CACHE_FN = "valid-ner-koelectra-data.cache"
TEST_CACHE_FN = "test-ner-koelectra-data.cache"

LABEL_FN = "./label/ner.label"

MAX_SEQ_LEN = 256

In [5]:
with open(LABEL_FN, "rb") as fp:
    data = pickle.load(fp)

In [6]:
labels = []

for key, _ in data["l2i"].items():
    if "-" in key:
        label = "".join(key.split("-")[1:])

        labels.append(label)
    
labels = list(set(labels))

print("Length of labels", len(labels))
print("Label List : ", labels)

Length of labels 15
Label List :  ['EV', 'OG', 'AM', 'FD', 'TM', 'TI', 'TR', 'CV', 'MT', 'LC', 'PT', 'AF', 'DT', 'PS', 'QT']


In [14]:
for cache_fn in [TRAIN_CACHE_FN, VALID_CACHE_FN, TEST_CACHE_FN]:
    
    print("Current file : " + cache_fn)
    
    with open(os.path.join(CACHE_PATH, cache_fn), "rb") as fp:
        data_list = pickle.load(fp)
    
    len_tokens, len_labels = [], []
    unk_count = 0
    
    for data in data_list:
        tokens, labels = data["tokens"], data["labels"]
        
        len_tokens.append(len(tokens))
        len_labels.append(len(labels))
        
        if "[UNK]" in tokens:
            unk_count += 1
        
    print(max(len_tokens))
    print(unk_count, len(len_tokens))
    

Current file : train-ner-koelectra-data.cache
211
833 164251
Current file : valid-ner-koelectra-data.cache
183
102 20528
Current file : test-ner-koelectra-data.cache
200
101 20545


In [1]:
from transformers import AutoTokenizer

In [2]:
electra_tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

In [3]:
electra_tokenizer.cls_token_id

2

In [4]:
electra_tokenizer.convert_tokens_to_ids(['[CLS]', '한국어', 'EL', '##EC', '##TRA', '##를', '공유', '##합니다', '.', '[SEP]'])

[2, 11229, 29173, 13352, 25541, 4110, 7824, 17788, 18, 3]

In [5]:
roberta_tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

In [6]:
roberta_tokenizer.cls_token_id

0

In [7]:
roberta_tokenizer.convert_tokens_to_ids(['[CLS]', '한국어', 'EL', '##EC', '##TRA', '##를', '공유', '##합니다', '.', '[SEP]'])

[0, 9187, 23565, 10990, 20707, 2138, 5194, 11800, 18, 2]