In [1]:
import os
import pickle

In [2]:
# Directory holding the pickled dataset caches; joined with the file names below.
CACHE_PATH = "./cache"

# Cache file names for the three dataset splits inspected by this notebook.
TRAIN_CACHE_FN = "ner-train-data.cache"
VALID_CACHE_FN = "ner-valid-data.cache"
TEST_CACHE_FN = "ner-test-data.cache"

# Pickled label vocabulary; the notebook reads its "l2i" mapping.
LABEL_FN = "./vocab/ner.label"

# Length threshold against which each token sequence is compared.
# NOTE(review): presumably the model's maximum input length — confirm.
MAX_SEQ_LEN = 256

In [3]:
# Read the pickled label vocabulary into `data`.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point LABEL_FN at a file this project produced.
with open(LABEL_FN, mode="rb") as label_file:
    data = pickle.load(label_file)

In [4]:
# Collect the distinct entity types found in the label vocabulary.
# Only keys containing "-" are considered; everything after the FIRST "-" is
# taken as the entity type (presumably the keys are BIO-style tags such as
# "B-PS" — verify against the vocabulary builder).
# partition("-") strips just that leading prefix, so an entity type that itself
# contains a hyphen is kept intact instead of having its hyphens removed.
# sorted() makes the printed list stable across kernel restarts; the iteration
# order of a set of strings depends on per-process string hashing.
labels = sorted({key.partition("-")[2] for key in data["l2i"] if "-" in key})

print("Length of labels", len(labels))
print("Label List : ", labels)

Length of labels 15
Label List :  ['CV', 'TM', 'OG', 'FD', 'AM', 'LC', 'QT', 'MT', 'DT', 'PT', 'TI', 'AF', 'EV', 'TR', 'PS']


In [5]:
# Report sequence-length statistics and per-entity-type label frequencies for
# each dataset split.
#
# NOTE(review): this cell rebinds `data` and `labels`, replacing the values set
# by earlier cells. The names are kept unchanged in case later cells rely on
# them; re-run the earlier cells if the original values are needed again.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load cache files this project produced.

# Length above which a sequence is reported as "long" in the printed summary.
LONG_SEQ_THRESHOLD = 200

for cache_fn in [TRAIN_CACHE_FN, VALID_CACHE_FN, TEST_CACHE_FN]:

    print("Current file : " + cache_fn)

    with open(os.path.join(CACHE_PATH, cache_fn), "rb") as fp:
        data = pickle.load(fp)

    tokens, labels = data["tokens"], data["labels"]

    print("Length of tokens : ", len(tokens))

    # Measure every sequence once; all statistics below derive from this list.
    seq_len_buffer = [len(token) for token in tokens]

    # Number of sequences longer than MAX_SEQ_LEN. Computed but not printed,
    # matching the existing output of this cell.
    exceed_seq_len = sum(1 for seq_len in seq_len_buffer if seq_len > MAX_SEQ_LEN)

    # default=0 keeps an empty split from raising (and from reporting a
    # placeholder sentinel as the minimum length).
    max_seq_len = max(seq_len_buffer, default=0)
    min_seq_len = min(seq_len_buffer, default=0)

    # Guard the division so an empty split reports 0.0 instead of crashing.
    average_seq_len = sum(seq_len_buffer) / len(seq_len_buffer) if seq_len_buffer else 0.0

    print("Max Length : ", max_seq_len)
    print("Min Length : ", min_seq_len)
    print("Average Length : ", average_seq_len)

    over_200 = sum(1 for seq_len in seq_len_buffer if seq_len > LONG_SEQ_THRESHOLD)

    print(f"Over {LONG_SEQ_THRESHOLD} length : ", over_200)

    # Count how often each entity type occurs. Tags containing "-" are reduced
    # to the part after the first "-"; tags without one are counted as-is.
    label_dict = {}

    for label in labels:
        for tag in label:
            if "-" in tag:
                tag = tag.partition("-")[2]

            label_dict[tag] = label_dict.get(tag, 0) + 1

    print("Frequency of Each Labels : ", label_dict)
    print()

Current file : ner-train-data.cache
Length of tokens :  298921
Max Length :  198
Min Length :  0
Average Length :  17.701459582966738
Over 200 length :  0
Frequency of Each Labels :  {'LC': 91211, 'O': 4102262, 'OG': 178455, 'AF': 68222, 'CV': 246078, 'QT': 182911, 'TM': 47786, 'PS': 141356, 'DT': 133658, 'FD': 12209, 'AM': 12136, 'EV': 39768, 'TI': 17184, 'MT': 7814, 'TR': 7061, 'PT': 3227}

Current file : ner-valid-data.cache
Length of tokens :  37357
Max Length :  241
Min Length :  0
Average Length :  17.652514923575232
Over 200 length :  2
Frequency of Each Labels :  {'O': 511822, 'DT': 16677, 'OG': 22509, 'CV': 30699, 'AM': 1420, 'PS': 17612, 'AF': 8111, 'TI': 2052, 'LC': 11374, 'EV': 5065, 'TM': 6180, 'QT': 22122, 'PT': 402, 'FD': 1560, 'MT': 994, 'TR': 846}

Current file : ner-test-data.cache
Length of tokens :  37358
Max Length :  151
Min Length :  0
Average Length :  17.69492478184057
Over 200 length :  0
Frequency of Each Labels :  {'O': 515050, 'CV': 30694, 'LC': 11212, 'OG'