In [2]:
import pickle as pickle
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns


from transformers import AutoTokenizer

In [3]:
def preprocessing_dataset(dataset, label_type):
    """Turn the raw, header-less table into a DataFrame with named columns.

    Args:
        dataset: Table addressed by integer column labels. Columns 1-7 are
            copied through under descriptive names and column 8 holds the
            label name of each row.
        label_type: Mapping from label name to the value stored in the
            ``label`` column.

    Returns:
        pandas.DataFrame with the columns ``sentence``, ``entity_01``,
        ``entity_02``, ``label``, ``entity_01_start``, ``entity_01_end``,
        ``entity_02_start`` and ``entity_02_end`` (in that order).
    """
    # Rows named 'blind' are never looked up in `label_type`; they always
    # receive the fixed placeholder value 100.
    label = [
        100 if name == 'blind' else label_type[name]
        for name in dataset[8]
    ]

    named_columns = {
        'sentence': dataset[1],
        'entity_01': dataset[2],
        'entity_02': dataset[5],
        'label': label,
        'entity_01_start': dataset[3],
        'entity_01_end': dataset[4],
        'entity_02_start': dataset[6],
        'entity_02_end': dataset[7],
    }
    return pd.DataFrame(named_columns)

In [4]:
def load_data(dataset_dir, label_type_path='/opt/ml/input/data/label_type.pkl'):
    """Read a tab-separated dataset file and return it as a named-column DataFrame.

    Args:
        dataset_dir: Path of the header-less, tab-delimited file to read.
        label_type_path: Path of the pickle file holding the label mapping
            that is handed to ``preprocessing_dataset``. Defaults to the
            location this notebook has always used, so existing calls are
            unaffected.

    Returns:
        The DataFrame produced by ``preprocessing_dataset``.
    """
    # Load the label mapping.
    # NOTE: pickle.load can execute arbitrary code while unpickling; only
    # point `label_type_path` at a file you trust.
    with open(label_type_path, 'rb') as f:
        label_type = pickle.load(f)

    # Load the raw table; header=None gives integer column labels.
    raw_dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)

    # Preprocess the raw table into named columns.
    return preprocessing_dataset(raw_dataset, label_type)

In [5]:
# Load the training file into the DataFrame used by the cells below.
df = load_data(dataset_dir="/opt/ml/input/data/train/train_renew.tsv")

# kobert (kykim/bert-kor-base)

In [6]:
# Sentences to tokenize, and the tokenizer checkpoint examined in this section.
sentence = df['sentence'].tolist()
tokenizer = AutoTokenizer.from_pretrained("kykim/bert-kor-base")

In [7]:
# Tokenize every sentence in its own call, with special tokens added.
res = [
    tokenizer(text, return_tensors="pt", add_special_tokens=True)
    for text in sentence
]

# Map the ids back to token strings. Only the first row of `input_ids` is
# kept per call (each call above tokenized a single sentence).
encoded_sentences = [
    tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
    for encoding in res
]

In [1]:
# Display the token list produced for the first sentence.
encoded_sentences[0]

In [9]:
# Number of '[UNK]' tokens in each tokenized sentence, in sentence order.
unk_count_list = [
    sum(1 for token in tokens if token == '[UNK]')
    for tokens in encoded_sentences
]

# Store the counts next to their sentences.
df['UNK_count'] = unk_count_list

In [3]:
# Bar chart: how many sentences have each UNK count.
sns.countplot(data=df, x="UNK_count")

In [4]:
from collections import defaultdict

# Tally how many sentences share each UNK count.
unk_count = defaultdict(int)
for n_unk in df['UNK_count']:
    unk_count[n_unk] += 1

# One row per distinct count: "<UNK count><TAB><number of sentences>".
for n_unk, n_sentences in unk_count.items():
    print(n_unk, n_sentences, sep="\t")

# Multilingual BERT (bert-base-multilingual-cased)

In [25]:
# Sentences to tokenize, and the tokenizer checkpoint examined in this section.
sentence = df['sentence'].tolist()
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

In [26]:
# Tokenize every sentence in its own call, with special tokens added.
res = [
    tokenizer(text, return_tensors="pt", add_special_tokens=True)
    for text in sentence
]

# Map the ids back to token strings. Only the first row of `input_ids` is
# kept per call (each call above tokenized a single sentence).
encoded_sentences = [
    tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
    for encoding in res
]

In [27]:
# Number of '[UNK]' tokens in each tokenized sentence, in sentence order.
unk_count_list = [
    sum(1 for token in tokens if token == '[UNK]')
    for tokens in encoded_sentences
]

# Store the counts next to their sentences (replaces any earlier UNK_count column).
df['UNK_count'] = unk_count_list
    

In [5]:
# Bar chart: how many sentences have each UNK count.
sns.countplot(data=df, x="UNK_count")

In [6]:
from collections import defaultdict

# Tally how many sentences share each UNK count.
unk_count = defaultdict(int)
for n_unk in df['UNK_count']:
    unk_count[n_unk] += 1

# One row per distinct count: "<UNK count><TAB><number of sentences>".
for n_unk, n_sentences in unk_count.items():
    print(n_unk, n_sentences, sep="\t")

# XLM-RoBERTa (xlm-roberta-large)

In [18]:
# Sentences to tokenize, and the tokenizer checkpoint examined in this section.
sentence = df['sentence'].tolist()
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

In [21]:
# Keep the tokenizer's vocabulary (as returned by get_vocab()) for inspection;
# none of the cells shown here read `vocab`.
vocab = tokenizer.get_vocab()

In [19]:
# Tokenize every sentence in its own call, with special tokens added.
res = [
    tokenizer(text, return_tensors="pt", add_special_tokens=True)
    for text in sentence
]

# Map the ids back to token strings. Only the first row of `input_ids` is
# kept per call (each call above tokenized a single sentence).
encoded_sentences = [
    tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
    for encoding in res
]

In [20]:
# Number of '<unk>' tokens in each tokenized sentence, in sentence order.
unk_count_list = [
    sum(1 for token in tokens if token == '<unk>')
    for tokens in encoded_sentences
]

# Store the counts next to their sentences (replaces any earlier UNK_count column).
df['UNK_count'] = unk_count_list

In [7]:
# Bar chart: how many sentences have each UNK count.
sns.countplot(data=df, x="UNK_count")