In [13]:
# !pip install konlpy

In [10]:
# !apt-get install g++ openjdk-8-jdk python3-dev python3-pip curl -y

In [12]:
# !python3 -m pip install --upgrade pip
# !python3 -m pip install konlpy

In [2]:
# !apt-get install curl git
# !bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [1]:
from konlpy.tag import Mecab

# Instantiate KoNLPy's Mecab tagger (relies on the mecab.sh install step
# that is commented out in an earlier cell).
mecab = Mecab()

In [3]:
import site
# Print the interpreter's global site-packages directories.
print(site.getsitepackages())

## load_data

In [3]:
import pickle as pickle
import pandas as pd
import torch

In [66]:
# Dataset wrapper.
class RE_Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with one label per sample.

    Args:
        tokenized_dataset: Mapping from feature name to an indexable
            container (tensor, array or list). ``.items()`` is iterated and
            every value is indexed by sample position.
        labels: Indexable sequence of labels, one per sample.
    """

    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a fresh tensor that owns its own memory."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning on
            # every call; detach().clone() is the equivalent, warning-free copy.
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of per-sample tensors plus a ``'labels'`` entry."""
        item = {
            key: self._to_tensor(val[idx])
            for key, val in self.tokenized_dataset.items()
        }
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, defined by the number of labels."""
        return len(self.labels)

In [67]:
def preprocessing_dataset(dataset, label_type):
    """Reduce the raw header-less table to sentence, entity and label columns.

    Args:
        dataset: Frame whose integer-named columns 1, 2, 5 and 8 are read as
            sentence, first entity, second entity and relation name.
        label_type: Mapping from relation name to integer class id.

    Returns:
        DataFrame with columns ``sentence``, ``entity_01``, ``entity_02``
        and ``label``.
    """
    print(type(dataset))
    # Rows whose relation is 'blind' get the placeholder id 100
    # (presumably hidden test labels -- TODO confirm); others are looked up.
    label = [
        100 if relation == 'blind' else label_type[relation]
        for relation in dataset[8]
    ]
    columns = {
        'sentence': dataset[1],
        'entity_01': dataset[2],
        'entity_02': dataset[5],
        'label': label,
    }
    return pd.DataFrame(columns)

In [5]:
def load_data(dataset_dir, label_type_path='/opt/ml/input/data/label_type.pkl'):
    """Load a tab-separated dataset file and attach integer labels.

    Args:
        dataset_dir: Path to a header-less, tab-delimited file.
        label_type_path: Path to the pickled relation-name -> id mapping.
            Defaults to the previously hard-coded location, so existing
            single-argument calls behave exactly as before.

    Returns:
        The frame produced by ``preprocessing_dataset``.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only point this at label files from a trusted source.
    with open(label_type_path, 'rb') as f:
        label_type = pickle.load(f)
    # Load the raw dataset (no header row).
    dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
    # Preprocess into sentence / entity / label columns.
    return preprocessing_dataset(dataset, label_type)

In [41]:
def tokenized_dataset(dataset, tokenizer):
    """Encode each sample as a text pair: joined entities, then the sentence.

    The first segment is ``entity_01 + '[SEP]' + entity_02``; the second
    segment is the sentence itself.

    Args:
        dataset: Table-like object with ``entity_01``, ``entity_02`` and
            ``sentence`` columns.
        tokenizer: Callable taking two lists of strings plus the keyword
            options passed below.

    Returns:
        Whatever ``tokenizer`` returns for the whole batch.
    """
    entity_pairs = [
        first + '[SEP]' + second
        for first, second in zip(dataset['entity_01'], dataset['entity_02'])
    ]
    sentences = list(dataset['sentence'])
    encode_options = dict(
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=100,
        add_special_tokens=True,
    )
    return tokenizer(entity_pairs, sentences, **encode_options)

In [42]:
# Load the training split and pull the 'label' column out as an array.
train_dataset = load_data("/opt/ml/input/data/train/train_renew.tsv")
train_label = train_dataset['label'].values

<class 'pandas.core.frame.DataFrame'>


In [1]:
# train_label[:10]

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Model setup: sequence-classification model with 42 output labels.
model = AutoModelForSequenceClassification.from_pretrained("kykim/bert-kor-base", num_labels = 42)

# Tokenizer setup: loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("kykim/bert-kor-base")

In [5]:
vocab = tokenizer.get_vocab()

# List the bracketed tokens (e.g. [CLS], [SEP]) with their ids, skipping
# entries that start with "[un". startswith() replaces the original k[0]
# check so an empty-string token cannot raise IndexError.
for k, v in vocab.items():
    if k.startswith('[') and not k.startswith("[un"):
        print(k,v)

# print(vocab['[CLS]'])
# print(vocab['[SEP]'])

# Total number of entries in the vocabulary mapping.
print(len(vocab))

In [46]:
# Adding special tokens
# https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html
# special_tokens_dict = {'cls_token': '<CLS>'}
# num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# vocab

In [20]:
# Register '[ENT]' as an extra token, report how many tokens were actually
# added, then resize the model's token embeddings to the tokenizer length.
num_added_toks = tokenizer.add_tokens(['[ENT]'])
print('We have added', num_added_toks, 'tokens')
model.resize_token_embeddings(len(tokenizer))

In [19]:
# Re-read the vocabulary and print its size and the id of '[CLS]'.
vocab = tokenizer.get_vocab()
print(len(vocab))
print(vocab['[CLS]'])

In [6]:
# Reload a fresh tokenizer and model from the checkpoint, so tokens added to
# the previous objects are not carried over.
tokenizer = AutoTokenizer.from_pretrained("kykim/bert-kor-base")
# num_labels=42 matches the other model loads in this notebook; without it
# the classification head falls back to the library default of 2 labels.
model = AutoModelForSequenceClassification.from_pretrained("kykim/bert-kor-base", num_labels = 42)

# Add two example tokens and resize the embeddings to the tokenizer length.
num_added_toks = tokenizer.add_tokens(['ex1', 'ex2'])
print('We have added', num_added_toks, 'tokens')
model.resize_token_embeddings(len(tokenizer))

In [54]:
# Tokenize the training frame with the tokenizer currently in scope.
tokenized_train = tokenized_dataset(train_dataset, tokenizer)

In [7]:
# Show which container type the tokenizer returned.
print(type(tokenized_train))

In [8]:
# Inspect the first encoded sample: its tokens, then its ids.
# The original leading bare `tokenized_train[0]` was dropped: it was not the
# cell's last expression, so it displayed nothing.
print(tokenized_train[0].tokens)
print(tokenized_train[0].ids)

In [58]:
# Re-tokenize the training frame (repeat of the earlier cell) so the result
# reflects whichever tokenizer is currently bound to `tokenizer`.
tokenized_train = tokenized_dataset(train_dataset, tokenizer)

In [9]:
# Inspect the first encoded sample again after re-tokenizing.
# The original leading bare `tokenized_train[0]` was dropped: it was not the
# cell's last expression, so it displayed nothing.
print(tokenized_train[0].tokens)
print(tokenized_train[0].ids)

In [18]:
# Refresh the cached vocabulary mapping from the current tokenizer.
vocab = tokenizer.get_vocab()

In [10]:
# Peek at only the first (name, value) pair of the tokenizer output.
for k, v in tokenized_train.items():
    print(k, v, sep='\n')
    break

In [68]:
# Wrap the tokenizer output and the label array in the dataset class.
RE_train_dataset = RE_Dataset(tokenized_train, train_label)

In [11]:
# Display the first sample dict (feature tensors plus 'labels').
RE_train_dataset[0]

In [12]:
# Show the type of the dataset object.
print(type(RE_train_dataset))

## 스페셜 토큰 추가하기

In [13]:
# Reload both objects from the checkpoint before adding special tokens.

# Model setup: sequence-classification model with 42 output labels.
model = AutoModelForSequenceClassification.from_pretrained("kykim/bert-kor-base", num_labels = 42)

# Tokenizer setup: fresh instance, without tokens added to the previous one.
tokenizer = AutoTokenizer.from_pretrained("kykim/bert-kor-base")

In [14]:
# Print the tokenizer's vocab_size attribute before any tokens are added.
print(tokenizer.vocab_size)


In [73]:
# Register the entity markers as additional special tokens (the newer way of
# adding special tokens) and keep the number of tokens that were added.
added_token_num = tokenizer.add_special_tokens(
    {"additional_special_tokens": ["[ENT]", "[/ENT]"]}
)

In [15]:
# Print the size of the full vocabulary mapping after adding the tokens.
print(len(tokenizer.get_vocab()))

In [16]:
# Show the input embedding layer before and after resizing.
print(model.get_input_embeddings())
# The embedding layer must grow to cover the newly added tokens.
# len(tokenizer) counts base plus added tokens, so this stays correct even
# when the add-token cell is re-run (it then reports 0 newly added tokens,
# and `vocab_size + added_token_num` would shrink the embedding again).
model.resize_token_embeddings(len(tokenizer))
print(model.get_input_embeddings())

In [17]:
# Look up the ids assigned to the two newly added special tokens.
vocab = tokenizer.get_vocab()
print(vocab['[ENT]'])
print(vocab['[/ENT]'])