In [1]:
import pickle as pickle
import os
import pandas as pd
import torch
from tqdm.auto import tqdm
import re

In [2]:
def preprocessing(text):
    """Reduce ``text`` to Hangul and ASCII letters separated by single spaces.

    Steps, in order:
      1. Strip leading/trailing whitespace.
      2. Replace every newline with a space.
      3. Replace every character that is not a Hangul syllable, a Hangul
         compatibility jamo (consonant or vowel) or an ASCII letter with a
         space. Digits, punctuation (including the hyphen) and other
         scripts such as CJK ideographs are therefore removed.
      4. Collapse runs of spaces into a single space.

    Because stripping happens before the substitutions, the result can still
    begin or end with one space when the stripped input started or ended
    with a removed character.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.strip()
    text = re.sub('\n', ' ', text)
    # The kept ranges are spelled as code-point escapes so that Unicode
    # normalization of this file cannot silently merge the jamo range
    # boundaries into a syllable and widen the class:
    #   U+AC00-U+D7A3  precomposed Hangul syllables
    #   U+3131-U+314E  compatibility consonant jamo
    #   U+314F-U+3163  compatibility vowel jamo
    text = re.sub('[^\uac00-\ud7a3\u3131-\u314e\u314f-\u3163a-zA-Z]', ' ', text)
    text = re.sub(' +', ' ', text)
    return text

In [1]:
# Pick the compute device once: CUDA when torch reports it as available,
# otherwise the CPU. The chosen device is reported on stdout.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [4]:
def preprocessing_dataset(dataset, label_type):
    """Turn the raw, positionally indexed frame into a named-column frame.

    Args:
        dataset: DataFrame whose columns are addressed by integer position.
            Columns 1-8 are read; column 8 holds the label name.
        label_type: Mapping from label name to its integer id.

    Returns:
        A DataFrame with the columns ``sentence``, ``entity_01``,
        ``entity_02``, ``label``, ``entity_01_start``, ``entity_01_end``,
        ``entity_02_start`` and ``entity_02_end``.

    Raises:
        KeyError: If a label name other than ``'blind'`` is missing from
            ``label_type``.
    """
    # 'blind' rows get the sentinel id 100 instead of a lookup
    # (presumably unlabeled test rows -- TODO confirm).
    encoded_labels = [
        100 if name == 'blind' else label_type[name]
        for name in dataset[8]
    ]

    named_columns = {
        'sentence': dataset[1],
        'entity_01': dataset[2],
        'entity_02': dataset[5],
        'label': encoded_labels,
        'entity_01_start': dataset[3],
        'entity_01_end': dataset[4],
        'entity_02_start': dataset[6],
        'entity_02_end': dataset[7],
    }
    return pd.DataFrame(named_columns)

In [5]:
def load_data(dataset_dir, label_type_path='/opt/ml/input/data/label_type.pkl'):
    """Load a tab-separated dataset file and normalize it with the label map.

    Args:
        dataset_dir: Path of the TSV file to read. Despite the name it is
            passed straight to ``pd.read_csv``, so it must be a file path.
            The file is read without a header row.
        label_type_path: Path of the pickled mapping from label name to
            integer id. Defaults to the location this notebook has always
            used, so existing single-argument calls behave as before.

    Returns:
        The DataFrame produced by ``preprocessing_dataset`` for the file.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing -- only point label_type_path at a trusted file.
    with open(label_type_path, 'rb') as f:
        label_type = pickle.load(f)

    # header=None keeps the integer column positions that
    # preprocessing_dataset indexes by.
    dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)

    return preprocessing_dataset(dataset, label_type)

In [7]:
# Read the training TSV into the named-column frame, then pull the integer
# label column out as an array.
train_dataset = load_data("/opt/ml/input/data/train/train_renew.tsv")
train_label = train_dataset.loc[:, 'label'].values

In [2]:
# Display the first training row as a sanity check on the column mapping.
train_dataset.iloc[0]

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Model setup: the classifier and its tokenizer come from the same checkpoint.
MODEL_NAME = "kykim/bert-kor-base"
NUM_LABELS = 42  # presumably the number of relation classes in label_type -- TODO confirm

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Register the entity marker tokens as additional special tokens.
# add_special_tokens returns how many tokens were actually new to the vocab.
added_token_num = tokenizer.add_special_tokens({"additional_special_tokens": ["[ENT]", "[/ENT]"]})

print(model.get_input_embeddings())
# The embedding matrix must grow to cover the new token ids. len(tokenizer)
# is the full vocabulary size including every added token, so it stays
# correct even if the tokenizer already carried added tokens, unlike
# tokenizer.vocab_size + added_token_num.
model.resize_token_embeddings(len(tokenizer))
print(model.get_input_embeddings())

In [4]:
# Tokenize the training frame with the extended tokenizer.
# NOTE(review): tokenized_dataset is not defined or imported in any cell
# visible here; on a fresh kernel this line raises NameError unless it is
# defined elsewhere -- confirm and define/import it before this cell.
tokenized_train = tokenized_dataset(train_dataset, tokenizer)

In [None]:
# Inspect the first tokenized example: its token strings and their ids.
# Presumably tokenized_train is a fast-tokenizer batch whose integer index
# yields an encoding exposing .tokens and .ids -- TODO confirm.
first_encoding = tokenized_train[0]
print(first_encoding.tokens)
print(first_encoding.ids)

In [None]:
# Fetch the tokenizer's vocabulary and print how many entries it has.
# NOTE(review): this is expected to include the added [ENT]/[/ENT] tokens and
# so match the resized embedding row count -- confirm against the output above.
vocab = tokenizer.get_vocab()
print(len(vocab))