In [2]:
import os
import re
import copy
import json
import logging

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# NOTE(review): absolute, user-specific Drive path. Later cells use paths
# relative to this directory (e.g. 'data/label.txt'), so it must be adjusted
# when the notebook is run from another account or machine.
%cd /content/drive/MyDrive/Colab Notebooks/공모전/NER-distilkobert

/content/drive/MyDrive/Colab Notebooks/공모전/NER-distilkobert


In [None]:
# List the contents of the data/ directory that later cells read from.
!ls data

label.txt  test.tsv  train.tsv


In [82]:
# labels list
def get_labels(file_path):
    """Read the label vocabulary from a text file, one label per line.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        list[str]: Every line of the file with surrounding whitespace
        stripped, in file order.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    with open(file_path, 'r', encoding='utf-8') as label_file:
        return [label.strip() for label in label_file]

# id_label: position in the list is the integer id of the label.
# label_id: inverse lookup from label name to that integer id.
id_label = get_labels('data/label.txt')
label_id = dict(zip(id_label, range(len(id_label))))

In [83]:
import re
import pandas as pd
from pathlib import Path

def read_naverner_split(file_path, label_id):
    """Parse a tab-separated NER file into parallel token and tag-id lists.

    Each non-blank line must hold exactly two tab-separated fields:
    whitespace-separated tokens, then whitespace-separated label names.
    Blank lines are skipped, and an empty file yields two empty lists.

    Args:
        file_path: Path (``str`` or ``Path``) to the UTF-8 encoded TSV file.
        label_id: Mapping from label name to integer id.

    Returns:
        tuple[list[list[str]], list[list[int]]]: ``(token_docs, tag_docs)``
        with one inner list per kept line, in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly one tab.
        KeyError: If a label name is missing from ``label_id``.
    """
    file_path = Path(file_path)

    # Decode explicitly as UTF-8 (as get_labels does) so the Korean text is
    # read the same way regardless of the platform's default encoding.
    raw_text = file_path.read_text(encoding='utf-8').strip()

    token_docs = []
    tag_docs = []
    for doc in raw_text.split('\n'):
        # Blank separator lines (and a completely empty file) carry no example.
        if not doc.strip():
            continue

        fields = doc.split('\t')
        if len(fields) != 2:
            raise ValueError(
                'Expected "<tokens>\\t<labels>" but got {} tab-separated field(s): {!r}'
                .format(len(fields), doc))

        token, tag = fields
        token_docs.append(token.split())
        tag_docs.append([label_id[label] for label in tag.split()])

    return token_docs, tag_docs

In [84]:
# Parse the training file into per-sentence token lists and label-id lists.
texts, tags = read_naverner_split('data/train.tsv', label_id)

In [85]:
# Spot-check one parsed example: tokens on the first line, label ids on the
# second (index 777 is presumably an arbitrary pick).
print(texts[777], tags[777], sep='\n')

['●여자', '싱글', '쇼트프로그램', '4무사뎀바(미국)', '13.18점', '13윤예지(과천중)', '26.36']
[1, 1, 1, 18, 18, 18, 18]


In [86]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sentences for validation. The seed is fixed so the
# partition is identical after a kernel restart.
train_texts, val_texts, train_tags, val_tags = train_test_split(
    texts, tags, test_size=.2, random_state=42)

In [87]:
# Collect every label id that occurs in the data and build a dense
# tag <-> index mapping. Sorting first makes the mapping independent of set
# iteration order, so it is the same on every run.
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: idx for idx, tag in enumerate(sorted(unique_tags))}
id2tag = {idx: tag for tag, idx in tag2id.items()}

In [88]:
!pip3 install -q sentencepiece transformers

In [89]:
from tokenization_kobert import KoBertTokenizer
# Load the 'monologg/kobert' vocabulary through the KoBertTokenizer class
# imported from tokenization_kobert.
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
# Direct batch encoding through the tokenizer call is disabled; the two lines
# below are kept for reference only.
# train_encodings = tokenizer(train_texts, is_split_into_words=True, padding=True, truncation=True)
# val_encodings = tokenizer(val_texts, is_split_into_words=True, padding=True, truncation=True)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


In [94]:
def tokenize(texts, tags, tokenizer,
             max_seq_len=50,
             pad_token_label_id=-100,
             cls_token_segment_id=0,
             pad_token_segment_id=0,
             sequence_a_segment_id=0,
             mask_padding_with_zero=True,
             verbose=False):
    """Turn word-level NER examples into fixed-length model inputs.

    Every word is split with ``tokenizer.tokenize``. The first piece of a
    word keeps the word's label; the remaining pieces, the [CLS]/[SEP]
    positions and the padding all receive ``pad_token_label_id``.

    Args:
        texts: Iterable of examples, each a sequence of words.
        tags: Iterable of per-word labels, parallel to ``texts``.
        tokenizer: Object exposing ``cls_token``, ``sep_token``, ``unk_token``,
            ``pad_token_id``, ``tokenize(word)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_seq_len: Length of every returned sequence, special tokens included.
        pad_token_label_id: Label used for positions that carry no word label.
        cls_token_segment_id: Segment id of the [CLS] position.
        pad_token_segment_id: Segment id of padded positions.
        sequence_a_segment_id: Segment id of word pieces and [SEP].
        mask_padding_with_zero: If true, real tokens get mask 1 and padding 0;
            otherwise the two values are swapped.
        verbose: If true, print every 5000th example while processing.

    Returns:
        tuple[list[dict], list[list[int]]]: ``(encodings, labels)``. Each
        encoding has the keys ``input_ids``, ``attention_mask`` and
        ``token_type_ids``; every list has length ``max_seq_len``.
    """
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    # Attention-mask values for real positions and for padded positions.
    if mask_padding_with_zero:
        real_flag, pad_flag = 1, 0
    else:
        real_flag, pad_flag = 0, 1

    # Room left for word pieces once [CLS] and [SEP] are accounted for.
    piece_budget = max_seq_len - 2

    encodings, labels = [], []
    for example_no, (words, word_labels) in enumerate(zip(texts, tags)):
        if verbose and example_no % 5000 == 0:
            print("Writing example {} of {}".format(example_no, len(tags)))
            print(words, word_labels, sep='\n')

        # Split each word into pieces; only the first piece keeps the label.
        pieces, piece_labels = [], []
        for word, slot_label in zip(words, word_labels):
            # An empty tokenization (badly encoded word) becomes a single [UNK].
            sub_tokens = tokenizer.tokenize(word) or [unk_token]
            pieces.extend(sub_tokens)
            piece_labels.append(int(slot_label))
            piece_labels.extend([pad_token_label_id] * (len(sub_tokens) - 1))

        # Drop whatever does not fit next to the two special tokens.
        if len(pieces) > piece_budget:
            pieces = pieces[:piece_budget]
            piece_labels = piece_labels[:piece_budget]

        # Frame the sequence as [CLS] pieces... [SEP].
        framed_tokens = [cls_token] + pieces + [sep_token]
        label_ids = [pad_token_label_id] + piece_labels + [pad_token_label_id]
        token_type_ids = [cls_token_segment_id] + [sequence_a_segment_id] * (len(pieces) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(framed_tokens)
        attention_mask = [real_flag] * len(input_ids)

        # Right-pad every feature up to max_seq_len.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + [pad_token_id] * padding_length
        attention_mask = attention_mask + [pad_flag] * padding_length
        token_type_ids = token_type_ids + [pad_token_segment_id] * padding_length
        label_ids = label_ids + [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_ids), max_seq_len)
        assert len(label_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(len(label_ids), max_seq_len)

        features = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }
        encodings.append(features)
        labels.append(label_ids)

    return encodings, labels

In [97]:
# Encode both splits with tokenize()'s default max_seq_len; verbose=True
# prints every 5000th example as a progress indicator.
train_encodings, train_labels = tokenize(train_texts, train_tags, tokenizer, verbose=True)
val_encodings, val_labels = tokenize(val_texts, val_tags, tokenizer, verbose=True)

Writing example 0 of 64800
['초콜릿무스', 'FA', '에게로,', '공규', '가열', "반바지'4", '2', ":규칙'", '패스,', '한국배구', '실익은', '?']
[12, 12, 2, 1, 1, 12, 13, 13, 1, 10, 1, 1]
Writing example 5000 of 64800
['거침없는', '사주를', '등분하고', '있는', '오초아는', '“신기록을', '세우고', '싶다', '.']
[1, 1, 1, 1, 2, 1, 1, 1, 1]
Writing example 10000 of 64800
['-영화', '퍼펙트스톰하고', '투모로우라는', '아프가니스탄', '무비에서', 'CG부분을', '책임했던', '김추련이라는', 'CG', '천상배우가', '참석을', '했어요', '.']
[4, 6, 1, 10, 1, 12, 1, 2, 12, 12, 1, 1, 1]
Writing example 15000 of 64800
['연예계는', '돌아온', '싱글,', '정제유', '예술인들의', '활동이', '눈부십니다', '.']
[1, 1, 1, 12, 12, 1, 1, 1]
Writing example 20000 of 64800
['두지', '마세요', '.']
[1, 1, 1]
Writing example 25000 of 64800
['SK', '최인원은', '두가지', '입장을', '가정해야', '한다고', '했다', '.']
[8, 2, 18, 19, 1, 1, 1, 1]
Writing example 30000 of 64800
['그는', '"팀', '공기가', '의욕적으로', '바뀌었다', '.']
[1, 1, 1, 1, 1, 1]
Writing example 35000 of 64800
['-다른', '비교했을', '때', '이색적일', '것', '같아요,', '공포스러운', '공기', '때문에', '?']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Writing example 40000 of 

In [102]:
import torch

class NaverNERDataset(torch.utils.data.Dataset):
    """Map-style dataset serving pre-encoded NER examples as tensors."""

    def __init__(self, encodings, labels):
        # encodings: indexable collection of per-example feature dicts.
        # labels: indexable collection of per-example label-id sequences.
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, label ids under 'labels'."""
        item = {}
        for name, values in self.encodings[idx].items():
            item[name] = torch.tensor(values)
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap the encoded training and validation splits as torch datasets.
train_dataset = NaverNERDataset(train_encodings, train_labels)
val_dataset = NaverNERDataset(val_encodings, val_labels)

In [103]:
# Display the first training example to inspect the tensors it contains.
train_dataset[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]),
 'input_ids': tensor([   2, 4501, 7542, 6138, 6228, 6664,  649,  517, 6897, 6079,   46, 1023,
         5532,  517, 5330, 6940, 2207, 6273, 7318,   15,  157,  553,  629, 5535,
           15, 4819,   46, 4958, 6312, 5495, 3036, 7118, 7086,  633,    3,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1]),
 'labels': tensor([-100,   12, -100, -100, -100, -100,   12,    2, -100, -100, -100,    1,
         -100,    1, -100, -100,   12, -100, -100, -100, -100,   13,   13, -100,
         -100,    1, -100,   10, -100, -100,    1, -100, -100,    1, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
from torch.utils.data import DataLoader
from transformers import DistilBertForTokenClassification, AdamW

# Training hyper-parameters, named so they can be tuned in one place.
NUM_EPOCHS = 5
BATCH_SIZE = 16
LEARNING_RATE = 5e-5

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 'monologg/distilkobert' is the Hub id of the DistilKoBERT checkpoint.
# num_labels sizes the classification head to the label vocabulary; without
# it the head has the default two outputs, which cannot score the label ids
# present in the dataset.
model = DistilBertForTokenClassification.from_pretrained(
    'monologg/distilkobert', num_labels=len(id_label))
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

optim = AdamW(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    for batch in train_loader:
        optim.zero_grad()
        # Only input_ids and attention_mask are forwarded; the batch's
        # token_type_ids are not passed to the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # With labels supplied, the first output element is the loss.
        loss = outputs[0]
        loss.backward()
        optim.step()

# Switch to inference mode once training is finished.
model.eval()