In [1]:
!pip install keras==2.3.1
!pip install torch==1.1.0
!pip install transformers==2.5.1
!pip install seqeval

Collecting keras==2.3.1
  Downloading Keras-2.3.1-py2.py3-none-any.whl (377 kB)
[?25l[K     |▉                               | 10 kB 26.1 MB/s eta 0:00:01[K     |█▊                              | 20 kB 18.4 MB/s eta 0:00:01[K     |██▋                             | 30 kB 10.5 MB/s eta 0:00:01[K     |███▌                            | 40 kB 8.6 MB/s eta 0:00:01[K     |████▍                           | 51 kB 5.1 MB/s eta 0:00:01[K     |█████▏                          | 61 kB 5.6 MB/s eta 0:00:01[K     |██████                          | 71 kB 5.4 MB/s eta 0:00:01[K     |███████                         | 81 kB 6.0 MB/s eta 0:00:01[K     |███████▉                        | 92 kB 4.7 MB/s eta 0:00:01[K     |████████▊                       | 102 kB 5.0 MB/s eta 0:00:01[K     |█████████▌                      | 112 kB 5.0 MB/s eta 0:00:01[K     |██████████▍                     | 122 kB 5.0 MB/s eta 0:00:01[K     |███████████▎                    | 133 kB 5.0 MB/s eta 0:00

In [2]:
import pandas as pd
import math
import numpy as np
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report,accuracy_score,f1_score
import torch.nn.functional as F

In [3]:
import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

In [4]:
!pip list | grep -E 'transformers|torch|Keras'

Keras                         2.3.1
Keras-Applications            1.0.8
Keras-Preprocessing           1.1.2
torch                         1.1.0
torchsummary                  1.5.1
torchtext                     0.10.0
torchvision                   0.10.0+cu111
transformers                  2.5.1


## 데이터 불러오기

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
data_path = "/content/drive/My Drive/ner_dataset.csv"
df_data = pd.read_csv(data_path, sep=",", encoding="latin1").fillna(method='ffill')

ner_dataset.csv


![image](https://user-images.githubusercontent.com/44194558/137710745-2c2888e0-7d77-47fb-bd35-df90c956c74f.png)

In [7]:
df_data.columns

Index(['Sentence #', 'Word', 'POS', 'Tag'], dtype='object')

In [8]:
df_data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [9]:
# POS 종류
df_data.POS.unique()

array(['NNS', 'IN', 'VBP', 'VBN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'CC',
       'JJ', '.', 'VBD', 'WP', '``', 'CD', 'PRP', 'VBZ', 'POS', 'VBG',
       'RB', ',', 'WRB', 'PRP$', 'MD', 'WDT', 'JJR', ':', 'JJS', 'WP$',
       'RP', 'PDT', 'NNPS', 'EX', 'RBS', 'LRB', 'RRB', '$', 'RBR', ';',
       'UH', 'FW'], dtype=object)

In [10]:
# Tag 종류
df_data.Tag.unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

* B : Begin. 객체 명이 시작되는 부분
* I : Inside. 객체 명의 내부 부분
* O : Outside. 객체 명이 아닌 부분
* time : time
* per : person. 인명
* geo : geography. 지역명


In [11]:
# 태그 분포
df_data.Tag.value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

#### 데이터 개요

In [12]:
print('총 문장의 수 :', df_data['Sentence #'].nunique())
print('총 단어의 수 :', df_data.Word.nunique())
print('POS 종류 :', df_data.POS.nunique())
print('Tag 종류 :', df_data.Tag.nunique())

총 문장의 수 : 47959
총 단어의 수 : 35178
POS 종류 : 42
Tag 종류 : 17


## 전처리

### 데이터를 문서 구조로 parsing

In [13]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                           s['POS'].values.tolist(),
                                                           s['Tag'].values.tolist())]
  
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)  # 데이터를 문장 단위로 구분, 각 행(단어) 마다 agg_func 적용
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s

        except:
          return None

#### SentenceGetter 보충 

In [14]:
# self.grouped의 한 예 (첫번 째 문장)
data = df_data.loc[df_data['Sentence #'] == 'Sentence: 1']
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [15]:
# self.sentences의 한 예 (첫번 째 문장에 agg_func 적용)
# (단어, POS, Tag) 튜플로 구성
[(w, p, t) for w, p, t in zip(data['Word'].values.tolist(),
                              data['POS'].values.tolist(),
                              data['Tag'].values.tolist())]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [16]:
# 전체 데이터(모든 문장)에 적용
getter = SentenceGetter(df_data)
getter

<__main__.SentenceGetter at 0x7f604a945e10>

In [17]:
# getter.sentences는 모든 문장들에 대한 (w, p, t) 튜플들로 구성. 
# sent는 두칸 위 셀의 결과와 같은 형식으로 표현됨. s는 sent를 구성하는 (w, p, t) 튜플
sentences = [[s[0] for s in sent] for sent in getter.sentences]
sentences[0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [18]:
poses = [[s[1] for s in sent] for sent in getter.sentences]
print(poses[0])

['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']


In [19]:
labels = [[s[2] for s in sent] for sent in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


#### Tag를 index로 전환

In [20]:
# 모든 태그 종류
tags_vals = list(set(df_data['Tag'].values))
tags_vals

['I-art',
 'B-tim',
 'I-per',
 'B-nat',
 'I-tim',
 'B-eve',
 'B-gpe',
 'B-org',
 'I-org',
 'I-geo',
 'O',
 'B-geo',
 'B-per',
 'B-art',
 'I-gpe',
 'I-nat',
 'I-eve']

In [21]:
tags_vals.append('X')  # wordpiece용
tags_vals.append('[CLS]')  # BERT 구분자
tags_vals.append('[SEP]')  # BERT 구분자

In [22]:
tags_vals = set(tags_vals)

In [23]:
tags_vals

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O',
 'X',
 '[CLS]',
 '[SEP]'}

In [24]:
# tag를 index로 매핑하는 딕셔너리
# tag2idx = {t: i for i, t in enumerate(tags_vals)}

tag2idx={'B-art': 14,
         'B-eve': 16,
         'B-geo': 0,
         'B-gpe': 13,
         'B-nat': 12,
         'B-org': 10,
         'B-per': 4,
         'B-tim': 2,
         'I-art': 5,
         'I-eve': 7,
         'I-geo': 15,
         'I-gpe': 8,
         'I-nat': 11,
         'I-org': 3,
         'I-per': 6,
         'I-tim': 1,
         'X':17,
         'O': 9,
         '[CLS]':18,
         '[SEP]':19}

In [25]:
# index를 tag로 매핑
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

In [26]:
tag2name

{0: 'B-geo',
 1: 'I-tim',
 2: 'B-tim',
 3: 'I-org',
 4: 'B-per',
 5: 'I-art',
 6: 'I-per',
 7: 'I-eve',
 8: 'I-gpe',
 9: 'O',
 10: 'B-org',
 11: 'I-nat',
 12: 'B-nat',
 13: 'B-gpe',
 14: 'B-art',
 15: 'I-geo',
 16: 'B-eve',
 17: 'X',
 18: '[CLS]',
 19: '[SEP]'}

## 학습 데이터 구축

* 사전 학습된 Tokenizer 활용
* 입력 임베딩 구축 : token embedding, mask word embedding, segmentation embedding
* 학습, 검증, 테스트 데이터를 dataloader에 전달

In [27]:
# gpu 환경 구축
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu

1

### Tokenize

사전 학습된 Tokenizer 다운로드 : https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt

In [28]:
# tokenizer 불러오기
vocabulary = "/content/drive/My Drive/vocab.txt"

In [29]:
# 모델의 max_position_embedding인 512보다 작아야함
max_len  = 45

In [30]:
# tokenizer 로드
tokenizer = BertTokenizer(vocab_file=vocabulary, do_lower_case=False)  # 대소문자를 구분함
tokenizer

<transformers.tokenization_bert.BertTokenizer at 0x7f6042174890>

In [47]:
tokenized_texts = []
word_piece_labels = []
i_inc = 0

# word_list : 문장을 구성하는 단어들의 리스트
# label : 해당 단어들의 tag label
for word_list, label in (zip(sentences, labels)):  
    temp_label = []
    temp_token = []
    
    # 문장 시작에 [CLS] 토큰 추가
    temp_label.append('[CLS]')
    temp_token.append('[CLS]')
    
    # Tokenize : 문장을 구성하는 모든 단어들을 사전 학습된 Tokenize를 활용하여 토큰화
    for word, lab in zip(word_list, label):
        token_list = tokenizer.tokenize(word)
        for m, token in enumerate(token_list):
            temp_token.append(token)
            if m == 0:
                temp_label.append(lab)

            # ##tra, ##A 같은 형식은 X로 라벨링    
            else:
                temp_label.append('X')

    # 문장 마지막 부분에 [SEP] 토큰 추가
    temp_label.append('[SEP]')
    temp_token.append('[SEP]')

    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_label)

    if 5 > i_inc:
        print("No.%d,len:%d"%(i_inc,len(temp_token)))
        print("texts:%s"%(" ".join(temp_token)))
        print("No.%d,len:%d"%(i_inc,len(temp_label)))
        print("lables:%s"%(" ".join(temp_label)))
        print()

    i_inc +=1

No.0,len:28
texts:[CLS] Thousands of demons ##tra ##tors have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country . [SEP]
No.0,len:28
lables:[CLS] O O O X X O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O [SEP]

No.1,len:29
texts:[CLS] Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an I ##A ##EA surveillance system begins functioning . [SEP]
No.1,len:29
lables:[CLS] B-gpe O O O O O O O O O O O O O O B-tim O O O B-org X X O O O O O [SEP]

No.2,len:44
texts:[CLS] He ##lic ##op ##ter guns ##hips Saturday pounded militant hide ##outs in the Or ##ak ##zai tribal region , where many Taliban militants are believed to have fled to avoid an earlier military offensive in nearby South W ##azi ##rist ##an . [SEP]
No.2,len:44
lables:[CLS] O X X X O X B-tim O O O X O O B-geo X X O O O O O B-org O O O O O O O O O O O O O O B-geo I-geo X X X O [SEP]

No.3,len:16
texts:[CLS] They 

#### Tokenize 보충

In [32]:
# 첫 번째 문장
word_list = sentences[0]
label = labels[0]

temp_label = [] ; temp_token = []
temp_label.append('[CLS]') ; temp_token.append('[CLS]')

# 초기 상태
print(temp_label)
print(temp_token)

['[CLS]']
['[CLS]']


In [33]:
# 첫 번째 문장의 단어, tag 라벨들
print(word_list)
print(label)

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [40]:
# 첫 번째 문장의 단어들을 사전학습된 토크나이저로 토큰화
for word, lab in zip(word_list, label):
    token_list = tokenizer.tokenize(word)
    print('단어 토큰 :', token_list, '/', 'Tag 라벨 :', lab)

단어 토큰 : ['Thousands'] / Tag 라벨 : O
단어 토큰 : ['of'] / Tag 라벨 : O
단어 토큰 : ['demons', '##tra', '##tors'] / Tag 라벨 : O
단어 토큰 : ['have'] / Tag 라벨 : O
단어 토큰 : ['marched'] / Tag 라벨 : O
단어 토큰 : ['through'] / Tag 라벨 : O
단어 토큰 : ['London'] / Tag 라벨 : B-geo
단어 토큰 : ['to'] / Tag 라벨 : O
단어 토큰 : ['protest'] / Tag 라벨 : O
단어 토큰 : ['the'] / Tag 라벨 : O
단어 토큰 : ['war'] / Tag 라벨 : O
단어 토큰 : ['in'] / Tag 라벨 : O
단어 토큰 : ['Iraq'] / Tag 라벨 : B-geo
단어 토큰 : ['and'] / Tag 라벨 : O
단어 토큰 : ['demand'] / Tag 라벨 : O
단어 토큰 : ['the'] / Tag 라벨 : O
단어 토큰 : ['withdrawal'] / Tag 라벨 : O
단어 토큰 : ['of'] / Tag 라벨 : O
단어 토큰 : ['British'] / Tag 라벨 : B-gpe
단어 토큰 : ['troops'] / Tag 라벨 : O
단어 토큰 : ['from'] / Tag 라벨 : O
단어 토큰 : ['that'] / Tag 라벨 : O
단어 토큰 : ['country'] / Tag 라벨 : O
단어 토큰 : ['.'] / Tag 라벨 : O


In [35]:
for word, lab in zip(word_list, label):
    token_list = tokenizer.tokenize(word)
    for m, token in enumerate(token_list):
        temp_token.append(token)
        if m == 0:  # 위 결과의 개별 리스트에서 첫 번째 토큰들
            temp_label.append(lab)
        else:  # 위 결과의 개별 리스트에서 첫 번째 가 아닌 토큰들 ex) ##tra
            temp_label.append('X')

print(temp_label)
print(temp_token)

['[CLS]', 'O', 'O', 'O', 'X', 'X', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']
['[CLS]', 'Thousands', 'of', 'demons', '##tra', '##tors', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']


### Token Embedding

In [48]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],  # convert_tokens_to_ids : 토큰 인덱스들의 리스트를 반환
                           maxlen=max_len, dtype="long", truncating="post", padding="post")

print(input_ids[0])
# 시퀀스 패딩으로 길이 동일
print(len(input_ids[0]))
print(len(input_ids[1]))
print(len(input_ids[2]))

[  101 26159  1104  8568  4487  5067  1138  9639  1194  1498  1106  5641
  1103  1594  1107  5008  1105  4555  1103 10602  1104  1418  2830  1121
  1115  1583   119   102     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0]
45
45
45


In [42]:
print(tokenizer.convert_tokens_to_ids(tokenized_texts[0]))

[101, 26159, 1104, 8568, 4487, 5067, 1138, 9639, 1194, 1498, 1106, 5641, 1103, 1594, 1107, 5008, 1105, 4555, 1103, 10602, 1104, 1418, 2830, 1121, 1115, 1583, 119, 102]


Tokenizer 기능, 옵션 참고 : https://huggingface.co/transformers/main_classes/tokenizer.html

In [43]:
# Tag 라벨을 인덱스로 변환
# 동일한 길이로 패딩 ('O' : Others)
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels],
                       maxlen=max_len, value=tag2idx["O"], padding="post", dtype="long", truncating="post")

print(tags[0])

[18  9  9  9 17 17  9  9  9  0  9  9  9  9  9  0  9  9  9  9  9 13  9  9
  9  9  9 19  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9]


In [45]:
txt = tokenized_texts[1]
print(txt)

['[CLS]', 'Iranian', 'officials', 'say', 'they', 'expect', 'to', 'get', 'access', 'to', 'sealed', 'sensitive', 'parts', 'of', 'the', 'plant', 'Wednesday', ',', 'after', 'an', 'I', '##A', '##EA', 'surveillance', 'system', 'begins', 'functioning', '.', '[SEP]']


### Mask Word Embedding

In [49]:
# 0보다 큰 경우 (실제 단어 토큰인 경우) 1, 0인 경우(패딩값) 0
attention_masks = [[int(i>0) for i in ii] for ii in input_ids]
print(attention_masks[0])  # 28번째 까지가 1

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [41]:
ii = input_ids[0]  # 28번째 까지가 실제 단어 토큰의 인덱스
ii

array([  101, 26159,  1104,  8568,  4487,  5067,  1138,  9639,  1194,
        1498,  1106,  5641,  1103,  1594,  1107,  5008,  1105,  4555,
        1103, 10602,  1104,  1418,  2830,  1121,  1115,  1583,   119,
         102,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0])

### Segment Embedding

In [50]:
# 한 문장이기 때문에 0으로만 임베딩됨
# 문장 태깅 태스크에서는 불필요한 작업
segment_ids = [[0] * len(input_id) for input_id in input_ids]
print(segment_ids[0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## 데이터 분할 & 변환

In [43]:
tags

array([[18,  9,  9, ...,  9,  9,  9],
       [18, 13,  9, ...,  9,  9,  9],
       [18,  9, 17, ...,  9, 19,  9],
       ...,
       [18,  9,  0, ...,  9,  9,  9],
       [18,  9,  9, ...,  9,  9,  9],
       [18,  9, 10, ...,  9,  9,  9]])

In [52]:
# 토큰화가 완료된 전체 텍스트 데이터는 47959개의 문장이 있음
# 개별 문장들은 동일한 길이 45를 갖는 토큰들의 시퀀스로 표현되고, 개별 토큰들에 대응되는 tag가 존재
print(tags.shape)
print(len(tokenized_texts))

(47959, 45)
47959


In [56]:
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks,tr_segs, val_segs = train_test_split(input_ids, tags, attention_masks, segment_ids, 
                                                                                                   random_state=4, test_size=0.3)

len(tr_inputs), len(val_inputs), len(tr_segs), len(val_segs)

(33571, 14388, 33571, 14388)

In [57]:
tr_inputs # array

array([[  101,   138,  5680, ...,     0,     0,     0],
       [  101, 14381,  1555, ...,     0,     0,     0],
       [  101,  1828,   119, ...,     0,     0,     0],
       ...,
       [  101,  1124,  1145, ...,     0,     0,     0],
       [  101, 13364,   112, ...,     0,     0,     0],
       [  101,  1109,  3948, ...,  3906,  3702,  2410]])

In [54]:
# 텐서로 변환
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

In [55]:
tr_inputs # tensor

tensor([[  101,   138,  5680,  ...,     0,     0,     0],
        [  101, 14381,  1555,  ...,     0,     0,     0],
        [  101,  1828,   119,  ...,     0,     0,     0],
        ...,
        [  101,  1124,  1145,  ...,     0,     0,     0],
        [  101, 13364,   112,  ...,     0,     0,     0],
        [  101,  1109,  3948,  ...,  3906,  3702,  2410]])

In [64]:
batch_num = 32

In [47]:
# 토큰 임베딩, 어텐션 임베딩만 포함
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)  # 각 데이터를 랜덤하게 가져옴
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num, drop_last=True)
# batch 단위로 데이터를 불러올 경우, batch_size에 따라 마지막 batch의 길이가 달라질 수 있음. 
# ex) data의 개수는 27개인데, batch_size가 5라면 마지막 batch의 크기는 2가 됨.
# 이 때 마지막 배치를 사용하지 않음 (배치 크기에 의존도가 높은 손실 함수를 사용하는 경우 권장됨)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

## 학습

pytorch_model.bin 다운로드 : https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin

config.json 다운로드 : https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json

In [69]:
# 사전 학습된 모델 불러오기
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(tag2idx))

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [49]:
# 모델에 GPT 세팅
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine

In [50]:
# GPU가 여러개면 
if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [66]:
epochs = 5
max_grad_norm = 1.0
# train optimization num
num_train_optimization_steps = int(math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

### Fine-Tuning

In [52]:
# True : 모든 layer를 미세 조정
# False : 분류기 레이어만 미세 조정
FULL_FINETUNING = True

In [53]:
if FULL_FINETUNING:
    # 모든 레이어의 파라미터들을 미세 조정
    param_optimizer = list(model.named_parameters())  # (name, parameter) 조합의 tuple iterator 반환
    no_decay = ['bias', 'gamma', 'beta']  # 가중치 감쇠를 적용하지 않는 파라미터들
    optimizer_grouped_parameters = [
        # 가중치 감쇠 적용                            
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],  # n : 파라미터 이름 / p: 실제 가중치 행렬
         'weight_decay_rate': 0.01},
        # 가중치 감쇠 미적용
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]

else:
    # 분류기 레이어의 파라미터만 미세조정
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

In [78]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
[n for n, p in param_optimizer][:4]

['bert.embeddings.word_embeddings.weight',
 'bert.embeddings.position_embeddings.weight',
 'bert.embeddings.token_type_embeddings.weight',
 'bert.embeddings.LayerNorm.weight']

In [80]:
# 가중치 감쇠 적용
n = 'bert.embeddings.word_embeddings.weight' 
not any(nd in n for nd in no_decay)

True

In [82]:
# 가중치 감쇠 미적용
n = 'bias' 
any(nd in n for nd in no_decay)

True

weight_decay(가중치 감쇠) : 과적합 문제를 해결하기 위한 규제화. 학습 중 가중치가 너무 큰 값을 가지지 않도록 제한하여 모델의 복잡도를 감소시킴.

torch에서 파라미터 접근하기 참고 : https://soundprovider.tistory.com/entry/pytorch-torch%EC%97%90%EC%84%9C-parameter-%EC%A0%91%EA%B7%BC%ED%95%98%EA%B8%B0

weight_decay 참고 

https://deepapple.tistory.com/6

https://ko.d2l.ai/chapter_deep-learning-basics/weight-decay.html

In [54]:
model.train();

In [55]:
print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # batch에 gpu 할당
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch  # 토큰 임베딩, 마스크 임베딩, tag 라벨
        
        # forward 
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # 손실 함수 계산
        loss, scores = outputs[:2]
        if n_gpu>1:
            # gpu를 여러 개 사용하는 경우 평균
            loss = loss.mean()

        # backward : 모든 파라미터에 대해 loss의 변화도(gradient) 계산
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()  # 손실이 가지고 있는 스칼라 값
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # 파라미터 업데이트
        optimizer.step()  # optimizer가 업데이트할 파라미터와 학습률, 하이퍼 파라미터 등을 받아 업데이트 (argument로 전달받은 파라미터 업데이트)
        optimizer.zero_grad()  # 한 번의 학습이 완료되면 gradient를 0으로 만듬. (loss.backward가 매번 gradient를 더하기 때문) 
        
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

***** Running training *****
  Num examples = 33571
  Batch size = 32
  Num steps = 5250


Epoch:  20%|██        | 1/5 [03:30<14:00, 210.14s/it]

Train loss: 0.14344811097348553


Epoch:  40%|████      | 2/5 [06:58<10:28, 209.35s/it]

Train loss: 0.067771936517326


Epoch:  60%|██████    | 3/5 [10:28<06:58, 209.29s/it]

Train loss: 0.05264862299185633


Epoch:  80%|████████  | 4/5 [13:56<03:29, 209.01s/it]

Train loss: 0.039890202472574146


Epoch: 100%|██████████| 5/5 [17:25<00:00, 209.12s/it]

Train loss: 0.030512325715156427





참고 : 

https://anweh.tistory.com/22

https://tutorials.pytorch.kr/beginner/pytorch_with_examples.html

https://algopoolja.tistory.com/55

https://velog.io/@kjb0531/zerograd%EC%9D%98-%EC%9D%B4%ED%95%B4

### 모델 저장 & 로드

In [56]:
bert_out_address = '/content/drive/My Drive/models/bert_out_model/en09'
# 해당 경로가 없으면 만듬
if not os.path.exists(bert_out_address):
    os.makedirs(bert_out_address)

In [57]:
model_to_save = model.module if hasattr(model, 'module') else model  
# from_pretrained를 사용하여 학습한 모델을 불러올 수 있도록
output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
output_config_file = os.path.join(bert_out_address, "config.json")

In [58]:
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(bert_out_address)

Saving vocabulary to /content/drive/My Drive/models/bert_out_model/en09/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to /content/drive/My Drive/models/bert_out_model/en09/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to /content/drive/My Drive/models/bert_out_model/en09/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to /content/drive/My Drive/models/bert_out_model/en09/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to /content/drive/My Drive/models/bert_out_model/en09/vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to /content/drive/My Drive/models/bert_out_model/en09/vocab.txt: vocabulary indices are not consecutive. Please 

('/content/drive/My Drive/models/bert_out_model/en09/vocab.txt',)

In [59]:
# from_pretrained를 사용하여 모델 불러오기
model = BertForTokenClassification.from_pretrained(bert_out_address, num_labels=len(tag2idx))

In [60]:
model.cuda();

In [61]:
if n_gpu >1:
    model = torch.nn.DataParallel(model)

## 평가

In [62]:
model.eval();

In [63]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch
    
#     if step > 2:
#         break
    # 학습된 파라미터 값을 평가하는 단계에서는 gradient를 계산할 필요가 없음 (메모리 사용량 줄이기)
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None,
                        attention_mask=input_mask,)
        # For eval mode, the first result of outputs is logits
        logits = outputs[0] 
    
    # Get NER predict result
    logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
    logits = logits.detach().cpu().numpy()
    
    # Get NER true result
    label_ids = label_ids.to('cpu').numpy()
    
    # Only predict the real word, mark=0, will not calculate
    input_mask = input_mask.to('cpu').numpy()

    # Compare the valuable predict result
    for i,mask in enumerate(input_mask):
        # Real one
        temp_1 = []
        # Predict one
        temp_2 = []
        
        for j, m in enumerate(mask):
            # Mark=0, meaning its a pad word, dont compare
            if m:
                if tag2name[label_ids[i][j]] != "X" and tag2name[label_ids[i][j]] != "[CLS]" and tag2name[label_ids[i][j]] != "[SEP]" : # Exclude the X label
                    temp_1.append(tag2name[label_ids[i][j]])
                    temp_2.append(tag2name[logits[i][j]])
            else:
                break
                   
        y_true.append(temp_1)
        y_pred.append(temp_2)

        

print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))

# Get acc , recall, F1 result report
report = classification_report(y_true, y_pred,digits=4)

# Save the report into file
output_eval_file = os.path.join(bert_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    print("\n%s"%(report))
    print("f1 socre: %f"%(f1_score(y_true, y_pred)))
    print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))
    
    writer.write("f1 socre:\n")
    writer.write(str(f1_score(y_true, y_pred)))
    writer.write("\n\nAccuracy score:\n")
    writer.write(str(accuracy_score(y_true, y_pred)))
    writer.write("\n\n")  
    writer.write(report)

***** Running evaluation *****
  Num examples =14388
  Batch size = 32




f1 socre: 0.835981
Accuracy score: 0.971437


  _warn_prf(average, modifier, msg_start, len(result))


***** Eval results *****

              precision    recall  f1-score   support

           _     0.0000    0.0000    0.0000         0
         art     0.2190    0.1756    0.1949       131
         eve     0.3608    0.3846    0.3723        91
         geo     0.8599    0.8864    0.8730     11066
         gpe     0.9648    0.9418    0.9532      4830
         nat     0.5000    0.4490    0.4731        49
         org     0.7014    0.7151    0.7082      5954
         per     0.7687    0.7999    0.7840      5123
         tim     0.8708    0.8695    0.8702      6016

   micro avg     0.8295    0.8426    0.8360     33260
   macro avg     0.5828    0.5802    0.5810     33260
weighted avg     0.8303    0.8426    0.8363     33260

f1 socre: 0.835981
Accuracy score: 0.971437


참고 

https://ctkim.tistory.com/147

## Inference

* 학습 완료된 모델, Tokenizer 불러오기
* Test query 작성
* Test query를 임베딩
* 학습 완료된 모델을 사용하여 예측

In [64]:
tag2idx={'B-art': 14,
 'B-eve': 16,
 'B-geo': 0,
 'B-gpe': 13,
 'B-nat': 12,
 'B-org': 10,
 'B-per': 4,
 'B-tim': 2,
 'I-art': 5,
 'I-eve': 7,
 'I-geo': 15,
 'I-gpe': 8,
 'I-nat': 11,
 'I-org': 3,
 'I-per': 6,
 'I-tim': 1,
 'X':17,
 'O': 9,
 '[CLS]':18,
 '[SEP]':19}

In [65]:
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

In [66]:
save_model_address = '/content/drive/My Drive/models/bert_out_model/en09'

In [67]:
save_model = BertForTokenClassification.from_pretrained(save_model_address,num_labels=len(tag2idx))

In [68]:
tokenizer = BertTokenizer.from_pretrained(save_model_address,do_lower_case=False)

In [69]:
max_len  = 45

In [70]:
test_query = "I live in USA, this is my IBM laptop."

tokenized_texts = []

temp_token = []
temp_token.append('[CLS]')

token_list = tokenizer.tokenize(test_query)
for m,token in enumerate(token_list):
    temp_token.append(token)

# max_len을 만족시킬 수 있도록 Trim
if len(temp_token) > max_len-1:
    temp_token= temp_token[:max_len-1]

temp_token.append('[SEP]')

tokenized_texts.append(temp_token)

input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                           maxlen=max_len, dtype="long", truncating="post", padding="post")

attention_masks = [[int(i>0) for i in ii] for ii in input_ids]

segment_ids = [[0] * len(input_id) for input_id in input_ids]


input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
segment_ids = torch.tensor(segment_ids)

In [71]:
save_model.eval();

In [72]:
# Get model predict result
with torch.no_grad():
    outputs = save_model(input_ids, token_type_ids=None,
                         attention_mask=None,)
    # For eval mode, the first result of outputs is logits
    logits = outputs[0]

In [73]:
predict_results = logits.detach().cpu().numpy()

In [74]:
predict_results.shape  # 45개(maxlen) 단어들에 대한 태그(길이 20을 갖는) 예측 결과

(1, 45, 20)

In [75]:
from scipy.special import softmax

result_arrays_soft = softmax(predict_results[0])
print(result_arrays_soft[0])

result_array = result_arrays_soft
print(len(result_array))
print(len(result_array[0]))

[1.6366083e-06 8.4673241e-07 1.3927450e-06 4.0952997e-07 8.1400731e-07
 5.3468381e-07 4.0680285e-07 3.9700802e-07 2.6337506e-07 4.7035282e-06
 4.3789660e-07 1.4495798e-06 1.0580754e-06 8.6076642e-07 8.1541128e-07
 5.6287877e-07 4.8961897e-07 6.5683111e-07 5.0704867e-02 8.4139873e-07]
45
20


In [76]:
result_list = np.argmax(result_array,axis=-1)
print(result_list)

[18  9  9  9  9  9  9 17  9 17  9  9 19  9 17 17 17 17 17 17 17 17 17 17
 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17]


In [77]:
for i, mark in enumerate(attention_masks[0]):
    if mark>0:
        print("Token:%s"%(temp_token[i]))
#         print("Tag:%s"%(result_list[i]))
        print("Predict_Tag:%s"%(tag2name[result_list[i]]))
        #print("Posibility:%f"%(result_array[i][result_list[i]]))
        print()

Token:[CLS]
Predict_Tag:[CLS]

Token:I
Predict_Tag:O

Token:live
Predict_Tag:O

Token:in
Predict_Tag:O

Token:USA
Predict_Tag:O

Token:,
Predict_Tag:O

Token:this
Predict_Tag:O

Token:is
Predict_Tag:X

Token:my
Predict_Tag:O

Token:IBM
Predict_Tag:X

Token:laptop
Predict_Tag:O

Token:.
Predict_Tag:O

Token:[SEP]
Predict_Tag:[SEP]

