# Bert를 사용한 다음 단어 예측



# 필요 라이브러리 설치

In [None]:
!pip install transformers==3.0.2
!pip install sentencepiece



In [None]:
!pip install konlpy



# 셋업

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

from transformers import BertTokenizer
from transformers import TFBertModel

import tensorflow as tf

In [None]:
# Seed the TensorFlow and NumPy random number generators with a fixed value.
tf.random.set_seed(1234)
np.random.seed(1234)

# File that the custom vocabulary is written to and that BertTokenizer loads.
CUSTOM_VOCAB_FILE = 'custom_vocab.txt'
# Upper bound on the number of vocabulary entries that are kept.
MAX_VOCAB_SIZE = 10000*2

# Name of the pretrained checkpoint passed to TFBertModel.from_pretrained.
BERT_MODEL_NAME = 'bert-base-multilingual-cased'
# Length every tokenized model input is padded (and sliced) to.
BERT_SEQUENCE_LENGTH = 128

# 데이터

## 데이터 다운로드

In [None]:
!wget https://raw.githubusercontent.com/dhrim/keras_howto_2021/master/data/alice_in_wonderland.txt

--2021-11-21 06:21:12--  https://raw.githubusercontent.com/dhrim/keras_howto_2021/master/data/alice_in_wonderland.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 172864 (169K) [text/plain]
Saving to: ‘alice_in_wonderland.txt.2’


2021-11-21 06:21:12 (10.2 MB/s) - ‘alice_in_wonderland.txt.2’ saved [172864/172864]



In [None]:
!head -n 20 alice_in_wonderland.txt

Alice’s Adventures in Wonderland

by Lewis Carroll

CHAPTER I.
Down the Rabbit-Hole


Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into
the book her sister was reading, but it had no pictures or
conversations in it, “and what is the use of a book,” thought Alice
“without pictures or conversations?”

So she was considering in her own mind (as well as she could, for the
hot day made her feel very sleepy and stupid), whether the pleasure of
making a daisy-chain would be worth the trouble of getting up and
picking the daisies, when suddenly a White Rabbit with pink eyes ran
close by her.



In [None]:
!wc alice_in_wonderland.txt

  3710  29378 172864 alice_in_wonderland.txt


## 데이터 로딩

In [None]:
def load_raw_text_from_file(file_path):
  """Read the UTF-8 text file at ``file_path`` and return its content as one string."""
  with open(file_path, mode='r', encoding='utf-8') as text_file:
    return text_file.read()

In [None]:
import collections
from collections import OrderedDict

# Read the whole corpus file into a single string.
raw_text = load_raw_text_from_file('alice_in_wonderland.txt')

print("len(raw_text)=", len(raw_text))

print(raw_text[:100])

len(raw_text)= 162663
Alice’s Adventures in Wonderland

by Lewis Carroll

CHAPTER I.
Down the Rabbit-Hole


Alice was begi


## token으로 자르기

In [None]:
# Map the typographic quotes used in the book to their ASCII equivalents in a
# single pass over the text.
raw_text = raw_text.translate(str.maketrans({
    "“": "\"",
    "”": "\"",
    "’": "'",
    "‘": "'",
}))
# Collapse every run of whitespace (newlines, repeated spaces) into exactly one
# space and drop leading/trailing whitespace.  A single replace("  ", " ")
# pass leaves runs of three or more whitespace characters only partially
# collapsed, which breaks the position tracking done by the tokenizer.
raw_text = " ".join(raw_text.split())

print(raw_text[:300])

Alice's Adventures in Wonderland by Lewis Carroll CHAPTER I. Down the Rabbit-Hole  Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it


In [None]:
from konlpy.tag import Okt


BERT_SUBTOKEN_PREFIX = "##"

class KonlpyTokenizer():
  """Split text with KoNLPy's Okt and mark word-internal pieces the BERT way.

  A token that directly continues the previous token (no space in between)
  gets BERT_SUBTOKEN_PREFIX prepended, except for punctuation and for tokens
  tagged Alpha, Foreign, Number or URL.
  """

  def __init__(self):
    self._tokens = []
    self._tokenizer = Okt()

  # text   = '하늘이 푸른가요? 나는 푸른색이 좋아요'
  # return = ['하늘', '##이', '푸른가요', '?', '나', '##는', '푸른색', '##이', '좋아요']
  def tokenize(self, text):
    """Tokenize ``text`` and return a flat list of token strings.

    The method walks through the (token, tag) pairs returned by the POS
    tagger while tracking the current position in ``text``; that position is
    used to decide whether a token was preceded by a space.  The tracking
    assumes tokens are separated by at most one space.

    Args:
      text: the string to tokenize.

    Returns:
      A list of tokens; multi-character punctuation is emitted one character
      at a time.
    """

    def _has_preceding_space(text, token, last_position):
      # Compare against the full remainder of the text.  Looking only at a
      # fixed 20-character window can never match a token of 20 or more
      # characters, which would wrongly turn it into a "##" sub-token.
      return text.startswith(" " + token, last_position)

    poses = self._tokenizer.pos(text)

    tokens = []
    last_position = 0
    for i, pos in enumerate(poses):
      org_token, tag = pos[0], pos[1]
      token = org_token
      if i == 0:
        # The first token is always emitted unchanged.
        pass
      elif tag == "Punctuation":
        if _has_preceding_space(text, org_token, last_position):  # e.g. " 'of"
          last_position += 1
        # Punctuation never gets the sub-token prefix; a run such as "!!" is
        # emitted as separate single-character tokens.
        if len(token) != 1:
          token = list(token)
      elif _has_preceding_space(text, org_token, last_position):
        # A new word starts here: skip the separating space.
        last_position += 1
      elif tag in ("Alpha", "Foreign", "Number", "URL"):
        pass
      else:
        # Continuation of the previous word: mark it as a sub-token.
        token = BERT_SUBTOKEN_PREFIX + token
      if isinstance(token, list):
        tokens.extend(token)
      else:
        tokens.append(token)
      last_position += len(org_token)

    return tokens

# t = "CHAPTER I. Down the Rabbit-Hole  Alice was beginning to get \"very\" tired of sitting by her sister on the bank,"
# t = ", \"_ You are old , Father William _,'\" said the Caterpillar . Alice folded her hands , and began : — \" You are old , Father William ,\""
# t = "at www.gutenberg.org . If you are not located in the United States , you will have to check the laws of the country where you are located before using this"

# konlply_tokenizer = KonlpyTokenizer()
# tokenized = konlply_tokenizer.tokenize(t)
# print(tokenized)



In [None]:
# Tokenize the whole corpus string with the Okt-based tokenizer.
konlply_tokenizer = KonlpyTokenizer()
tokenized_text = konlply_tokenizer.tokenize(raw_text)
# Show the start of the text next to its first tokens for a visual check.
print(raw_text[:103])
print(tokenized_text[:20])

Alice's Adventures in Wonderland by Lewis Carroll CHAPTER I. Down the Rabbit-Hole  Alice was beginning 
['Alice', "'", 's', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', 'CHAPTER', 'I', '.', 'Down', 'the', 'Rabbit', '-', 'Hole', 'Alice', 'was', 'beginning']


## 샘플링

In [None]:
# Number of consecutive tokens in each model input window.
SEQUENCE_LENGTH = 30
# Distance from the end of the window to the target token (1 = the very next token).
OFFSET = 1

In [None]:
# Slide a SEQUENCE_LENGTH-token window over the corpus one token at a time.
# Every window becomes one input; its target is the token located OFFSET
# positions after the last token of the window.
sample_count = len(tokenized_text) - SEQUENCE_LENGTH - OFFSET
input_tokens = [
    tokenized_text[start:start+SEQUENCE_LENGTH]
    for start in range(sample_count)
]
output_tokens = [
    tokenized_text[start+SEQUENCE_LENGTH+OFFSET-1]
    for start in range(sample_count)
]

In [None]:
# Number of (input window, target token) samples.
print(len(input_tokens))

39380


## 데이터 섞기

In [None]:
# np.str was only an alias of the builtin str; it was deprecated in NumPy 1.20
# and removed in NumPy 1.24, so the builtin is used directly.
input_tokens = np.array(input_tokens, dtype=str)
output_tokens = np.array(output_tokens, dtype=str)

# A single shared permutation keeps every input window aligned with its target.
indexes = np.arange(input_tokens.shape[0])
np.random.shuffle(indexes)

# Convert back to plain Python lists, which is what the tokenizer is fed later.
input_tokens = input_tokens[indexes].tolist()
output_tokens = output_tokens[indexes].tolist()


## Vocab 파일 만들기

In [None]:
import collections
from collections import OrderedDict


class KonlpyVocabMaker():
  """Build a BERT vocabulary ordered by descending token frequency.

  The given texts are tokenized with KonlpyTokenizer when the object is
  created; the resulting vocabulary is returned by get_vocab().
  """

  # Special tokens occupying the first vocabulary indices, in this order.
  # The mask token is spelled "[MASK]", which is the name BertTokenizer looks
  # up by default.
  _SPECIAL_TOKENS = ('[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]')

  def __init__(self, texts):
    self._tokens = []
    self._tokenize(texts)

  # texts  = '하늘이 푸른가요? 나는 푸른색이 좋아요'
  # tokens = ['하늘', '##이', '푸른가요', '?', '나', '##는', '푸른색', '##이', '좋아요']
  def _tokenize(self, texts):
    """Tokenize ``texts`` and store the vocabulary in ``self._tokens``.

    Args:
      texts: a single string or an iterable of strings.
    """

    konlply_tokenizer = KonlpyTokenizer()

    # Tokenize each text and gather all tokens in one list.
    if isinstance(texts, str): texts = [texts]
    all_tokens = []
    for text in tqdm(texts):
      all_tokens.extend(konlply_tokenizer.tokenize(text))

    # Unique tokens ordered by descending frequency.  most_common() sorts
    # stably, so tokens with equal counts stay in order of first appearance.
    counts = collections.Counter(all_tokens)
    sorted_tokens = [token for token, _ in counts.most_common()]

    # The special tokens come first, followed by the corpus tokens.
    self._tokens = list(self._SPECIAL_TOKENS) + sorted_tokens

  def get_vocab(self):
    """Return the vocabulary as a list of token strings."""
    return self._tokens


# t = "CHAPTER I. Down the Rabbit-Hole  Alice was beginning to get \"very\" tired of sitting by her sister on the bank,"
# konlply_tokenizer = KonlpyTokenizer()
# tokenized = konlply_tokenizer.tokenize(t)
# print(tokenized)

# Quick check: build a vocabulary from one short sample sentence and print it.
t = ", \"_ You are old , Father William _,'\" said the Caterpillar . Alice folded her hands , and began : — \" You are old , Father William ,\""
konlply_vocab_maker = KonlpyVocabMaker(t)
v = konlply_vocab_maker.get_vocab()
print(v)

100%|██████████| 1/1 [00:00<00:00, 830.72it/s]

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MSK]', ',', '"', '_', 'You', 'are', 'old', 'Father', 'William', "'", 'said', 'the', 'Caterpillar', '.', 'Alice', 'folded', 'her', 'hands', 'and', 'began', ':', '—']





### vocab 만들기 실행

In [None]:
# Build the vocabulary from the whole corpus string.
konlply_vocab_maker = KonlpyVocabMaker(raw_text)

vocab = konlply_vocab_maker.get_vocab()

100%|██████████| 1/1 [00:01<00:00,  1.24s/it]


In [None]:
print("org vocab size =",len(vocab))
# Keep at most MAX_VOCAB_SIZE entries from the front of the list.
vocab = vocab[:MAX_VOCAB_SIZE]
# vocab_size is used later as the number of units of the classifier layer.
vocab_size = len(vocab)
print("vocab_size = ", len(vocab))

org vocab size = 3412
vocab_size =  3412


In [None]:
# Peek at the first vocabulary entries.
print(vocab[:20])

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MSK]', ',', '"', 'the', '.', 'and', 'to', "'", 'a', 'of', 'I', 'it', 'she', 'said', '!', '_']


### vocab 파일 저장

In [None]:
# Write one token per line.  The encoding is pinned to UTF-8 because the
# vocabulary contains non-ASCII tokens and BertTokenizer reads its vocab file
# as UTF-8; relying on the platform default encoding could corrupt the file.
with open(CUSTOM_VOCAB_FILE, 'w', encoding='utf-8') as f:
  f.writelines("%s\n" % item for item in vocab)

In [None]:
!wc {CUSTOM_VOCAB_FILE}

 3412  3412 24370 custom_vocab.txt


## Bert Tokenizer 생성

In [None]:
# Build a BERT tokenizer on top of the custom vocabulary file.  Casing is
# preserved, and the maximum length is the BERT_SEQUENCE_LENGTH constant
# defined in the setup cell.
bert_tokenizer = BertTokenizer(vocab_file=CUSTOM_VOCAB_FILE, do_lower_case=False, model_max_length=BERT_SEQUENCE_LENGTH)

In [None]:
# Encode one sample window to inspect what the tokenizer produces.
bert_tokenized = bert_tokenizer(input_tokens[100], max_length=BERT_SEQUENCE_LENGTH, padding='max_length', is_pretokenized=True, )
print("vocab              :", vocab[:20])
print("original sentence  :", input_tokens[100])
print("tokens             :", bert_tokenizer.convert_ids_to_tokens(bert_tokenized['input_ids']))
print("token id           :", bert_tokenized['input_ids'])
print("attention mask     :", bert_tokenized['attention_mask'])
print("token type         :", bert_tokenized['token_type_ids'])

vocab              : ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MSK]', ',', '"', 'the', '.', 'and', 'to', "'", 'a', 'of', 'I', 'it', 'she', 'said', '!', '_']
original sentence  : ['I', 'don', "'", 't', 'like', 'them', 'raw', '.', '"', '"', 'Well', ',', 'be', 'off', ',', 'then', '!', '"', 'said', 'the', 'Pigeon', 'in', 'a', 'sulky', 'tone', ',', 'as', 'it', 'settled', 'down']
tokens             : ['[CLS]', 'I', 'don', "'", 't', 'like', 'them', 'raw', '.', '"', '"', 'Well', ',', 'be', 'off', ',', 'then', '!', '"', 'said', 'the', 'Pigeon', 'in', 'a', 'sulky', 'tone', ',', 'as', 'it', 'settled', 'down', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '

![bert_input_architecture](https://user-images.githubusercontent.com/1250095/50039788-8e4e8a00-007b-11e9-9747-8e29fbbea0b3.png)

## x, y 생성


tokenizer 사용 중에 경고 메시지가 많이 뜬다. 억제한다.


In [None]:
import logging
# Only emit log records at ERROR level or above to silence the tokenizer warnings.
logging.basicConfig(level=logging.ERROR)

In [None]:
from random import randrange

def build_model_input_output(input_tokens, output_tokens):
  """Convert token windows and their target tokens into model arrays.

  Args:
    input_tokens: sequence of pre-tokenized inputs (each one a list of tokens).
    output_tokens: sequence holding the target token of each input.

  Returns:
    A pair ``((input_ids, attention_masks, token_type_ids), labels)`` of numpy
    arrays, where ``labels`` holds the index of each target token in the
    module-level ``vocab`` list.

  Raises:
    ValueError: if a target token is not contained in ``vocab``.
  """
  # Build the token -> index table once instead of scanning the vocab list for
  # every sample.  setdefault keeps the first index of a token, which is what
  # list.index() would return.
  token_to_index = {}
  for index, token in enumerate(vocab):
    token_to_index.setdefault(token, index)

  input_ids = []
  attention_masks = []
  token_type_ids = []
  labels = []

  for input_token, output_token in zip(input_tokens, output_tokens):
    bert_tokenized = bert_tokenizer(input_token, max_length=BERT_SEQUENCE_LENGTH, padding='max_length', is_pretokenized=True)
    # bert_tokenized = {'input_ids': [101, ...], 'token_type_ids': [0, ...], 'attention_mask': [1, ...]}
    # The tokenizer call does not request truncation, so the result can be
    # longer than max_length; slice every field to the fixed length.
    input_ids.append(bert_tokenized['input_ids'][:BERT_SEQUENCE_LENGTH])
    attention_masks.append(bert_tokenized['attention_mask'][:BERT_SEQUENCE_LENGTH])
    token_type_ids.append(bert_tokenized['token_type_ids'][:BERT_SEQUENCE_LENGTH])
    try:
      labels.append(token_to_index[output_token])
    except KeyError:
      # Same exception type that list.index() raises for a missing token.
      raise ValueError("%r is not in vocab" % (output_token,)) from None

  return (np.array(input_ids), np.array(attention_masks), np.array(token_type_ids)), np.array(labels)

# MAX_DATA_COUNT = 9
# x, y = build_model_input_output(input_tokens[:MAX_DATA_COUNT], output_tokens[:MAX_DATA_COUNT])  


In [None]:
# Upper bound on the number of samples converted to model inputs.
MAX_DATA_COUNT = 10000 * 10
x, y = build_model_input_output(input_tokens[:MAX_DATA_COUNT], output_tokens[:MAX_DATA_COUNT])  

## train/test 분리

In [None]:
def split_bert_data(x, y, test_ratio):
  """Split BERT inputs and labels into a leading train part and a trailing test part.

  Args:
    x: indexable holding three parallel sequences at positions 0, 1 and 2.
    y: sequence of labels aligned with the sequences in ``x``.
    test_ratio: fraction of the samples, taken from the end, used for testing.

  Returns:
    ``((train_x, train_y), (test_x, test_y))`` where each ``*_x`` is a 3-tuple.
  """
  boundary = int(len(y)*(1-test_ratio))
  head = slice(None, boundary)
  tail = slice(boundary, None)

  train_part = (tuple(x[k][head] for k in range(3)), y[head])
  test_part = (tuple(x[k][tail] for k in range(3)), y[tail])
  return train_part, test_part

# Hold out the last 20% of the samples as the test set.
(train_x, train_y), (test_x, test_y) = split_bert_data(x, y, test_ratio=0.2)

In [None]:
# Inspect the first training sample: decoded text, then the first 35 entries
# of its token ids, attention mask and token type ids.
print(bert_tokenizer.decode(train_x[0][0][:35]))
print(train_x[0][0][:35])
print(train_x[1][0][:35])
print(train_x[2][0][:35])

[CLS] and she told her sister, as well as she could remember them, all these strange Adventures of hers that you have just been reading about ; and when [SEP] [PAD] [PAD] [PAD]
[   2    9   16  686   27  459    5   26  144   26   16   73  319   59
    5   36  308  752 1006   13  969   24   21   67  131  154  833   54
   37    9   83    3    0    0    0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


# 학습

## 모델 생성

In [None]:
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.layers import Dense, Dropout

class TFBertClassifier(tf.keras.Model):
  """Pretrained BERT followed by dropout and a softmax layer over the custom vocab."""

  def __init__(self):
    super().__init__()

    self.bert = TFBertModel.from_pretrained(BERT_MODEL_NAME, trainable=True)
    bert_config = self.bert.config
    self.dropout = Dropout(bert_config.hidden_dropout_prob)
    # The output layer has one unit per vocabulary entry and is initialized
    # with the range taken from the BERT configuration.
    head_initializer = TruncatedNormal(bert_config.initializer_range)
    self.classifier = Dense(vocab_size, activation="softmax", name="classifier",
                            kernel_initializer=head_initializer)

  def call(self, inputs, training=True):
    """Return the softmax output of the classifier for ``inputs``.

    Args:
      inputs: passed unchanged to the wrapped BERT model.
      training: forwarded to the dropout layer.
    """
    # BERT outputs: sequence_output, pooled_output, (hidden_states), (attentions)
    bert_outputs = self.bert(inputs)
    pooled = bert_outputs[1]
    return self.classifier(self.dropout(pooled, training=training))


참고로 Bert의 default 설정은 다음과 같다.

In [None]:
# Instantiate the classifier; the compile, fit, evaluate and predict cells
# that follow all operate on this `model` object.
model = TFBertClassifier()
print(model.bert.config)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 119547
}



In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

optimizer = Adam(3e-5)
# The labels are integer vocabulary indices, hence the sparse categorical loss.
loss = SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])


## 학습 실행

In [None]:
# Shape of the training label array.
print(train_y.shape)

(31504,)


In [None]:
# Train for one epoch; 10% of the training data is set aside for validation.
history = model.fit(train_x, train_y, epochs=1, batch_size=32, validation_split=0.1)



In [None]:
# Evaluate on the test split and report loss and accuracy.
loss, acc = model.evaluate(test_x, test_y, batch_size=32)
print("loss =", loss)
print("acc =", acc)

loss = 5.638205528259277
acc = 0.1342051774263382


## 분류 실행

In [None]:
def do_predict(test_input, test_output, top_k=5, max_display_tokens=35):
  """Print the decoded input next to the true token and the top predictions.

  Args:
    test_input: tuple ``(input_ids, attention_mask, token_type_ids)`` fed to
      ``model.predict``; only the first row is reported.
    test_output: vocab index of the true target token.
    top_k: number of highest-scoring vocabulary entries to print (must be
      positive).
    max_display_tokens: number of leading token ids decoded for display.
  """
  y_ = model.predict(test_input)
  # argsort is ascending: take the last top_k indices and reverse them so the
  # highest-scoring entry comes first.
  predicted = y_[0].argsort()[-top_k:][::-1]
  org_input = bert_tokenizer.decode(test_input[0][0][:max_display_tokens])
  # Remove the special tokens from the decoded text.
  for special_token in ("[CLS]", "[SEP]", "[PAD]"):
    org_input = org_input.replace(special_token, "")
  # NOTE(review): as written this replaces a space with a space — confirm
  # whether collapsing repeated spaces was intended here.
  org_input = org_input.replace(" ", " ")
  print( org_input, ", --> TRUTH :", vocab[test_output],  ", EXPECTED :", [vocab[i] for i in predicted])

# Show predictions for the first five test samples, one single-row batch per call.
for i in range(5):
  do_predict((test_x[0][i:i+1], test_x[1][i:i+1], test_x[2][i:i+1]), test_y[i])


 into a conversation. " You don't know much, " said the Duchess ; " and that's a fact. " Alice did not at     , --> TRUTH : all , EXPECTED : [',', 'the', "'", '.', 'a']
 distributing, performing, displaying or creating derivative works based on the work as long as all references to Project Gutenberg are removed. Of course, we hope that     , --> TRUTH : you , EXPECTED : [',', 'the', '.', 'a', "'"]
 the King had said that day. " That _ proves _ his guilt, " said the Queen. " It proves nothing of the sort! " said     , --> TRUTH : Alice , EXPECTED : ['the', ',', "'", 'a', 'you']
 without speaking, but at last it unfolded its arms, took the hookah out of its mouth again, and said, " So you think you're     , --> TRUTH : changed , EXPECTED : ['the', 'a', "'", 'she', 'be']
 s making personal remarks now? " the Hatter asked triumphantly. Alice did not quite know what to say to this : so she helped herself to some tea     , --> TRUTH : and , EXPECTED : [',', 'of', '.', 'to', '!']
