In [10]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only API).
# NOTE(review): no cell visible in this notebook reads from the mount — confirm it is still needed.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.5 MB/s[0m eta [36m0:00:0

In [None]:
# 참고자료 : https://github.com/MrBananaHuman/KorNlpTutorial/blob/main/1_%ED%95%9C%EA%B5%AD%EC%96%B4_tokenizing.ipynb

In [3]:
import pandas as pd
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # tokenizer of the bert-base-uncased checkpoint

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
# Split an English sentence into word pieces.
result = tokenizer.tokenize('Here is the sentence I want embeddings for.')  # a leading '##' marks a piece that continues the preceding token
print(result)

['here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', '.']


In [7]:
# Look up the integer id the vocabulary assigns to the token 'here'.
print(tokenizer.vocab['here'])

2182


In [8]:
# Save BERT's vocabulary to vocabulary.txt, one token per line.
# The encoding is pinned to UTF-8: the vocabulary contains non-ASCII tokens,
# and without it open() falls back to the platform's default encoding, which
# is not guaranteed to be able to encode them.
with open('vocabulary.txt', 'w', encoding='utf-8') as f:
    f.writelines(token + '\n' for token in tokenizer.vocab.keys())

In [9]:
# Load the saved vocabulary back as a one-column DataFrame (column label 0),
# one row per token.
# The file is read line by line instead of through pd.read_fwf: read_fwf is a
# fixed-width table parser that infers column extents from the leading rows and
# applies NA-value parsing, so it may alter or blank out tokens rather than
# return each line verbatim.
with open('vocabulary.txt', encoding='utf-8') as vocab_file:
    df = pd.DataFrame([line.rstrip('\n') for line in vocab_file])
df

Unnamed: 0,0
0,[PAD]
1,[unused0]
2,[unused1]
3,[unused2]
4,[unused3]
...,...
30517,##．
30518,##／
30519,##：
30520,##？


In [None]:
# MLM (Masked Language Model)

In [1]:
# 영어 bert

In [11]:
from transformers import TFBertForMaskedLM
from transformers import AutoTokenizer

In [12]:
# English BERT-large (uncased): matching tokenizer and masked-LM model.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
model = TFBertForMaskedLM.from_pretrained('bert-large-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
inputs = tokenizer('Soccer is a really fun [MASK].', return_tensors='tf')  # encode the sentence and return TensorFlow tensors

In [14]:
print(inputs['input_ids'])  # the sentence converted to integer token ids

tf.Tensor([[ 101 4715 2003 1037 2428 4569  103 1012  102]], shape=(1, 9), dtype=int32)


In [15]:
print(inputs['token_type_ids'])  # segment ids that tell two sentences in one input apart; all 0 for a single sentence

tf.Tensor([[0 0 0 0 0 0 0 0 0]], shape=(1, 9), dtype=int32)


In [16]:
print(inputs['attention_mask'])  # all 1s: nothing has been padded, so every position holds a real token

tf.Tensor([[1 1 1 1 1 1 1 1 1]], shape=(1, 9), dtype=int32)


In [17]:
from transformers import FillMaskPipeline  # fill-mask pipeline class from the Hugging Face transformers library
# Wrap the already-loaded masked-LM model and its tokenizer in a pipeline.
# NOTE(review): the variable name `pip` may shadow IPython's `pip` automagic — confirm before relying on a bare `pip install` line.
pip = FillMaskPipeline(model=model, tokenizer=tokenizer)  # uses the pretrained language model

In [18]:
# Ask the pipeline for its candidates for the [MASK] position.
pip('Soccer is a really fun [MASK].')

[{'score': 0.7621126770973206,
  'token': 4368,
  'token_str': 'sport',
  'sequence': 'soccer is a really fun sport.'},
 {'score': 0.20341919362545013,
  'token': 2208,
  'token_str': 'game',
  'sequence': 'soccer is a really fun game.'},
 {'score': 0.012208538129925728,
  'token': 2518,
  'token_str': 'thing',
  'sequence': 'soccer is a really fun thing.'},
 {'score': 0.0018630228005349636,
  'token': 4023,
  'token_str': 'activity',
  'sequence': 'soccer is a really fun activity.'},
 {'score': 0.001335486420430243,
  'token': 2492,
  'token_str': 'field',
  'sequence': 'soccer is a really fun field.'}]

In [21]:
# Second fill-mask example, this time with the mask in the middle of the sentence.
pip("It's very [MASK] today")

[{'score': 0.2631320357322693,
  'token': 2980,
  'token_str': 'hot',
  'sequence': "it's very hot today"},
 {'score': 0.16468864679336548,
  'token': 3147,
  'token_str': 'cold',
  'sequence': "it's very cold today"},
 {'score': 0.09776042401790619,
  'token': 4251,
  'token_str': 'quiet',
  'sequence': "it's very quiet today"},
 {'score': 0.07596306502819061,
  'token': 4010,
  'token_str': 'warm',
  'sequence': "it's very warm today"},
 {'score': 0.03424697369337082,
  'token': 3835,
  'token_str': 'nice',
  'sequence': "it's very nice today"}]

In [22]:
# Korean BERT (klue/bert-base): matching tokenizer and masked-LM model.
# from_pt=True loads the TensorFlow model from the PyTorch checkpoint.
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
model = TFBertForMaskedLM.from_pretrained('klue/bert-base', from_pt=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForMaskedLM: ['cls.predictions.decoder.bias', 'bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForMaskedLM from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForMaskedLM from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [23]:
# Encode the Korean masked sentence as TensorFlow tensors.
# NOTE(review): `inputs` is not read again by any cell visible here — confirm whether this cell is still needed.
inputs = tokenizer('축구는 정말 재미있는 [MASK]다.', return_tensors='tf')

In [27]:
# Rebuild the pipeline so that it wraps the model and tokenizer currently bound to `model` / `tokenizer`.
pip = FillMaskPipeline(model=model, tokenizer=tokenizer)

In [28]:
# Ask the pipeline for its candidates for the [MASK] position in the Korean sentence.
pip('축구는 정말 재미있는 [MASK]다.')

[{'score': 0.8963516354560852,
  'token': 4559,
  'token_str': '스포츠',
  'sequence': '축구는 정말 재미있는 스포츠 다.'},
 {'score': 0.02595745585858822,
  'token': 568,
  'token_str': '거',
  'sequence': '축구는 정말 재미있는 거 다.'},
 {'score': 0.010033913888037205,
  'token': 3682,
  'token_str': '경기',
  'sequence': '축구는 정말 재미있는 경기 다.'},
 {'score': 0.007924334146082401,
  'token': 4713,
  'token_str': '축구',
  'sequence': '축구는 정말 재미있는 축구 다.'},
 {'score': 0.007844174280762672,
  'token': 5845,
  'token_str': '놀이',
  'sequence': '축구는 정말 재미있는 놀이 다.'}]

In [None]:
# NSP (Next Sentence Prediction)

In [29]:
import tensorflow as tf
from transformers import TFBertForNextSentencePrediction  # 두 문장의 관계 파악 모델
from transformers import AutoTokenizer  # 모델의 이름을 입력하면 해당 모델에 맞는 토크나이저 자동선택 및 생성

In [30]:
# English BERT-base (uncased): matching tokenizer and next-sentence-prediction model.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForNextSentencePrediction.

All the weights of TFBertForNextSentencePrediction were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForNextSentencePrediction for predictions without further training.


Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [31]:
# Sentence A of the pair (the preceding sentence).
prompt = (
    "In Italy, pizza served in formal settings, "
    "such as at a restaurant, is presented unsliced."
)
# Sentence B of the pair (the candidate next sentence). Built from adjacent
# literals so that no newline or indentation ends up inside the text.
next_sentence = (
    "pizza is eaten with the use of a knife and fork. "
    "In casual settings, however, "
    "it is cut into wedges to be eaten while held in the hand."
)

In [32]:
encoding = tokenizer(prompt, next_sentence, return_tensors='tf')  # tokenize prompt and next_sentence together as one sentence pair

In [33]:
# Token ids of the encoded sentence pair.
print(encoding['input_ids'])

tf.Tensor(
[[  101  1999  3304  1010 10733  2366  1999  5337 10906  1010  2107  2004
   2012  1037  4825  1010  2003  3591  4895 14540  6610  2094  1012   102
  10733  2003  8828  2007  1996  2224  1997  1037  5442  1998  9292  1012
   1999 10017 10906  1010  2174  1010  2009  2003  3013  2046 17632  2015
   2000  2022  8828  2096  2218  1999  1996  2192  1012   102]], shape=(1, 58), dtype=int32)


In [34]:
# Show the special tokens next to their vocabulary ids:
# first the sentence-start marker, then the sentence separator.
print(f"{tokenizer.cls_token} : {tokenizer.cls_token_id}")
print(f"{tokenizer.sep_token} : {tokenizer.sep_token_id}")

[CLS] : 101
[SEP] : 102


In [35]:
# Decode the ids of the first (only) row back to text to see where the special tokens sit.
print(tokenizer.decode(encoding['input_ids'][0]))

[CLS] in italy, pizza served in formal settings, such as at a restaurant, is presented unsliced. [SEP] pizza is eaten with the use of a knife and fork. in casual settings, however, it is cut into wedges to be eaten while held in the hand. [SEP]


In [36]:
# Score the sentence pair with the next-sentence-prediction head.
# The tokenizer's attention mask is forwarded together with the ids and segment
# ids. For this single, unpadded pair it is expected to be all ones, so the
# logits are the same as when it is omitted; passing it keeps the call correct
# if padded batches are ever used.
logits = model(
    encoding['input_ids'],
    attention_mask=encoding['attention_mask'],
    token_type_ids=encoding['token_type_ids'],
)[0]
softmax = tf.keras.layers.Softmax()
probs = softmax(logits)
# Result layout: [P(class 0: next_sentence continues prompt), P(class 1: unrelated sentence)]
print(probs)

tf.Tensor([[9.9999714e-01 2.8381855e-06]], shape=(1, 2), dtype=float32)


In [None]:
# 한국어 문장 관계 파악

In [39]:
# Korean BERT (klue/bert-base): matching tokenizer and next-sentence-prediction model.
# from_pt=True loads the TensorFlow model from the PyTorch checkpoint.
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
model = TFBertForNextSentencePrediction.from_pretrained('klue/bert-base', from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForNextSentencePrediction: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForNextSentencePrediction from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForNextSentencePrediction from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForNextSentencePrediction were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForNextSentencePrediction for predictions without further training.


In [40]:
# Case 1: two sentences that follow one another.
prompt = "2002년 월드컵 축구대회는 일본과 공동으로 개최되었던 세계적인 큰 잔치입니다."
next_sentence = "여행을 가보니 한국의 2002년 월드컵 축구대회의 준비는 완벽했습니다."

softmax = tf.keras.layers.Softmax()
encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
logits = model(
    encoding['input_ids'],
    token_type_ids=encoding['token_type_ids'],
)[0]
probs = softmax(logits)

# Index of the larger of the two class probabilities.
print('최종 예측 레이블 :', tf.argmax(probs, axis=-1).numpy())

최종 예측 레이블 : [0]


In [41]:
# Case 2: two sentences that have nothing to do with each other.
prompt = "2002년 월드컵 축구대회는 일본과 공동으로 개최되었던 세계적인 큰 잔치입니다."
next_sentence = "극장가서 로맨스 영화를 보고싶어요"

softmax = tf.keras.layers.Softmax()
encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
logits = model(
    encoding['input_ids'],
    token_type_ids=encoding['token_type_ids'],
)[0]
probs = softmax(logits)

# Index of the larger of the two class probabilities.
print('최종 예측 레이블 :', tf.argmax(probs, axis=-1).numpy())

최종 예측 레이블 : [1]


In [42]:
import numpy as np
import random
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFGPT2LMHeadModel

In [43]:
# Korean GPT-2 (skt/kogpt2-base-v2): matching tokenizer and language-model head.
# from_pt=True loads the TensorFlow model from the PyTorch checkpoint.
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2')
model = TFGPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2', from_pt=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.5.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'lm_head.weight', 'transformer.h.0.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassifica

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [44]:
# Korean prompt that the generation cells below continue.
sent = '근육이 커지기 위해서는'

In [45]:
# Encode the prompt and wrap the id list in an outer list so the resulting
# tensor (a multi-dimensional array) carries a leading batch axis of size 1.
input_ids = tf.convert_to_tensor([tokenizer.encode(sent)])
print(input_ids)

tf.Tensor([[33245 10114 12748 11357]], shape=(1, 4), dtype=int32)


In [48]:
# Let the model continue the prompt; max_length caps the total number of ids
# in the returned sequence, prompt included.
output = model.generate(
    input_ids,
    use_cache=True,
    repetition_penalty=2.0,
    max_length=10,
)
# Take the first (only) sequence of the batch as a plain Python list of ids.
output_ids = output.numpy()[0].tolist()
print(output_ids)

[33245, 10114, 12748, 11357, 23879, 39306, 9684, 7884, 10211, 15177]


In [49]:
# Turn the generated ids back into text.
tokenizer.decode(output_ids)

'근육이 커지기 위해서는 무엇보다 규칙적인 생활습관이 중요하다.\n'

In [50]:
# Single forward pass over the prompt ids.
output = model(input_ids)
# Keep the five highest logits at index [0, -1], i.e. the last position of the first row
# (assumes logits is laid out as (batch, sequence, vocabulary) — TODO confirm).
top5 = tf.math.top_k(output.logits[0, -1], k=5)

In [51]:
# Map the five selected ids to their token strings.
tokenizer.convert_ids_to_tokens(top5.indices.numpy())

['▁무엇보다', '▁우선', '▁반드시', '▁피부', '▁무엇보다도']

In [52]:
# Manual top-5 sampling: keep extending the prompt, one token at a time, with
# a token drawn uniformly from the five highest-scoring next-token candidates.
random.seed(42)  # fixed seed so a fresh Restart & Run All reproduces the same text
sent = '근육이 커지기 위해서는'
input_ids = tokenizer.encode(sent)

# Stop once the sequence (prompt included) holds 50 ids.
while len(input_ids) < 50:
    # Re-run the model on everything generated so far (batch of one).
    output = model(np.array([input_ids]))
    # Five highest logits at the last position of the first row.
    top5 = tf.math.top_k(output.logits[0, -1], k=5)
    # .tolist() yields plain Python ints, so input_ids stays a homogeneous
    # list of int instead of accumulating numpy scalars.
    token_id = random.choice(top5.indices.numpy().tolist())
    input_ids.append(token_id)

tokenizer.decode(input_ids)

'근육이 커지기 위해서는 피부 속 수분이 증발하면서 수분이 증발되는 것이 가장 중요하죠.\n그렇게 하면 피부가 촉각을 잃을 수 있고 수분 공급과 수분 공급이 잘되지 않는 등 부작용이 일어날 수가 있기 때문입니다. 고민이 많으니 바로 팩이나'