In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0


In [2]:
import transformers
transformers.__version__

'4.26.0'

In [3]:
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

In [4]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x7fc698537b20>)

In [5]:
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

In [6]:
print(train_data.shape)
print(test_data.shape)

(150000, 3)
(50000, 3)


In [7]:
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [8]:
train_data.dropna(how='any', inplace=True)
train_data.reset_index(drop=True, inplace=True)
train_data.isnull().values.any()

False

In [9]:
test_data.dropna(how='any', inplace=True)
test_data.reset_index(drop=True, inplace=True)
test_data.isnull().values.any()

False

In [10]:
print(train_data.shape)
print(test_data.shape)

(149995, 3)
(49997, 3)


In [11]:
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

In [12]:
print(tokenizer.encode("너희들은 안늙었구나.."))

[2, 6928, 2031, 2073, 1378, 2875, 2359, 6074, 18, 18, 3]


In [13]:
print(tokenizer.encode("첫타임으로 보고왔는데 걍미쳤음 주인공이 송태섭이라서 그런지몰라도 후반 산왕프레스 뚫을때 소름끼치더라 마지막 하이파이브장면도 지리고"))

[2, 1656, 15302, 6233, 4530, 2907, 13964, 566, 2044, 2954, 2053, 5093, 2052, 1288, 2260, 2474, 2052, 2181, 2112, 9408, 2549, 5121, 5683, 1235, 2305, 23578, 925, 2069, 2775, 16109, 2274, 2225, 23677, 4178, 4899, 11822, 2645, 14014, 2119, 6959, 2088, 3]


In [14]:
for elem in tokenizer.encode("첫타임으로 보고왔는데 걍미쳤음 주인공이 송태섭이라서 그런지몰라도 후반 산왕프레스 뚫을때 소름끼치더라 마지막 하이파이브장면도 지리고"):
  print(tokenizer.decode(elem))

[ C L S ]
첫
# # 타 임
# # 으 로
보 고
# # 왔
# # 는 데
걍
# # 미
# # 쳤
# # 음
주 인 공
# # 이
송
# # 태
# # 섭
# # 이
# # 라
# # 서
그 런 지
# # 몰
# # 라 도
후 반
산
# # 왕
# # 프 레 스
뚫
# # 을
# # 때
소 름
# # 끼
# # 치
# # 더 라
마 지 막
하 이
# # 파 이
# # 브
# # 장 면
# # 도
지 리
# # 고
[ S E P ]


In [15]:
print(tokenizer.tokenize("종이로 인쇄되어있던 산왕전이 살아 움직이는 그자체로 평점 만점.미완으로 마쳤던 원작 송태섭의 완성."))
print(len(tokenizer.tokenize("종이로 인쇄되어있던 산왕전이 살아 움직이는 그자체로 평점 만점.미완으로 마쳤던 원작 송태섭의 완성.")))

['종이', '##로', '인쇄', '##되', '##어', '##있', '##던', '산', '##왕', '##전', '##이', '살아', '움직이', '##는', '그자', '##체', '##로', '평점', '만점', '.', '미완', '##으로', '마쳤', '##던', '원작', '송', '##태', '##섭', '##의', '완성', '.']
31


In [16]:
print(tokenizer.encode("종이로 인쇄되어있던 산왕전이 살아 움직이는 그자체로 평점 만점.미완으로 마쳤던 원작 송태섭의 완성."))
print(len(tokenizer.encode("종이로 인쇄되어있던 산왕전이 살아 움직이는 그자체로 평점 만점.미완으로 마쳤던 원작 송태섭의 완성.")))

[2, 5982, 2200, 9017, 2496, 2051, 2689, 2414, 1235, 2305, 2165, 2052, 4996, 5375, 2259, 20324, 2093, 2200, 20609, 10732, 18, 21564, 6233, 7994, 2414, 9990, 1288, 2260, 2474, 2079, 4976, 18, 3]
33


In [17]:
print(tokenizer.decode(2))
print(tokenizer.decode(3))

[ C L S ]
[ S E P ]


In [18]:
print(tokenizer.cls_token, ':', tokenizer.cls_token_id)
print(tokenizer.sep_token, ':' , tokenizer.sep_token_id)
print(tokenizer.pad_token, ':', tokenizer.pad_token_id)

[CLS] : 2
[SEP] : 3
[PAD] : 0


In [19]:
max_seq_len = 128

In [20]:
encoded_result = tokenizer.encode("종이로 인쇄되어있던 산왕전이 살아 움직이는 그자체로 평점 만점.미완으로 마쳤던 원작 송태섭의 완성.", max_length=max_seq_len, pad_to_max_length=True)
print(encoded_result)
print('encoding length :', len(encoded_result))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[2, 5982, 2200, 9017, 2496, 2051, 2689, 2414, 1235, 2305, 2165, 2052, 4996, 5375, 2259, 20324, 2093, 2200, 20609, 10732, 18, 21564, 6233, 7994, 2414, 9990, 1288, 2260, 2474, 2079, 4976, 18, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
encoding length : 128




In [21]:
segment_input = [0]*max_seq_len
print(segment_input)
print(len(segment_input))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
128


In [22]:
valid_num = len(tokenizer.encode("종이로 인쇄되어있던 산왕전이 살아 움직이는 그자체로 평점 만점.미완으로 마쳤던 원작 송태섭의 완성."))
mask_input = valid_num * [1] + (max_seq_len - valid_num) * [0]
print(valid_num)
print(mask_input)
print(len(mask_input))

33
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
128


In [23]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):

    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        input_id = tokenizer.encode(example, max_length=max_seq_len, pad_to_max_length=True)
        padding_count = input_id.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
        token_type_id = [0] * max_seq_len

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)

    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

In [24]:
train_data['document'].head()

0                                  아 더빙.. 진짜 짜증나네요 목소리
1                    흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
2                                    너무재밓었다그래서보는것을추천한다
3                        교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
4    사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...
Name: document, dtype: object

In [25]:
train_data['label']

0         0
1         1
2         0
3         0
4         1
         ..
149990    0
149991    1
149992    0
149993    1
149994    0
Name: label, Length: 149995, dtype: int64

In [26]:
train_X, train_y = convert_examples_to_features(train_data['document'], train_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 149995/149995 [01:48<00:00, 1380.20it/s]


In [27]:
train_X[0]

array([[   2, 1376,  831, ...,    0,    0,    0],
       [   2, 1963,   18, ...,    0,    0,    0],
       [   2,    1,    3, ...,    0,    0,    0],
       ...,
       [   2, 4380, 1097, ...,    0,    0,    0],
       [   2, 9300, 3771, ...,    0,    0,    0],
       [   2, 3629, 3771, ...,    0,    0,    0]])

In [28]:
input_id = train_X[0][0]
attention_mask = train_X[1][0]
token_type_id = train_X[2][0]
label = train_y[0]

print('단어에 대한 정수 인코딩 :',input_id)
print()
print('어텐션 마스크 :',attention_mask)
print()
print('세그먼트 인코딩 :',token_type_id)
print()
print('각 인코딩의 길이 :', len(input_id))
print()
print('정수 인코딩 복원 :',tokenizer.decode(input_id))
print()
print('레이블 :',label)

단어에 대한 정수 인코딩 : [   2 1376  831 2604   18   18 4229 9801 2075 2203 2182 4243    3    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]

어텐션 마스크 : [1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

세그먼트 인코딩 : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [29]:
model = TFBertModel.from_pretrained('klue/bert-base', from_pt=True)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'bert.embeddings.position_ids', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [30]:
max_seq_len = 128

In [31]:
input_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
attention_masks_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
token_type_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)

outputs = model([input_ids_layer, attention_masks_layer, token_type_ids_layer])

In [32]:
outputs

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'tf_bert_model')>, pooler_output=<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'tf_bert_model')>, past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)

In [33]:
print(outputs[0])

KerasTensor(type_spec=TensorSpec(shape=(None, 128, 768), dtype=tf.float32, name=None), name='tf_bert_model/bert/encoder/layer_._11/output/LayerNorm/batchnorm/add_1:0', description="created by layer 'tf_bert_model'")


In [34]:
print(outputs[1])

KerasTensor(type_spec=TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), name='tf_bert_model/bert/pooler/dense/Tanh:0', description="created by layer 'tf_bert_model'")


In [35]:
class TFBertForSequenceClassification(tf.keras.Model):
    def __init__(self, model_name):
        super(TFBertForSequenceClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(1,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                activation='sigmoid',
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls_token = outputs[1]
        prediction = self.classifier(cls_token)

        return prediction

In [37]:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

<tensorflow.python.tpu.topology.Topology at 0x7fc6953daa60>

In [38]:
strategy = tf.distribute.experimental.TPUStrategy(resolver)



In [39]:
with strategy.scope():
  model = TFBertForSequenceClassification("klue/bert-base")
  optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
  loss = tf.keras.losses.BinaryCrossentropy()
  model.compile(optimizer=optimizer, loss=loss, metrics = ['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'bert.embeddings.position_ids', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [41]:
model.fit(train_X, train_y, epochs=2, batch_size=64, validation_split=0.2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc690674100>

In [43]:
test_X, test_y = convert_examples_to_features(test_data['document'], test_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 49997/49997 [00:38<00:00, 1298.30it/s]


In [56]:
print(len(test_X[0]))
print(test_y.shape)

49997
(49997,)


In [57]:
results = model.evaluate(test_X, test_y, batch_size=1024)
print("test loss, test acc: ", results)

test loss, test acc:  [0.2601080536842346, 0.898193895816803]


In [58]:
def sentiment_predict(new_sentence):
  input_id = tokenizer.encode(new_sentence, max_length=max_seq_len, pad_to_max_length=True)

  padding_count = input_id.count(tokenizer.pad_token_id)
  attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
  token_type_id = [0] * max_seq_len

  input_ids = np.array([input_id])
  attention_masks = np.array([attention_mask])
  token_type_ids = np.array([token_type_id])

  encoded_input = [input_ids, attention_masks, token_type_ids]
  score = model.predict(encoded_input)[0][0]

  if(score > 0.5):
    print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score * 100))
  else:
    print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1 - score) * 100))

In [60]:
sentiment_predict('시대를 초월한 걸작의 감동')

99.08% 확률로 긍정 리뷰입니다.



In [61]:
sentiment_predict('이영화가 왜 이리 이슈가 된거죠?늙어서 근가? 별루 인던데..거참')

77.17% 확률로 부정 리뷰입니다.



In [62]:
sentiment_predict(' 야 이걸 2시간 넘게 질질 끈다고..?')

99.72% 확률로 부정 리뷰입니다.



In [63]:
sentiment_predict('이미 만화책에 나온 내용이 반복해서 나온 게 대부분이고 주인공이 바껴서 농구를 하게 된 계기(?), 다른 등장인물의 비하인드 스토리 조금 소개하는 정도이지 새로운 게 별로 없음.음향, 노래도 연결이 매끄럽지 못함. ...')

99.69% 확률로 부정 리뷰입니다.

