<a href="https://colab.research.google.com/github/godpeny/laboratory/blob/master/Study/NLP_Using_Deep_Learning/Bidirectional_Encoder_Representation_From_Transformer/naver_movie_reivew_classification_with_bert_ko.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Default Setting For Using TPU in Google Colab

In [1]:
import tensorflow as tf
import os

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])

tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

<tensorflow.python.tpu.topology.Topology at 0x7aa4baef8bb0>

In [2]:
strategy = tf.distribute.TPUStrategy(resolver)

In [5]:
import numpy as np
import pandas as pd
import urllib.request

import tensorflow as tf
import keras

import transformers

In [7]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

print(train_data.tail(10))
print(test_data.tail(10))

              id                                           document  label
149990   6373651                                       이걸 영화라고 찎었냐?      0
149991   9492905  http://blog.naver.com/oroblast/220215679580 나쁜...      1
149992   9335962  공포나 재난영화가 아니라 아예 대놓고 비급 크리쳐개그물임ㅋㅋ 음악 완전 흥겹다ㅋ 5...      0
149993  10020916                 For Carl.칼 세이건으로 시작해서 칼 세이건으로 끝난다.      1
149994   9458520               디케이드 다음에 더블 다음에 오즈인데 더블은 조금밖에 안나오네요.      1
149995   6222902                                인간이 문제지.. 소는 뭔죄인가..      0
149996   8549745                                      평점이 너무 낮아서...      1
149997   9311800                    이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?      0
149998   2376369                        청춘 영화의 최고봉.방황과 우울했던 날들의 자화상      1
149999   9619869                           한국 영화 최초로 수간하는 내용이 담긴 영화      0
            id                                        document  label
49990  9757200                           제발 국뽕김치영화좀 그만 만들어라...      0
49991  9653062                     

# Data Preprocessing

In [8]:
# check len
print(len(train_data), len(test_data))

# check null and drop
print(train_data.isnull().values.any(), test_data.isnull().values.any())
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)
print(train_data.isnull().values.any(), test_data.isnull().values.any())

# check duplicated and drop
print(train_data.nunique(), test_data.nunique())
train_data.drop_duplicates(subset=['document'], inplace=True)
test_data.drop_duplicates(subset=['document'], inplace=True)
print(train_data.nunique(), test_data.nunique())

print(len(train_data), len(test_data))

150000 50000
True True
False False
id          149995
document    146182
label            2
dtype: int64 id          49997
document    49157
label           2
dtype: int64
id          146182
document    146182
label            2
dtype: int64 id          49157
document    49157
label           2
dtype: int64
146182 49157


# Tokenizing

In [9]:
def tokenize(sentences, labels, max_seq_len, tokenizer):
    tokenized_sentences, token_type_ids, attention_masks, output_labels = [], [], [], []

    for sentence, label in zip(sentences, labels):
        # sentence tokenizing
        tokenized_sentence = tokenizer.encode(sentence, max_length=max_seq_len, pad_to_max_length=True)
        # attention masking
        padding_count = tokenized_sentence.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count # 1 for token, 0 for padding
        # segment encoding
        token_type_id = [0] * max_seq_len # 0 for sentence 1, 1 for sentence 2. in this case, all 0

        # validation
        assert len(tokenized_sentence) == max_seq_len, "tokenized sentence length is not equal to max_seq_len"
        assert len(attention_mask) == max_seq_len, "attention mask length is not equal to max_seq_len"
        assert len(token_type_id) == max_seq_len, "token type id length is not equal to max_seq_len"

        tokenized_sentences.append(tokenized_sentence)
        token_type_ids.append(token_type_id)
        attention_masks.append(attention_mask)
        output_labels.append(label)

    # make numpy array
    tokenized_sentences = np.array(tokenized_sentences, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    output_labels = np.asarray(output_labels, dtype=np.int32)

    return (tokenized_sentences, token_type_ids, attention_masks), output_labels

In [11]:
max_seq_len = 128
tokenizer= transformers.BertTokenizer.from_pretrained('klue/bert-base')

X_train, y_train = tokenize(sentences=train_data['document'], labels=train_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)
X_test, y_test = tokenize(sentences=test_data['document'], labels=test_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
# check sample
sample_tokenized_sentences = X_train[0][0]
sample_token_type_ids = X_train[0][1]
sample_attention_masks = X_train[0][2]
sample_output_labels = y_train[0]

print(sample_tokenized_sentences)
print(sample_token_type_ids)
print(sample_attention_masks)
print(tokenizer.decode(sample_tokenized_sentences))
print(sample_output_labels)

[   2 1376  831 2604   18   18 4229 9801 2075 2203 2182 4243    3    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
[    2  1963    18    18    18 11811  2178  2088 28883 16516  2776    18
    18    18    18 10737  2156  2015  2446  2232  6758  2118  1380  6074
     3     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0   

# Many-To-One Modeling With BERT

In [13]:
class TFBertForSequenceClassification(keras.Model):
    def __init__(self, model_name):
        super(TFBertForSequenceClassification, self).__init__()
        self.bert = transformers.TFBertModel.from_pretrained(model_name, from_pt=True) # Load the model weights from a PyTorch state_dict save file (see docstring of `pretrained_model_name_or_path` argument).
        self.classifier = tf.keras.layers.Dense(
            units=1,
            activation='sigmoid',
            kernel_initializer=keras.initializers.TruncatedNormal(mean=0.02),
            name='classifier')

    def call(self, inputs):
        tokenized_sentences, token_type_ids, attention_masks = inputs
        outputs = self.bert(
            tokenized_sentences,
            token_type_ids=token_type_ids,
            attention_mask=attention_masks)
        cls_token = outputs[1] # [CLS]
        predictions = self.classifier(cls_token)

        return predictions


### Output of Bert Model
 - 1st element : last_hidden_state - sequence of hidden-states at the output of the last layer of the model. (batch_size, sequence_length, hidden_size) -> used for Many-To-Many modeling.
 - 2nd element: pooler_output - last layer hidden-state of the first token of the sequence (classification token == [CLS]) further processed by a Linear layer and a Tanh activation function. (batch_size, hidden_size) -> used for Many-To-One modeling.

### Truncated Normal Distribution
 -  truncated normal distribution is the probability distribution derived from that of a normally distributed random variable by bounding the random variable from either below or above (or both).
 - 정규 분포(Normal Distribution) 에서 최솟값( )보다 작거나 최댓값( )보다 큰 값을 제거한 확률 분포 형태

In [14]:
with strategy.scope():
  model = TFBertForSequenceClassification(model_name='klue/bert-base')
  optimizer = keras.optimizers.Adam(learning_rate=5e-5)
  loss = keras.losses.BinaryCrossentropy()
  model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'bert.embeddings.position_ids', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [15]:
model.fit(X_train, y_train, epochs=2, batch_size=64, validation_split=0.2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7aa3dbfa2410>

In [17]:
results = model.evaluate(X_test, y_test, batch_size=1024)
print("test loss, test acc: ", results)

test loss, test acc:  [0.25816503167152405, 0.8974510431289673]


# Prediction

In [25]:
def sentiment_predict(new_sentence):
  input_id = tokenizer.encode(new_sentence, max_length=max_seq_len,pad_to_max_length=True)
  padding_count = input_id.count(tokenizer.pad_token_id)
  attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
  token_type_id = [0] * max_seq_len

  input_ids = np.array([input_id])
  attention_masks = np.array([attention_mask])
  token_type_ids = np.array([token_type_id])
  encoded_input = [input_ids, token_type_ids, attention_masks]

  score = model.predict(encoded_input)
  print(score)
  score = score[0][0]
  print(score)

  if(score > 0.5):
    print("{:.2f}% 확 률 로 긍 정 리 뷰 입 니 다 .\n".format(score * 100))
  else:
    print("{:.2f}% 확 률 로 부 정 리 뷰 입 니 다 .\n".format((1 - score) * 100))

In [26]:
sentiment_predict("이 영 화 존 잼 입 니 다 대 박 ")



[[0.81394494]]
0.81394494
81.39% 확 률 로 긍 정 리 뷰 입 니 다 .



In [27]:
sentiment_predict('이 영화 핵노잼 ㅠㅠ')

[[0.00708029]]
0.0070802867
99.29% 확 률 로 부 정 리 뷰 입 니 다 .

