<a href="https://colab.research.google.com/github/godpeny/laboratory/blob/master/Study/NLP_Using_Deep_Learning/Bidirectional_Encoder_Representation_From_Transformer/naver_movie_reivew_classification_with_bert_ko_no_tpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Default Setting in Google Colab (No TPU)

In [1]:
import numpy as np
import pandas as pd
import urllib.request

import tensorflow as tf
import keras

import transformers

In [2]:
# Fetch the NSMC ratings files (train and test splits) into the working directory.
for source_url, local_name in [
    ("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", "ratings_train.txt"),
    ("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", "ratings_test.txt"),
]:
    urllib.request.urlretrieve(source_url, filename=local_name)

# Parse both downloaded files into DataFrames (read_table's default separator is a tab).
train_data, test_data = (
    pd.read_table(path) for path in ('ratings_train.txt', 'ratings_test.txt')
)

# Peek at the last ten rows of each split.
for frame in (train_data, test_data):
    print(frame.tail(10))

              id                                           document  label
149990   6373651                                       이걸 영화라고 찎었냐?      0
149991   9492905  http://blog.naver.com/oroblast/220215679580 나쁜...      1
149992   9335962  공포나 재난영화가 아니라 아예 대놓고 비급 크리쳐개그물임ㅋㅋ 음악 완전 흥겹다ㅋ 5...      0
149993  10020916                 For Carl.칼 세이건으로 시작해서 칼 세이건으로 끝난다.      1
149994   9458520               디케이드 다음에 더블 다음에 오즈인데 더블은 조금밖에 안나오네요.      1
149995   6222902                                인간이 문제지.. 소는 뭔죄인가..      0
149996   8549745                                      평점이 너무 낮아서...      1
149997   9311800                    이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?      0
149998   2376369                        청춘 영화의 최고봉.방황과 우울했던 날들의 자화상      1
149999   9619869                           한국 영화 최초로 수간하는 내용이 담긴 영화      0
            id                                        document  label
49990  9757200                           제발 국뽕김치영화좀 그만 만들어라...      0
49991  9653062                     

# Data Preprocessing

In [3]:
# Row counts before any cleaning.
print(len(train_data), len(test_data))

# Missing values: report whether any exist, drop the affected rows,
# then report again to confirm none remain.
print(train_data.isnull().values.any(), test_data.isnull().values.any())
train_data = train_data.dropna()
test_data = test_data.dropna()
print(train_data.isnull().values.any(), test_data.isnull().values.any())

# Duplicates: show per-column unique counts, keep only the first row for each
# distinct review text, then show the counts again.
print(train_data.nunique(), test_data.nunique())
train_data = train_data.drop_duplicates(subset=['document'])
test_data = test_data.drop_duplicates(subset=['document'])
print(train_data.nunique(), test_data.nunique())

# Row counts after cleaning.
print(len(train_data), len(test_data))

150000 50000
True True
False False
id          149995
document    146182
label            2
dtype: int64 id          49997
document    49157
label           2
dtype: int64
id          146182
document    146182
label            2
dtype: int64 id          49157
document    49157
label           2
dtype: int64
146182 49157


# Tokenizing

In [4]:
from transformers import BertTokenizerFast

In [5]:
# Load the fast BERT tokenizer published with the klue/bert-base checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('klue/bert-base')

# Labels as plain Python lists, aligned row-for-row with the review texts below.
y_train = train_data['label'].tolist()
y_test = test_data['label'].tolist()

# Raw review texts.
X_train_list = train_data['document'].tolist()
X_test_list = test_data['document'].tolist()

# padding=True pads every sequence to the longest one in the same call (so the
# train and test encodings may end up with different lengths); truncation=True
# cuts inputs that exceed the maximum length the model accepts.
encode_options = dict(padding=True, truncation=True)
X_train = tokenizer(X_train_list, **encode_options)
X_test = tokenizer(X_test_list, **encode_options)

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

In [6]:
# Inspect the first encoded training example: its tokens, token ids,
# segment (type) ids and attention mask, one per line.
first_encoding = X_train[0]
for field in (
    first_encoding.tokens,
    first_encoding.ids,
    first_encoding.type_ids,
    first_encoding.attention_mask,
):
    print(field)

['[CLS]', '아', '더', '##빙', '.', '.', '진짜', '짜증', '##나', '##네', '##요', '목소리', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD

# Modeling

In [7]:
import tensorflow as tf
from keras.callbacks import EarlyStopping

from transformers import TFBertForSequenceClassification

In [8]:
# Pair each encoded example (as a plain dict of feature lists) with its label.
# NOTE: this rebinds train_data / test_data from the pandas DataFrames to
# tf.data datasets, so the tokenizing cell (which indexes them by column name)
# cannot be re-run afterwards without reloading the data first.
train_features, test_features = dict(X_train), dict(X_test)
train_data = tf.data.Dataset.from_tensor_slices((train_features, y_train))
test_data = tf.data.Dataset.from_tensor_slices((test_features, y_test))

In [9]:
# Two-class sequence classifier on top of klue/bert-base; from_pt=True loads
# the weights from the PyTorch checkpoint.
model = TFBertForSequenceClassification.from_pretrained(
    'klue/bert-base',
    num_labels=2,
    from_pt=True,
)

# Adam with a learning rate of 5e-5, the model's own loss function, and
# accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    loss=model.hf_compute_loss,
    metrics=['accuracy'],
)
model.summary()

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  110617344 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 110618882 (421.98 MB)
Trainable params: 110618882 (421.98 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# Mini-batch size shared by the training and validation pipelines.
BATCH_SIZE = 32

# Stop when val_loss has not improved for `patience` consecutive epochs.
# NOTE(review): with epochs=2 a patience of 3 can never trigger; the callback
# only takes effect if the epoch count is raised (or patience lowered).
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

# The datasets are batched right here, so `batch_size` is NOT passed to fit():
# Keras requires it to be left unset for tf.data inputs, which already yield
# batches. Validation metrics do not depend on example order, so the
# validation set is batched without shuffling.
# NOTE(review): the test split doubles as the validation set here.
history = model.fit(
    train_data.shuffle(1000).batch(BATCH_SIZE),
    epochs=2,
    validation_data=test_data.batch(BATCH_SIZE),
    callbacks=[early_stopping],
)

Epoch 1/2
 151/4569 [..............................] - ETA: 1:03:44 - loss: 0.3931 - accuracy: 0.8260

KeyboardInterrupt: ignored

# Testing

In [12]:
# Score the model on the test split in batches of 1024; with the compiled
# metrics the result is [loss, accuracy].
eval_batches = test_data.batch(1024)
model.evaluate(eval_batches)



[0.3414138853549957, 0.8554427623748779]

In [14]:
from transformers import TextClassificationPipeline

In [15]:
# Wrap the tokenizer and the model in a text-classification pipeline;
# return_all_scores=True reports a score for every label, not just the top one.
text_classifier = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    framework='tf',
    return_all_scores=True,
)



In [16]:
# Classify a single sample review; the pipeline returns one entry per input,
# so take the first (and only) one to display its per-label scores.
sample_review = ' 뭐야 이 평점들은.... 나쁘진 않지만 10점짜리는 더더욱 아니잖아'
text_classifier(sample_review)[0]

[{'label': 'LABEL_0', 'score': 0.9528833031654358},
 {'label': 'LABEL_1', 'score': 0.04711668938398361}]