diff --git a/ci/batch/submit-job.py b/ci/batch/submit-job.py
index ec99e44f47..5c48ab901c 100644
--- a/ci/batch/submit-job.py
+++ b/ci/batch/submit-job.py
@@ -92,7 +92,7 @@ def main():
     spin = ['-', '/', '|', '\\', '-', '/', '|', '\\']
     logGroupName = '/aws/batch/job'
 
-    jobName = re.sub('[^A-Za-z0-9_\-]', '', args.name)[:128] # Enforce AWS Batch jobName rules
+    jobName = re.sub(r'[^A-Za-z0-9_\-]', '', args.name)[:128] # Enforce AWS Batch jobName rules
     jobQueue = args.job_queue
     jobDefinition = args.job_definition
     command = args.command.split()
diff --git a/docs/examples/sentiment_analysis/bert-sentence-pair.png b/docs/examples/sentiment_analysis/bert-sentence-pair.png
new file mode 100644
index 0000000000..1dc37953f4
Binary files /dev/null and b/docs/examples/sentiment_analysis/bert-sentence-pair.png differ
diff --git a/docs/examples/sentiment_analysis/bert_fig2.png b/docs/examples/sentiment_analysis/bert_fig2.png
new file mode 100644
index 0000000000..916b937045
Binary files /dev/null and b/docs/examples/sentiment_analysis/bert_fig2.png differ
diff --git a/docs/examples/sentiment_analysis/index.rst b/docs/examples/sentiment_analysis/index.rst
index 0e97ace35d..3a4225037d 100644
--- a/docs/examples/sentiment_analysis/index.rst
+++ b/docs/examples/sentiment_analysis/index.rst
@@ -4,7 +4,7 @@ Sentiment Analysis
 .. container:: cards
 
    .. card::
-      :title: Fine-tuning LSTM-based Language Model
+      :title: Sentiment Analysis by Fine-tuning Word Language Model
       :link: sentiment_analysis.html
 
       See how to fine-tune a pre-trained language model to perform sentiment analysis on movie reviews.
@@ -15,8 +15,12 @@ Sentiment Analysis
       See how to use GluonNLP to build more advanced model structure for extracting sentence embeddings to predict Yelp review rating.
 
+
+   .. card::
+      :title: Sentiment Analysis with KoBERT (Korean BERT) using Movie Review Data
+      :link: kobert_naver_movie.html
-
+      See how to use KoBERT to implement sentiment analysis on a popular Korean movie review dataset.
 
 .. toctree::
    :hidden:
@@ -24,4 +28,6 @@ Sentiment Analysis
    sentiment_analysis.ipynb
    self_attentive_sentence_embedding.ipynb
+   kobert_naver_movie.ipynb
+
diff --git a/docs/examples/sentiment_analysis/kobert_naver_movie.md b/docs/examples/sentiment_analysis/kobert_naver_movie.md
new file mode 100644
index 0000000000..d7067c96b6
--- /dev/null
+++ b/docs/examples/sentiment_analysis/kobert_naver_movie.md
@@ -0,0 +1,358 @@

# Sentiment Analysis with KoBERT using Movie Review Data

In this tutorial, we implement sentiment analysis on the [Naver sentiment movie corpus](https://github.com/e9t/nsmc) with a pre-trained Korean BERT model (**KoBERT**) provided by GluonNLP's Model Zoo.

## Setup

If any of the packages listed below is not installed yet, remove the comment character (`#`) from the corresponding line and run the cell to install it.

```{.python .input}
#!pip install tqdm
#!pip install mxnet-cu101
#!pip install gluonnlp
#!pip install sentencepiece
```

```{.python .input}
import numpy as np
from mxnet.gluon import nn, rnn
from mxnet import gluon, autograd
import gluonnlp as nlp
from mxnet import nd
import mxnet as mx
import time
import itertools
import random
```

## Load KoBERT Model

Load the pre-trained KoBERT model from GluonNLP's Model Zoo.
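
The cells that follow assume a GPU is available (note the CUDA build `mxnet-cu101` in the setup step). As a small, hedged sketch that is not part of the original tutorial, you could instead pick the context automatically and fall back to the CPU when no GPU is visible; if you use it, skip the `ctx = mx.gpu()` cell below.

```{.python .input}
# Optional alternative, not part of the original tutorial: pick the context
# automatically. mx.context.num_gpus() reports the number of visible GPUs;
# fall back to the CPU when none is found.
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()
```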
+ +```{.python .input} +ctx = mx.gpu() +``` + +```{.python .input} +bert_base, vocab = nlp.model.bert.get_bert_model('bert_12_768_12', 'kobert_news_wiki_ko_cased', pretrained=True, use_pooler=False, use_decoder=False, use_classifier=False, ctx=ctx) +``` + +```{.python .input} +tokenizer = nlp.data.get_tokenizer('bert_12_768_12', 'kobert_news_wiki_ko_cased') +tok = nlp.data.BERTSPTokenizer(tokenizer._path, vocab, lower=False) +``` + +## Define Classifier +## Classifier 정의 + +Now define the `BERTClassifier` class to classify the positive or negative +reviews. +The last pooling layer (`pooler`) of the BERT model returns the embedding vector +of the `[CLS]` token. +You can simply configure `BERTClassifier` by linking this output to a fully +connected layer (`nn.Dense`) and softmax layer. +The Softmax output is the probability value of the positive/negative label, and +you can compare the predicted label with the ground truth label to calculate the +cross-entropy and update the model in a direction that minimizes the loss. + +이제 영화 리뷰의 긍정/부정을 분류할 `BERTClassifier` 클래스를 정의합니다. +BERT 모델 마지막의 pooling layer (`pooler`)는 `[CLS]` 토큰의 임베딩 벡터를 리턴하는데, +이 출력을 fully connected layer (`nn.Dense`)와 softmax layer로 연결해 간단하게 +`BERTClassifier`를 구성할 수 있습니다. +Softmax 출력은 긍정/부정 레이블의 확률 값이 되며, 예측한 레이블을 ground truth 레이블과 비교해 크로스 엔트로피를 계산하고 +손실을 최소화하는 방향으로 모델을 업데이트합니다. + +![bert-sentence-pair](./bert-sentence-pair.png) + +```{.python .input} +class BERTClassifier(nn.Block): + def __init__(self, + bert, + num_classes=2, + dropout=None, + prefix=None, + params=None): + super(BERTClassifier, self).__init__(prefix=prefix, params=params) + self.bert = bert + with self.name_scope(): + self.classifier = nn.HybridSequential(prefix=prefix) + if dropout: + self.classifier.add(nn.Dropout(rate=dropout)) + self.classifier.add(nn.Dense(units=num_classes)) + + def forward(self, inputs, token_types, valid_length=None): + pooler = self.bert(inputs, token_types, valid_length) + return self.classifier(pooler[:,0,:]) +``` + +```{.python .input} +model = BERTClassifier(bert_base, num_classes=2, dropout=0.1) +# initialize only the classifier +model.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx) +model.hybridize() + +# softmax cross entropy loss for classification +loss_function = gluon.loss.SoftmaxCELoss() + +metric = mx.metric.Accuracy() +``` + +## Load the Movie Review Dataset +## 네이버 영화 리뷰 데이터 로드 + +Download Lucy Park's [Naver Sentimental Movie +Corpus](https://github.com/e9t/nsmc) dataset from Github. + +박은정님의 [Naver sentiment movie corpus](https://github.com/e9t/nsmc)를 Github에서 +다운로드합니다. + +```{.python .input} +!git clone https://github.com/e9t/nsmc.git +``` + +```{.python .input} +dataset_train = nlp.data.TSVDataset("nsmc/ratings_train.txt", field_indices=[1,2], num_discard_samples=1) +dataset_test = nlp.data.TSVDataset("nsmc/ratings_test.txt", field_indices=[1,2], num_discard_samples=1) +``` + +```{.python .input} +print(dataset_train[0:5]) +print(dataset_test[0:5]) +``` + +Next, define the class `BERTDataset` to load this data. +GluonNLP's [BERTSentenceTransform](https://gluon- +nlp.mxnet.io/api/modules/data.html#gluonnlp.data.BERTSentenceTransform) function +makes it easy to do the following pre-processing processes that BERT needs: + +1. Tokenize the input sentence, and insert '[CLS]' and '[SEP]' where necessary. +2. Because one or two sentences can be entered, create a type id that separates +them. +3. Calculate the effective length. + +다음으로, 이 데이터를 로드하는 `BERTDataset` 클래스를 정의합니다. 

![BERT input representation](./bert_fig2.png)

```{.python .input}
class BERTDataset(mx.gluon.data.Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        # Apply the BERT preprocessing (tokenization, [CLS]/[SEP] insertion,
        # type ids, valid length) to every sentence up front.
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        sent_dataset = gluon.data.SimpleDataset([[
            i[sent_idx],
        ] for i in dataset])
        self.sentences = sent_dataset.transform(transform)
        self.labels = gluon.data.SimpleDataset(
            [np.array(np.int32(i[label_idx])) for i in dataset])

    def __getitem__(self, i):
        # Returns (token_ids, valid_length, segment_ids, label)
        return self.sentences[i] + (self.labels[i], )

    def __len__(self):
        return len(self.labels)

```

```{.python .input}
max_len = 128
```

```{.python .input}
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)
```

## Fine-tune the Model

Now that everything is ready, you can start fine-tuning.
We use **BERTAdam** as the optimizer. BERTAdam is similar to Adam but decouples weight decay from the gradient update.
For more information, please refer to GluonNLP's [BERTAdam](https://gluon-nlp.mxnet.io/api/modules/optimizer.html#gluonnlp.optimizer.BERTAdam) documentation.

```{.python .input}
batch_size = 32
lr = 5e-5

train_dataloader = mx.gluon.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = mx.gluon.data.DataLoader(data_test, batch_size=int(batch_size/2), num_workers=5)
```

```{.python .input}
trainer = gluon.Trainer(model.collect_params(), 'bertadam',
                        {'learning_rate': lr, 'epsilon': 1e-9, 'wd': 0.01})

log_interval = 50
num_epochs = 1
```

Weight decay is not applied to the LayerNorm and bias parameters: it has little regularization effect on them and may even cause underfitting.
For more information, please refer to [1].

```{.python .input}
# Exclude LayerNorm (beta/gamma) and bias parameters from weight decay.
for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
    v.wd_mult = 0.0
params = [
    p for p in model.collect_params().values() if p.grad_req != 'null'
]

```

```{.python .input}
def evaluate_accuracy(model, data_iter, ctx=ctx):
    acc = mx.metric.Accuracy()
    for i, (t, v, s, label) in enumerate(data_iter):
        token_ids = t.as_in_context(ctx)
        valid_length = v.as_in_context(ctx)
        segment_ids = s.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = model(token_ids, segment_ids, valid_length.astype('float32'))
        acc.update(preds=output, labels=label)
        # cap the number of evaluation batches to keep the loop fast
        if i > 1000:
            break
    return acc.get()[1]
```

At the beginning of training, the randomly initialized weights can make it difficult for the model to converge.
To address this, the **learning rate warmup** method starts the learning rate at a very small value, increases it quickly, and then slowly decays it again [2].
The training loop below applies this schedule, and the short sketch that follows shows the same rule as a standalone function.
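
As a hedged, standalone sketch that is not part of the original tutorial, the schedule can be written as a small function; the step counts passed to it below are made-up values purely for illustration, while the real ones are computed in the next cell.

```{.python .input}
# Warmup-then-linear-decay schedule, identical to the rule applied in the training loop.
def warmup_linear_decay(step_num, base_lr, num_warmup_steps, num_train_steps):
    if step_num < num_warmup_steps:
        # linear warmup: 0 -> base_lr over num_warmup_steps steps
        return base_lr * step_num / num_warmup_steps
    # linear decay: base_lr -> 0 over the remaining steps
    offset = (step_num - num_warmup_steps) / (num_train_steps - num_warmup_steps)
    return base_lr - offset * base_lr

# Example with made-up step counts (100 warmup steps, 1000 total steps).
for step in [0, 50, 100, 500, 1000]:
    print(step, warmup_linear_decay(step, base_lr=5e-5, num_warmup_steps=100, num_train_steps=1000))
```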

```{.python .input}
accumulate = 4
step_size = batch_size * accumulate if accumulate else batch_size
num_train_examples = len(data_train)
num_train_steps = int(num_train_examples / step_size * num_epochs)
warmup_ratio = 0.1
num_warmup_steps = int(num_train_steps * warmup_ratio)
step_num = 0
all_model_params = model.collect_params()
```

```{.python .input}
# Set grad_req to 'add' if gradient accumulation is required
if accumulate and accumulate > 1:
    for p in params:
        p.grad_req = 'add'
```

```{.python .input}
for epoch_id in range(num_epochs):
    metric.reset()
    step_loss = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
        # learning rate warmup followed by linear decay
        if step_num < num_warmup_steps:
            new_lr = lr * step_num / num_warmup_steps
        else:
            non_warmup_steps = step_num - num_warmup_steps
            offset = non_warmup_steps / (num_train_steps - num_warmup_steps)
            new_lr = lr - offset * lr
        trainer.set_learning_rate(new_lr)
        with mx.autograd.record():
            # load data to GPU
            token_ids = token_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx)
            segment_ids = segment_ids.as_in_context(ctx)
            label = label.as_in_context(ctx)

            # forward computation
            out = model(token_ids, segment_ids, valid_length.astype('float32'))
            ls = loss_function(out, label).mean()

        # backward computation
        ls.backward()
        if not accumulate or (batch_id + 1) % accumulate == 0:
            trainer.allreduce_grads()
            nlp.utils.clip_grad_global_norm(params, 1)
            trainer.update(accumulate if accumulate else 1)
            step_num += 1
            if accumulate and accumulate > 1:
                # set grad to zero for gradient accumulation
                all_model_params.zero_grad()

        step_loss += ls.asscalar()
        metric.update([label], [out])
        if (batch_id + 1) % log_interval == 0:
            print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.10f}, acc={:.3f}'
                  .format(epoch_id + 1, batch_id + 1, len(train_dataloader),
                          step_loss / log_interval,
                          trainer.learning_rate, metric.get()[1]))
            step_loss = 0
    test_acc = evaluate_accuracy(model, test_dataloader, ctx)
    print('Test Acc : {}'.format(test_acc))
```

```{.python .input}
# clean up the downloaded data
import shutil
shutil.rmtree('nsmc', ignore_errors=True)
```
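
Before wrapping up, here is a hedged sketch, not part of the original tutorial, of how the fine-tuned classifier could be applied to a new review. The helper function and the sample sentence are made up for illustration, and label `1` is assumed to mean a positive review, following the NSMC labeling convention.

```{.python .input}
# Hypothetical helper: classify one raw review with the fine-tuned model.
# Assumes label 1 = positive, 0 = negative (NSMC convention).
infer_transform = nlp.data.BERTSentenceTransform(
    tok, max_seq_length=max_len, pad=True, pair=False)

def predict_sentiment(text):
    token_ids, valid_length, segment_ids = infer_transform([text])
    token_ids = mx.nd.array(token_ids, ctx=ctx, dtype='int32').expand_dims(axis=0)
    segment_ids = mx.nd.array(segment_ids, ctx=ctx, dtype='int32').expand_dims(axis=0)
    valid_length = mx.nd.array([int(valid_length)], ctx=ctx).astype('float32')
    out = model(token_ids, segment_ids, valid_length)
    pred = int(mx.nd.argmax(out, axis=1).asscalar())
    return 'positive' if pred == 1 else 'negative'

print(predict_sentiment('정말 재미있게 봤습니다.'))  # an arbitrary example review
```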

So far, we have implemented a sentiment classifier for the Naver movie review dataset using a pre-trained **KoBERT** model from the GluonNLP Model Zoo.
Sentiment analysis models on this dataset are generally reported to reach an accuracy of **83 to 85%**; our **GluonNLP/KoBERT** implementation reached about **89%** accuracy after just one epoch.

## References

[1] Jia et al., "[Highly Scalable Deep Learning Training System with Mixed-Precision: Training ImageNet in Four Minutes](https://arxiv.org/abs/1807.11205)." arXiv preprint arXiv:1807.11205 (2018).

[2] Goyal et al., "[Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677)." arXiv preprint arXiv:1706.02677 (2017).

diff --git a/env/cpu/py3.yml b/env/cpu/py3.yml
index 55d93d2e54..c811041cd3 100644
--- a/env/cpu/py3.yml
+++ b/env/cpu/py3.yml
@@ -33,7 +33,7 @@ dependencies:
   - flaky==3.6.1
   - flake8==3.7.9
   - mock<3
-  - mxnet==1.6.0
+  - mxnet<2
   - scipy==1.3.2
   - regex==2020.4.4
   - nltk==3.4.5