In [1]:
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torchnlp.datasets import imdb_dataset

import pandas as pd
import numpy as np
import random as rn
import time
import datetime

In [2]:
import os

n_devices = torch.cuda.device_count()
print(n_devices)

for i in range(n_devices):
    print(torch.cuda.get_device_name(i))

1
NVIDIA GeForce RTX 2070


In [3]:
# imdb데이터 사용, 순차적인 데이터를 셔플
train, test = imdb_dataset(train=True, test=True)
rn.shuffle(train)
rn.shuffle(test)

train = train[:2000]
test = test[:200]

train = pd.DataFrame(train)
test = pd.DataFrame(test)

# 라벨을 0과 1로 변경
change = {'neg' : 0, 'pos' : 1}
train = train.replace({'sentiment' : change})
test = test.replace({'sentiment' : change})

print(train.shape)
print(test.shape)

(2000, 2)
(200, 2)


In [4]:
# 문장의 시작은 CLS, 끝은 SEP를 추가하여 표시함
document_bert = ["[CLS] " + str(s) + " [SEP]" for s in train.text]
document_bert[:5]

["[CLS] I just want to say that this production is very one sided, breaks the impartiality needed if you want to be taken seriously. <br /><br />There are no credits of the persons they interviewed, so you cant have an idea if they are worthy of being heard.<br /><br />Tells the story from just one point of view. To do this is very dangerous, because the next generations learns the bad idea, and thats why wars keep coming. I know this is not the only reason about wars, but doesn't help either.<br /><br />you can watch this documentary, but read in the internet a lot, before. Balcans are complex as human history is. [SEP]",
 "[CLS] This was a very good film. I didn't go into it with very high expectations and was pleasantly surprised by the acting, the script, and the scenery. Miranda Richardson was fantastic and so was Joan Plowright. They stole the show. But the other actors played their parts wonderfully also. Very enjoyable film. [SEP]",
 "[CLS] It is nice to see Suraj Barjatya back

In [5]:
# 사전 학습되어 있는 Bert모델의 Tokenizer를 이용하여 문장을 토큰화 시킴
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(s) for s in document_bert]
print(tokenized_texts[0])

['[CLS]', 'I', 'just', 'want', 'to', 'say', 'that', 'this', 'production', 'is', 'very', 'one', 'side', '##d', ',', 'breaks', 'the', 'im', '##parti', '##ali', '##ty', 'needed', 'if', 'you', 'want', 'to', 'be', 'taken', 'seriously', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'There', 'are', 'no', 'credits', 'of', 'the', 'persons', 'they', 'interviewed', ',', 'so', 'you', 'can', '##t', 'have', 'an', 'idea', 'if', 'they', 'are', 'worth', '##y', 'of', 'being', 'heard', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'Tell', '##s', 'the', 'story', 'from', 'just', 'one', 'point', 'of', 'view', '.', 'To', 'do', 'this', 'is', 'very', 'dangerous', ',', 'because', 'the', 'next', 'generations', 'learns', 'the', 'bad', 'idea', ',', 'and', 'that', '##s', 'why', 'wars', 'keep', 'coming', '.', 'I', 'know', 'this', 'is', 'not', 'the', 'only', 'reason', 'about', 'wars', ',', 'but', 'doesn', "'", 't', 'help', 'either', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'you', 'can', 'watch', 'this', 'docu

In [6]:
# 전체 문장에서 최대 토큰 수인 문장의 토큰 수 보다 크게 MAX_LEN설정
# 토큰화된 데이터를 맵핑된 값으로 변경하고, MAX_LEN에 맞게 패딩
MAX_LEN = 512
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')
input_ids[0]

array([  101,   146, 12820, 21528, 10114, 23763, 10189, 10531, 12116,
       10124, 12558, 10464, 12250, 10162,   117, 68307, 10105, 10211,
       96503, 13133, 11195, 23794, 12277, 13028, 21528, 10114, 10347,
       15109, 75543,   119,   133, 33989,   120,   135,   133, 33989,
         120,   135, 11723, 10301, 10192, 48357, 10108, 10105, 34759,
       10689, 94501,   117, 10380, 13028, 10944, 10123, 10529, 10151,
       14932, 12277, 10689, 10301, 43509, 10157, 10108, 11223, 32240,
         119,   133, 33989,   120,   135,   133, 33989,   120,   135,
       29091, 10107, 10105, 13617, 10188, 12820, 10464, 12331, 10108,
       17904,   119, 11469, 10149, 10531, 10124, 12558, 57195,   117,
       12373, 10105, 13451, 71328, 91155, 10105, 15838, 14932,   117,
       10111, 10189, 10107, 31237, 68756, 23819, 23959,   119,   146,
       21852, 10531, 10124, 10472, 10105, 10893, 27949, 10978, 68756,
         117, 10473, 47798,   112,   188, 15217, 16106,   119,   133,
       33989,   120,

In [7]:
# 학습 속도를 높이기 위해 실제 데이터가 있는 곳을 1, 패딩된 곳을 0으로 표시한 mask
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [8]:
# 학습용 데이터를 학습용, 검증용 두가지로 분리
# 모델이 과적합되지 않게 함
train_inputs, validation_inputs, train_labels, validation_labels = \
train_test_split(input_ids, train['sentiment'].values, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                       random_state=42, 
                                                       test_size=0.1)

In [9]:
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

In [10]:
BATCH_SIZE = 4

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

In [11]:
# test data도 똑같이 전처리
sentences = test['text']
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
labels = test['sentiment'].values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)
    
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)


In [12]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2070


In [13]:
# BERT 모델 생성, 이진 분류 이므로 num_labels = 2
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.cuda()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [14]:
# 옵티마이저 설정
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

# 에폭수
epochs = 4

# 총 훈련 스텝
total_steps = len(train_dataloader) * epochs

# lr 조금씩 감소시키는 스케줄러
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
#워닝은 무시



In [15]:
# 정확도 계산 함수
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# 시간 표시 함수
def format_time(elapsed):
    # 반올림
    elapsed_rounded = int(round((elapsed)))
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [16]:
for step, batch in enumerate(train_dataloader):
    b_input_ids, b_input_mask, b_labels = batch
    print(b_input_ids, b_input_mask, b_labels)
    break

tensor([[  101, 13885, 75980,  ...,     0,     0,     0],
        [  101, 30109, 10111,  ..., 10105, 99843, 82439],
        [  101, 10377,   112,  ...,     0,     0,     0],
        [  101, 26820, 10454,  ...,     0,     0,     0]], dtype=torch.int32) tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]]) tensor([1, 0, 0, 0])


In [17]:
# 재현을 위해 랜덤시드 고정
seed_val = 42
rn.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 그래디언트 초기화
model.zero_grad()

# 에폭만큼 반복
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # Forward 수행                
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        
        # 그래디언트 계산 안함
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # 로스 구함
        logits = outputs[0]

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...

  Average training loss: 0.70
  Training epcoh took: 0:02:20

Running Validation...
  Accuracy: 0.47
  Validation took: 0:00:05

Training...

  Average training loss: 0.64
  Training epcoh took: 0:02:20

Running Validation...
  Accuracy: 0.75
  Validation took: 0:00:05

Training...

  Average training loss: 0.54
  Training epcoh took: 0:02:19

Running Validation...
  Accuracy: 0.77
  Validation took: 0:00:05

Training...

  Average training loss: 0.41
  Training epcoh took: 0:02:18

Running Validation...
  Accuracy: 0.81
  Validation took: 0:00:05

Training complete!


In [18]:
#시작 시간 설정
t0 = time.time()

# 평가모드로 변경
model.eval()

# 변수 초기화
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# 데이터로더에서 배치만큼 반복하여 가져옴
for step, batch in enumerate(test_dataloader):
    # 경과 정보 표시
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_input_ids, b_input_mask, b_labels = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # 출력 로짓과 라벨을 비교하여 정확도 계산
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("Test took: {:}".format(format_time(time.time() - t0)))


Accuracy: 0.84
Test took: 0:00:05
