<a href="https://colab.research.google.com/github/gabie0208/chinese-nlp/blob/main/meaning_classification_qilai_macbert_large.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
import torch
import numpy as np
import transformers
from tqdm import tqdm

from transformers import BertTokenizer, BertModel, AdamW
from transformers import BertForMaskedLM,BertForSequenceClassification

import torch.nn.functional as F
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import random
import time
import datetime

In [3]:
# Choose the compute device once; later cells move tensors onto `device`.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [4]:
def read_file(filename):
    """Read a text file and return one single-element list per line.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per line of the file; each entry is a list
        holding that line with surrounding whitespace stripped, e.g.
        ``[["sentence\\tlabel"], ...]``. Blank lines yield ``[""]``.
    """
    # The corpus is Chinese text, so the encoding is pinned instead of being
    # left to the platform default.
    with open(filename, "r", encoding="utf-8") as file:
        # A stripped line contains no newline, so wrapping it in a list gives
        # the same result as the previous `.split("\n")`.
        return [[line.strip()] for line in file]

# Load the raw test/train files as lists of single-element line lists.
# In the visible cells only buyu_qilai_train is used afterwards (for a preview).
# NOTE(review): "buyu_gilai_test" looks like a typo for "buyu_qilai_test"
# (compare buyu_qilai_train) — confirm before renaming.
buyu_gilai_test = read_file("/content/qilai-test data.txt")
buyu_qilai_train = read_file("/content/qilai-train data.txt")

In [5]:
# Preview the first ten raw lines; sentence and label are still joined by a tab.
buyu_qilai_train[:10]

[['啊，我想起来了，去年李成日也托运过。\t2'],
 ['路上要用的东西放在手提包里，这样用起来方便。\t4'],
 ['那件事情说起来容易，做起来很难。\t4'],
 ['看起来不过二十出头。祝你生日快乐！\t4'],
 ['看起来，要下雨了。快走吧！\t4'],
 ['因为送钟跟送终同音，听起来不吉利。\t4'],
 ['看起来，你很像你爸爸。\t4'],
 ['这些菜看起来好像很好吃。\t4'],
 ['我也被举了起来。\t1'],
 ['主人和客人都哈哈大笑起来。\t3']]

In [6]:
def load_data(data_dir, *filenames):
    """Collect the sentence column from one or more tab-separated files.

    Args:
        data_dir: Directory prefix. It is concatenated directly with each
            file name, so it must end with a path separator.
        *filenames: Names of the files to read, in order.

    Returns:
        A tuple ``(sentences, count)`` where ``sentences`` holds the text
        before the first tab of every line and ``count`` is its length.
    """
    sentences = []

    for filename in filenames:
        # Pin the encoding: the files contain Chinese text and must not depend
        # on the platform's default codec.
        with open(data_dir + filename, 'r', encoding='utf-8') as f:
            # Iterate the file lazily instead of materialising readlines().
            for line in f:
                sentences.append(line.strip().split('\t')[0])
    return sentences, len(sentences)

In [7]:
def save_num(data_dir, *filenames):
    """Collect the label column from one or more tab-separated files.

    Args:
        data_dir: Directory prefix. It is concatenated directly with each
            file name, so it must end with a path separator.
        *filenames: Names of the files to read, in order.

    Returns:
        A list with the second tab-separated field of every line, as strings.

    Raises:
        IndexError: If a line has no tab-separated second field.
    """
    num = []
    for filename in filenames:
        # Pin the encoding so reading does not depend on the platform default.
        with open(data_dir + filename, 'r', encoding='utf-8') as f:
            for line in f:
                num.append(line.strip().split('\t')[1])
    return num

In [8]:
data_dir = '/content/'

qilai, total_qilai = load_data(data_dir, "qilai-train data.txt")
qilai_num = save_num(data_dir, "qilai-train data.txt")
print("起来 :", total_qilai)
print(len(qilai_num))

起来 : 5100
5100


In [9]:
# Maximum number of characters kept per sentence by preprocess().
MAX_LEN = 50

def preprocess(data, label):
    """Keep the sentences that contain `label` and truncate them to MAX_LEN.

    Sentences that do not contain `label` are printed and skipped, and the
    number of skipped sentences is reported at the end.

    Args:
        data: Iterable of sentence strings.
        label: Substring (the directional complement) every kept sentence
            must contain.

    Returns:
        A tuple ``(sentences, labels)``: the kept sentences cut to at most
        MAX_LEN characters, and a list repeating `label` once per kept
        sentence.

    Note:
        Skipped sentences shorten the result, so any parallel list built
        separately from the same data no longer lines up index-by-index
        when something is skipped.
    """
    sentences = []
    labels = []
    error_cnt = 0
    for sent in data:
        # Skip (and report) sentences that lack the directional complement.
        if label not in sent:
            print(f"Sententce : {sent}")
            error_cnt += 1
            continue
        # Truncation happens after the containment check, so a complement
        # located past MAX_LEN characters is cut off from the kept text.
        sentences.append(sent[:MAX_LEN])
        labels.append(label)
    print(f"{label} 방향보어 없는 문장 개수 : {error_cnt}")
    return sentences, labels

In [10]:
qilai_sent, qilai_label = preprocess(qilai, '起来')

# preprocess() drops sentences that lack the complement, while qilai_num was
# read from the file separately; fail fast if the two lists no longer line up.
assert len(qilai_sent) == len(qilai_num), (
    f"{len(qilai_sent)} sentences vs {len(qilai_num)} labels after preprocessing"
)

起来 방향보어 없는 문장 개수 : 0


In [11]:
# Preview the first five preprocessed sentences.
qilai_sent[:5]

['啊，我想起来了，去年李成日也托运过。',
 '路上要用的东西放在手提包里，这样用起来方便。',
 '那件事情说起来容易，做起来很难。',
 '看起来不过二十出头。祝你生日快乐！',
 '看起来，要下雨了。快走吧！']

In [12]:
# Preview the matching string labels read from the file's second column.
qilai_num[:5]

['2', '4', '4', '4', '4']

In [13]:
# Training sentences and their string labels (aliases of the preprocessed lists).
train_sent, train_num = qilai_sent, qilai_num

In [14]:
# Pretrained checkpoint name used for both the tokenizer and the classifier.
MODEL_TYPE = 'hfl/chinese-macbert-large'
# NOTE(review): MAX_SIZE and BATCH_SIZE are not referenced by any later visible
# cell (the DataLoader cell defines its own batch_size = 32) — confirm, then
# remove them or wire them up.
MAX_SIZE = 100
BATCH_SIZE = 100

tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

In [15]:
# Split every training sentence into tokens, then map the tokens to vocabulary ids.
# NOTE(review): this path adds no [CLS]/[SEP] tokens (the ids printed in the next
# cell start directly with the first character and end in padding).
# convert_input_data() builds inference inputs the same way, so the two are
# consistent — change both together if special tokens are wanted.
tokenized_sent = list(map(tokenizer.tokenize, train_sent))
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_sent))

In [16]:
# Token-level padding length. It is deliberately NOT stored in MAX_LEN: that
# global is the 50-character limit read by preprocess(), and rebinding it here
# would silently change preprocess() whenever that cell is re-run.
# pad_sequences is already imported in the notebook's import cell.
MAX_TOKEN_LEN = 100

# Right-pad with 0 (and right-truncate) every id sequence to MAX_TOKEN_LEN.
input_ids = pad_sequences(input_ids, maxlen=MAX_TOKEN_LEN, dtype="long", truncating="post", padding="post")
input_ids[0]

array([1557, 8024, 2769, 2682, 6629, 3341,  749, 8024, 1343, 2399, 3330,
       2768, 3189,  738, 2805, 6817, 6814,  511,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0])

In [17]:
# Class names in index order; a label's position in this list is its class id.
num_list = ["1", "2", "3", "4"]
# Convert each string label to its zero-based class index.
nums = list(map(num_list.index, train_num))

In [18]:
# Attention mask per sequence: 1.0 where the token id is positive, 0.0 on padding (id 0).
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

In [19]:
# Split ids, labels and attention masks into train/validation sets in ONE call,
# so all three are guaranteed to share the same shuffle (previously this relied
# on two separate calls using identical random_state/test_size).
(train_inputs, validation_inputs,
 train_nums, validation_nums,
 train_masks, validation_masks) = train_test_split(
    input_ids, nums, attention_masks, random_state=1, test_size=0.1)

# Convert everything to PyTorch tensors.
train_inputs = torch.tensor(train_inputs)
train_nums = torch.tensor(train_nums)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_nums = torch.tensor(validation_nums)
validation_masks = torch.tensor(validation_masks)

In [20]:
batch_size = 32

# Bundle ids, masks and labels so each batch yields the (ids, mask, label)
# triple that the training loop unpacks.
train_data = TensorDataset(train_inputs, train_masks, train_nums)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_nums)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(dataset=validation_data, batch_size=batch_size, sampler=validation_sampler)

In [21]:
# Load the pretrained encoder with a 4-way classification head.
model = BertForSequenceClassification.from_pretrained(MODEL_TYPE, num_labels=4)
# Move the model to the device selected earlier. Unlike model.cuda(), this also
# works on the CPU fallback; it returns the module, so the cell still displays it.
model.to(device)

Some weights of the model checkpoint at hfl/chinese-macbert-large were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [22]:
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW, BertConfig
# NOTE(review): recent transformers releases deprecate their own AdamW in favour
# of torch.optim.AdamW — confirm against the installed version before upgrading.

# Number of passes over the training set.
epochs = 5

# Optimizer: lr is the learning rate, eps guards against division by zero.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Total optimizer steps = batches per epoch * epochs.
total_steps = len(train_dataloader) * epochs

# Scheduler that lowers the learning rate linearly over training, with no warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [23]:
# Accuracy helper
def flat_accuracy(preds, nums):
    """Return the fraction of rows in `preds` whose argmax equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        nums: Array of integer class labels; it is flattened before comparing.

    Returns:
        Number of correct predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = nums.flatten()
    correct = (predicted == expected).sum()
    return correct / len(expected)

In [24]:
# Time formatting helper
def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds (int or float); rounded to whole seconds.

    Returns:
        ``str(datetime.timedelta(...))`` for the rounded duration.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [25]:
# Fix the seed of every random number generator for reproducibility.
# NOTE(review): the model was created in an earlier cell, before these seeds are
# set — confirm whether seeding should happen ahead of model creation.
seed_val = 42
for set_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    set_seed(seed_val)

# Clear any gradients on the model before training starts.
model.zero_grad()

In [26]:
# Run `epochs` passes over the training set, each followed by a validation pass.
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Start the epoch timer.
    t0 = time.time()

    # Running sum of the batch losses for this epoch.
    total_loss = 0

    # Switch the model to training mode.
    model.train()

    # Iterate over the training batches.
    for step, batch in enumerate(train_dataloader):
        # Report progress every 500 batches (but not at step 0).
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Move the batch to the selected device.
        batch = tuple(t.to(device) for t in batch)

        # Unpack ids, attention mask and labels.
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass; labels are passed so the first output is used as the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs[0]

        # Accumulate the loss for the epoch average.
        total_loss += loss.item()

        # Backward pass to compute gradients.
        loss.backward()

        # Clip the gradient norm to 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the weights, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

        # Reset gradients for the next batch.
        model.zero_grad()

    # Mean loss over the batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    # Restart the timer for the validation pass.
    t0 = time.time()

    # Switch the model to evaluation mode.
    model.eval()

    # eval_accuracy accumulates the number of correct predictions (batch
    # accuracy * batch size) and nb_eval_examples the number of examples, so
    # the final ratio is the accuracy over all examples rather than an
    # unweighted mean of per-batch accuracies (biased when the last batch is
    # smaller than the others).
    eval_accuracy, nb_eval_examples = 0, 0

    # Iterate over the validation batches.
    for batch in validation_dataloader:
        # Move the batch to the selected device.
        batch = tuple(t.to(device) for t in batch)

        # Unpack ids, attention mask and labels.
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation.
        with torch.no_grad():
            # Forward pass without labels; the first output is used as the logits.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to the CPU as numpy arrays.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight this batch's accuracy by its size.
        batch_examples = len(label_ids)
        eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

    print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...

  Average training loss: 0.55
  Training epcoh took: 0:02:21

Running Validation...
  Accuracy: 0.88
  Validation took: 0:00:05

Training...

  Average training loss: 0.23
  Training epcoh took: 0:02:21

Running Validation...
  Accuracy: 0.90
  Validation took: 0:00:05

Training...

  Average training loss: 0.12
  Training epcoh took: 0:02:21

Running Validation...
  Accuracy: 0.90
  Validation took: 0:00:05

Training...

  Average training loss: 0.06
  Training epcoh took: 0:02:21

Running Validation...
  Accuracy: 0.90
  Validation took: 0:00:05

Training...

  Average training loss: 0.03
  Training epcoh took: 0:02:21

Running Validation...
  Accuracy: 0.91
  Validation took: 0:00:05

Training complete!


In [27]:
def convert_input_data(sentences):
    """Turn raw sentences into padded token-id and attention-mask tensors.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A tuple ``(inputs, masks)`` of tensors: the token ids of each sentence
        padded/truncated on the right to 128 positions, and a mask holding
        1.0 where the id is positive and 0.0 on padding.
    """
    # Fixed sequence length used for inference inputs.
    MAX_LEN = 128

    # Tokenize each sentence with the BERT tokenizer and map tokens to ids.
    input_ids = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))
        for sent in sentences
    ]

    # Cut to MAX_LEN and fill the remainder with padding id 0.
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # 1.0 for real tokens, 0.0 for padding.
    attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

    # Convert both to PyTorch tensors.
    return torch.tensor(input_ids), torch.tensor(attention_masks)

In [28]:
def test_sentences(sentences):
    """Run the classifier on raw sentences and return its logits on the CPU.

    Args:
        sentences: List of sentence strings.

    Returns:
        The first model output (the logits), detached and moved to the CPU.
    """
    # Evaluation mode.
    model.eval()

    # Build id and attention-mask tensors and move them to the selected device.
    input_ids, attention_mask = convert_input_data(sentences)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(input_ids,
                        token_type_ids=None,
                        attention_mask=attention_mask)

    # First output holds the logits; hand them back on the CPU.
    return outputs[0].detach().cpu()

In [29]:
# Spot check on a training sentence whose gold label is "2".
logits = test_sentences(['啊，我想起来了，去年李成日也托运过。']) # expected label: 2
# Class names in index order, used to map the argmax index back to a label.
num_list = ["1", "2", "3", "4"]

print(logits)
# Print the label whose logit is largest.
print(num_list[np.argmax(logits)])

tensor([[-2.2805,  6.0634, -2.7965, -2.3935]])
2


In [30]:
# Spot check on a training sentence whose gold label is "4".
logits = test_sentences(['路上要用的东西放在手提包里，这样用起来方便。']) # expected label: 4
# Class names in index order, used to map the argmax index back to a label.
num_list = ["1", "2", "3", "4"]

print(logits)
# Print the label whose logit is largest.
print(num_list[np.argmax(logits)])

tensor([[-2.1989,  0.5918, -3.2155,  5.9888]])
4


In [31]:
import torch.nn.functional as F

In [32]:
def predict_sentence():
    """Prompt for one sentence and print each class's predicted probability.

    Reads the sentence from standard input, classifies it with
    test_sentences(), and prints one ``label : percent%`` line per entry of
    the global ``num_list``.
    """
    sentence = input('Input Sentence')
    scores = test_sentences([sentence])[0]
    probabilities = F.softmax(scores, dim=0).numpy()

    for class_name, prob in zip(num_list, probabilities):
        print(f"{class_name} : {prob*100:.2f}%")

In [33]:
# Interactive check: type a sentence at the prompt to see per-class probabilities.
predict_sentence()

Input Sentence啊，我想起来了，去年李成日也托运过。
1 : 0.02%
2 : 99.94%
3 : 0.01%
4 : 0.02%


In [34]:
# Second interactive check with another sentence typed at the prompt.
predict_sentence()

Input Sentence路上要用的东西放在手提包里，这样用起来方便。
1 : 0.03%
2 : 0.45%
3 : 0.01%
4 : 99.51%
