In [1]:
import gdown
file_id = '1CSfABalJZpKpwSUcGHSWzCplfmdHREms'
download_url = f'https://drive.google.com/uc?id={file_id}'
gdown.download(download_url, 'konli.zip', quiet=False)
!unzip konli.zip

Downloading...
From (original): https://drive.google.com/uc?id=1CSfABalJZpKpwSUcGHSWzCplfmdHREms
From (redirected): https://drive.google.com/uc?id=1CSfABalJZpKpwSUcGHSWzCplfmdHREms&confirm=t&uuid=8928cce4-8831-4cab-bbe8-dba148efb3e5
To: /content/konli.zip
100%|██████████| 41.7M/41.7M [00:00<00:00, 66.5MB/s]


Archive:  konli.zip
  inflating: xnli.dev.ko.tsv         
  inflating: snli_1.0_train.kor.tsv  
  inflating: multinli.train.ko.tsv   
  inflating: xnli.test.ko.tsv        


In [2]:
# 필요라이브러리
import pandas as pd

In [3]:
TRAIN_MULTINLI_DF = pd.read_csv('multinli.train.ko.tsv', sep='\t',on_bad_lines ='skip')
print(TRAIN_MULTINLI_DF['gold_label'].unique())
# entailment : 첫번째 문장이 두번째 문장을 의미
# contradiction : 첫번재 문장이 두번째 문장을 의미하자 않는 것
# neutral : 첫번째 문장이 두번째 문장을 의미하거나 반대하지 않는경우
TRAIN_MULTINLI_DF.tail()

['neutral' 'entailment' 'contradiction' nan]


Unnamed: 0,sentence1,sentence2,gold_label
385489,"분명히, 캘리포니아는 더 잘 할 수 있고, 더 잘해야 한다.",캘리포니아는 더 잘할 수 없다.,contradiction
385490,"한때 유럽에서 가장 아름다운 거리로 여겨졌는데, 이는 원래의 많은 건물들이 교체되었...",그래서 원래의 많은 건물들이 편의점으로 대체되었다.,neutral
385491,하우스보트는 영국 라지의 전성기의 아름답게 보존된 전통이다.,하우스보트의 전통은 영국 라지가 여전히 강해지는 동안 시작되었다.,entailment
385492,사망 기사는 그의 평론가의 신디케이트 TV 쇼에서 동료 검토 자 Roger Eber...,부고문은 아름다웠고 연예계에서의 그의 업적에 대해 현물로 쓰여졌다.,neutral
385493,"내가 해야 한다는 걸 알거나, 아니면 누가 하라고 하는 것보다 그녀를 밀고하는 것에...",남편이 요즘 너무 과로해서 이 근처에서 많은 일을 부탁할 용기가 나지 않는다.,neutral


In [4]:
TRAIN_SNLI_DF = pd.read_csv('snli_1.0_train.kor.tsv', sep='\t',on_bad_lines ='skip')
TRAIN_SNLI_DF.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,한 사람이 경쟁을 위해 말을 훈련시키고 있다.,neutral
1,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,한 사람이 식당에서 오믈렛을 주문하고 있다.,contradiction
2,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,사람은 야외에서 말을 타고 있다.,entailment
3,카메라에 웃고 손을 흔드는 아이들,그들은 부모님을 보고 웃고 있다,neutral
4,카메라에 웃고 손을 흔드는 아이들,아이들이 있다,entailment


In [5]:
TRAIN_XNLI_DF = pd.read_csv('xnli.dev.ko.tsv', sep='\t',on_bad_lines ='skip')
TRAIN_XNLI_DF.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,"그리고 그가 말했다, ""엄마, 저 왔어요.""",그는 학교 버스가 그를 내려주자마자 엄마에게 전화를 걸었다.,neutral
1,"그리고 그가 말했다, ""엄마, 저 왔어요.""",그는 한마디도 하지 않았다.,contradiction
2,"그리고 그가 말했다, ""엄마, 저 왔어요.""",그는 엄마에게 집에 갔다고 말했다.,entailment
3,내가 무엇을 위해 가고 있는지 또는 어떤 것을 위해 있는지 몰랐기 때문에 워싱턴의 ...,나는 워싱턴에 가본 적이 없어서 거기 배정을 받았을 때 그 장소를 찾으려다가 길을 ...,neutral
4,내가 무엇을 위해 가고 있는지 또는 어떤 것을 위해 있는지 몰랐기 때문에 워싱턴의 ...,워싱턴으로 진군하면서 해야 할 일이 무엇인지 정확히 알고 있었다.,contradiction


- 사용데이터셋 : KoNLI
- 모델 : BERT 모델 기반 자연어 추론 분류기 구축
- 자연어 문장을 주면...'entailment', 'contradiction','neutral' 중 하나로 분류
- 데이터 로딩, 전처리, 토큰화, 모델정의학습, 평가, 추론

- 데이터셋 : SNLI + XNLI 데이터를 결합하고 결측치 제거
- Bert모델에 사용할 BartTokenizer 로드
- 두 문장을 입력으로 받아서 Bert모델에 적합한 형태로 변환 작업
  - 함수구현
- 학습과 평가
- 베스트모델 로드
- 테스트데이터 평가
- 추론

In [6]:
import tensorflow as tf
import numpy as np
tf.random.set_seed(1234)
np.random.seed(1234)

# BASE PARAM

BATCH_SIZE = 32
NUM_EPOCHS = 3
MAX_LEN = 156

In [7]:
# Load Train dataset

train_data_snli = pd.read_csv('/content/snli_1.0_train.kor.tsv', header=0, delimiter = '\t', quoting = 3)
train_data_xnli = pd.read_csv('/content/multinli.train.ko.tsv', header=0, delimiter = '\t', quoting = 3)
dev_data_xnli = pd.read_csv('/content/xnli.dev.ko.tsv', header=0, delimiter = '\t', quoting = 3)

import pandas as pd

train_data_snli_xnli = pd.concat([train_data_snli,train_data_xnli])
train_data_snli_xnli = train_data_snli_xnli.dropna()
train_data_snli_xnli = train_data_snli_xnli.reset_index()

dev_data_xnli = dev_data_xnli.dropna()

print("Total # dataset: train - {}, dev - {}".format(len(train_data_snli_xnli), len(dev_data_xnli)))
# 30% 만사용
reduce_30_len = int(len(train_data_snli_xnli)*0.3)
train_data_snli_xnli = train_data_snli_xnli[:reduce_30_len]

Total # dataset: train - 942808, dev - 2490


In [8]:
# BERT 로드
from transformers import BertTokenizer, TFBartModel
# BERT 토크나이져 로드 전이학습
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',cashe_dir='bert_ckpt',do_lower_case=False)
# 두개의 문장을 토큰화하는 함수 정의
def bert_tokenizer(sent1, sent2, MAX_LEN):
  encoded_dict = tokenizer.encode_plus(
      text = sent1,         # 첫번째
      text_pair = sent2,    # 두번째
      add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
      max_length=MAX_LEN,
      padding = 'max_length',  # Pad & truncate all sentences.
      return_attention_mask = True
      )
  return encoded_dict['input_ids'], encoded_dict['attention_mask'], encoded_dict['token_type_ids']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



In [9]:
input_ids = []
attention_masks = []
token_type_ids = []


from tqdm import tqdm
for sent1, sent2 in tqdm(zip(train_data_snli_xnli['sentence1'], train_data_snli_xnli['sentence2']),
                         total = len(train_data_snli_xnli) ):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(sent1, sent2, MAX_LEN)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
    except Exception as e:
        print(e)
        print(sent1, sent2)
        pass


100%|██████████| 282842/282842 [01:55<00:00, 2444.49it/s]


In [10]:
train_snli_xnli_input_ids = np.array(input_ids, dtype=int)
train_snli_xnli_attention_masks = np.array(attention_masks, dtype=int)
train_snli_xnli_type_ids = np.array(token_type_ids, dtype=int)
train_snli_xnli_inputs = (train_snli_xnli_input_ids, train_snli_xnli_attention_masks, train_snli_xnli_type_ids)

In [11]:
print(train_snli_xnli_input_ids[0][:30])
print(train_snli_xnli_attention_masks[0][:30])
print(train_snli_xnli_type_ids[0][:30])
print(tokenizer.decode(train_snli_xnli_input_ids[0][:30]) )

[   101   9251  10622   9847  97802   8888  13890  33305   9379  25549
  12310   9619  11261   9150  12965  28188  66346    119    102   9954
  97802   8885 119202  10622  19905   9251  10622  10004 101440  14040]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1]
[CLS] 말을 탄 사람이 고장난 비행기 위로 뛰어오른다. [SEP] 한 사람이 경쟁을 위해 말을 훈련시


In [12]:
# 두개의 데이터셋을 Bert의 입력으로 토큰화...
# 훈련데이터처리, 검증데이터 처리

# 훈련데이터 처리  dev_data_xnli

input_ids = []
attention_masks = []
token_type_ids = []


from tqdm import tqdm
for sent1, sent2 in tqdm(zip(dev_data_xnli['sentence1'], dev_data_xnli['sentence2']),
                         total = len(dev_data_xnli) ):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(sent1, sent2, MAX_LEN)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
    except Exception as e:
        print(e)
        print(sent1, sent2)
        pass

dev_xnli_input_ids = np.array(input_ids, dtype=int)
dev_xnli_attention_masks = np.array(attention_masks, dtype=int)
dev_xnli_type_ids = np.array(token_type_ids, dtype=int)
dev_xnli_inputs = (dev_xnli_input_ids, dev_xnli_attention_masks, dev_xnli_type_ids)

100%|██████████| 2490/2490 [00:01<00:00, 1515.83it/s]


In [13]:
# 레이블을 숫자로 변환하는 함수
label_deict = dict(zip(train_data_snli['gold_label'].unique(),[2,1,0]))
def convert_int(label):
  return label_deict[label]

In [14]:
# 데이터 준비
train_data_snli_xnli['gold_label_int'] = train_data_snli_xnli['gold_label'].apply(lambda x : convert_int(x))
train_data_labels = np.array( train_data_snli_xnli['gold_label_int'], dtype=int )

dev_data_xnli['gold_label_int'] = dev_data_xnli['gold_label'].apply(lambda x : convert_int(x))
dev_data_labels = np.array( dev_data_xnli['gold_label_int'], dtype=int )

In [15]:
class TFBertClassifier(tf.keras.Model):
    def __init__(self, model_name, dir_path, num_class):
        super(TFBertClassifier, self).__init__()
        self.bert = TFBartModel.from_pretrained(model_name, cache_dir=dir_path)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_class,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),
                                                name="classifier")

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        return logits

In [16]:
cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',
                                  dir_path='bert_ckpt',
                                  num_class=3)

optimizer = tf.keras.optimizers.Adam(3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

You are using a model of type bert to instantiate a model of type bart. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartModel: ['bert.encoder.layer.10.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.intermediate.dense.weight', 'bert.encoder.layer.7.attention.self.query.weight', 'bert.encoder.layer.7.attention.self.value.weight', 'bert.encoder.layer.7.intermediate.dense.bias', 'bert.encoder.layer.3.attention.self.key.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.11.attention.self.query.weight', 'bert.encoder.layer.6.attention.self.value.weight', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.6.output.LayerNorm.weight', 'bert.encoder.layer.8.attention.self.key.bias', 'bert.encoder.layer.2.attention.self.key.bias', 'bert.encoder.layer.3.attention.self.value.bias', 'bert.encoder.layer.4.attention.self.value.bias', 'bert.encoder.layer.6.intermediate.dense.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.bi

In [17]:
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    'model2.keras.h5', monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

In [18]:
np.array(train_snli_xnli_inputs).shape, np.array(train_data_labels).shape

((3, 282842, 156), (282842,))

In [19]:
# 메모리 최적화
import gc
del train_snli_xnli_input_ids
del train_snli_xnli_attention_masks
del train_snli_xnli_type_ids
gc.collect()

0

In [20]:
history = cls_model.fit(train_snli_xnli_inputs, train_data_labels, epochs=NUM_EPOCHS,
            validation_data = (dev_xnli_inputs, dev_data_labels),
            batch_size=8, callbacks=[earlystop_callback, cp_callback])

Epoch 1/3




InvalidArgumentError: Graph execution error:

Detected at node Equal defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 738, in _run_callback

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 825, in inner

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 377, in dispatch_queue

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 250, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 748, in __init__

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-20-e89641dbd8a2>", line 1, in <cell line: 1>

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1804, in fit

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1398, in train_function

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1381, in step_function

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1370, in run_step

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1152, in train_step

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1246, in compute_metrics

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/compile_utils.py", line 620, in update_state

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/utils/metrics_utils.py", line 77, in decorated

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/metrics/base_metric.py", line 140, in update_state_fn

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/metrics/base_metric.py", line 722, in update_state

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/utils/metrics_utils.py", line 968, in sparse_categorical_matches

required broadcastable shapes
	 [[{{node Equal}}]] [Op:__inference_train_function_87360]