# Dev-Matching challenge에 사용한 Multi-classification Bert

이 문제는 hashcode의 질문 데이터를 5가지 언어 중 하나로 분류하는 작업으로, 텍스트 다중 클래스 분류(multi-class text classification) 문제에 해당합니다. 텍스트는 순서가 있는 순차(sequence) 데이터이므로, 모델이 앞에 나온 문맥 정보를 함께 반영할 수 있어야 합니다.  
이런 데이터에는 주로 Recurrent(순환) 모델을 사용하지만, Recurrent 모델은 입력이 길어질수록 앞부분의 정보를 끝까지 기억하지 못하는 한계가 있었습니다(기울기 소실).  
그래서 미리 학습된(pre-trained) 가중치를 불러와 미세 조정(fine-tuning)하는 BERT 모델을 선택했습니다. BERT는 순환 구조가 아닌 self-attention 기반 모델이라 긴 문장에서도 기울기 소실 문제가 상대적으로 적고, 방대한 데이터로 사전 학습되었기 때문에 범용적으로 활용할 수 있습니다.  
추가로, pre-trained 모델은 ETRI에서 제공하는 어절 단위로 학습된 TensorFlow용 모델(체크포인트)을 사용했습니다.   
링크: http://aiopen.etri.re.kr/service_dataset.php

In [0]:
import numpy as np 
import pandas as pd
import re
import gc
import os
import fileinput
import string
import tensorflow as tf
import zipfile
import datetime
import sys

In [0]:
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, roc_auc_score



In [1]:
# optimizer,tokenization 등의 py파일을 불러옴
!wget https://raw.githubusercontent.com/google-research/bert/master/modeling.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/optimization.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/run_classifier.py 
# !wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py 


--2020-02-06 03:51:53--  https://raw.githubusercontent.com/google-research/bert/master/modeling.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 37922 (37K) [text/plain]
Saving to: ‘modeling.py’


2020-02-06 03:51:53 (35.6 MB/s) - ‘modeling.py’ saved [37922/37922]

--2020-02-06 03:52:00--  https://raw.githubusercontent.com/google-research/bert/master/optimization.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6258 (6.1K) [text/plain]
Saving to: ‘optimization.py’


2020-02-06 03:52:01 (113 MB/s) - ‘optimization.py’ saved [6258/6258]

-

In [5]:
!wget https://raw.githubusercontent.com/dbsgh3344/dev_matching/master/tokenization.py

--2020-02-06 03:58:36--  https://raw.githubusercontent.com/dbsgh3344/dev_matching/master/tokenization.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12861 (13K) [text/plain]
Saving to: ‘tokenization.py’


2020-02-06 03:58:36 (155 MB/s) - ‘tokenization.py’ saved [12861/12861]



In [6]:
import tokenization

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from sklearn.metrics import classification_report

import modeling
import optimization
import run_classifier


Using TensorFlow backend.





In [0]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

--2020-02-05 08:47:39--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.195.128, 2607:f8b0:400e:c07::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.195.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2020-02-05 08:47:42 (152 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [407727028/407727028]



In [8]:
# Folder that will hold the output files; OUTPUT_DIR is later passed to the
# run configuration as the estimator's model_dir.
folder = 'model_folder'
OUTPUT_DIR = f'{folder}/outputs'
print(f'>> Model output directory: {OUTPUT_DIR}')
# Name of the Google BERT release downloaded above. In this notebook it is
# only read again to derive DO_LOWER_CASE.
BERT_MODEL = 'uncased_L-12_H-768_A-12'

>> Model output directory: model_folder/outputs


# 데이터불러오기

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Load the training data from the mounted Google Drive.
traindf = pd.read_csv('/content/drive/My Drive/제목없는 폴더/dev_matching/train.csv') 
# Drop every row that contains a missing value.
traindf = traindf.dropna()
print(traindf.shape)
# Display the last rows as a quick sanity check.
traindf.tail()

(2591, 3)


Unnamed: 0,title,content,label
2587,$$$ 슬라이드 질문입니다.,안녕하세요.\n다름이아니라 \n$$$로 버튼을 클릭하면 \n오른쪽에서 왼쪽으로 들어...,4
2588,$$$ 메인 클래스,프로그램 전체 메인 클래스를 하나 만들고 메인에서 메뉴를 선택하면 세가지 각각 다른...,3
2589,$$$ sparse matrix 질문합니다!,* sparse matrix: 0이 아닌 값을 가진 원소들을 ordered list...,1
2590,$$$ models 폴더의 depth가 일반적이지 않을 경우 migration하는 법,안녕하세요 $$$ + javascript를 이용해서 간단한 웹게임을 만들어보려고 하...,5
2591,허프만 코드를 사용한 파일 압축 질문입니다.,허프만 코딩을 사용해서 파일 압축을 하는 과제 도중에 \n입력받은 파일 데이터를 이...,2


In [0]:
# Split the text column and the label column into X_train and y_train.
traindf = traindf.dropna() # drop rows that contain NaN values
X_train = traindf['content']

# The fitted encoder is kept so predictions can be mapped back to the
# original label values later with inverse_transform.
encoders = LabelEncoder()
y_train = encoders.fit_transform(traindf['label']) # re-map the original labels 1-5 to 0-4

In [0]:
X_train

0            ###발생하는 문제 및 실행환경\nwait()과 sleep()의 차이점은 뭔가요
1       1.헤더에, 사용할 멤버변수가 담긴 헤더 파일이 Include 되어있습니다.\n예를...
2       안녕하세요.\n\n어떻게 다대다 관계 쿼리를 해야하나요? 예를들어, `product...
3       ```\nparseFloat('geoff') == NaN;\n\nparseFloat...
4       ```\n>>>dict['name']\n胡安·马塔\n>>>json.dumps(dic...
                              ...                        
2587    안녕하세요.\n다름이아니라 \n$$$로 버튼을 클릭하면 \n오른쪽에서 왼쪽으로 들어...
2588    프로그램 전체 메인 클래스를 하나 만들고 메인에서 메뉴를 선택하면 세가지 각각 다른...
2589    * sparse matrix: 0이 아닌 값을 가진 원소들을 ordered list...
2590    안녕하세요 $$$ + javascript를 이용해서 간단한 웹게임을 만들어보려고 하...
2591    허프만 코딩을 사용해서 파일 압축을 하는 과제 도중에 \n입력받은 파일 데이터를 이...
Name: content, Length: 2591, dtype: object

In [0]:
# Check the length (in characters) of the longest training text.
max([len(i) for i in X_train])

28756

In [0]:
X_train.shape,y_train.shape

((2591,), (2591,))

In [0]:
# Load the test data and keep only the text column.
X_test = pd.read_csv('/content/drive/My Drive/제목없는 폴더/dev_matching/test.csv')
X_test = X_test['content']

print(X_test.shape)
X_test.head()

(500,)


0    $$$의 경우는 \n\n```\ndef init_spin_array(rows, co...
1    코드를 실행시키면 $("#search").autocomplete  라인에 \n\n ...
2    ```\nint arr[]= { 1, 2, 4, 8, 16 };\nint* ptr=...
3    \n35634646이라는 숫자가 있을때 이걸 String으로 변환할때 35,634,...
4    ```\nimport sys\nfrom $$$.QtWidgets import QAp...
Name: content, dtype: object

In [0]:
max([len(i) for i in X_test])

35210

# 모델학습

In [0]:
def create_examples(lines, set_type, labels=None):
    """Wrap raw texts as ``run_classifier.InputExample`` objects for BERT.

    Every example is a single-sentence input: the text goes into ``text_a``
    and ``text_b`` is always ``None``.

    Args:
        lines: Iterable of input texts, one example per element.
        set_type: Name of the split. Its string form is stored as the
            ``guid`` of every example; the value ``'train'`` selects the
            labelled branch.
        labels: Iterable of labels paired with ``lines`` in order. Required
            when ``set_type`` is ``'train'``; ignored for any other split.

    Returns:
        A list of ``run_classifier.InputExample``. For ``'train'`` each
        label is stored as ``str(label)``; for every other split the
        placeholder label ``'0'`` is used.

    Raises:
        TypeError: If ``set_type`` is ``'train'`` but ``labels`` is ``None``.
    """
    guid = f'{set_type}'

    if guid != 'train':
        # Unlabelled split: a placeholder label keeps the example format uniform.
        return [
            run_classifier.InputExample(guid=guid, text_a=line, text_b=None, label='0')
            for line in lines
        ]

    if labels is None:
        # Fail early with a readable message instead of an opaque error from zip().
        raise TypeError("labels must be provided when set_type is 'train'")

    return [
        run_classifier.InputExample(guid=guid, text_a=line, text_b=None, label=str(label))
        for line, label in zip(lines, labels)
    ]

In [0]:
# Model Hyper Parameters
# Folder on Google Drive that holds the pretrained ETRI KorBERT files
# (vocab, config and checkpoint are all resolved relative to it below).
bert_pretrained_folder = '/content/drive/My Drive/금융문자/모델링/model_KorBERT(ETRI)/4_bert_download_004_bert_eojeol_tensorflow/004_bert_eojeol_tensorflow/'
# bert_pretrained_folder = '/content/drive/My Drive/금융문자/모델링/model_KorBERT(ETRI)/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/'
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 1e-5
NUM_TRAIN_EPOCHS = 6
# Fraction of the total training steps used as warm-up steps.
WARMUP_PROPORTION = 0.1
# Sequence length passed to feature conversion and to the input functions.
MAX_SEQ_LENGTH = 200
# Model configs
SAVE_CHECKPOINTS_STEPS = 100000 # if you wish to fine-tune a model on a larger dataset, use a larger interval
# each checkpoint weighs about 1.5 GB
ITERATIONS_PER_LOOP = 100000
# Passed to TPUConfig as num_shards.
NUM_TPU_CORES = 8
VOCAB_FILE = os.path.join(bert_pretrained_folder, 'vocab.korean.rawtext.list') # vocabulary provided by ETRI
CONFIG_FILE = os.path.join(bert_pretrained_folder, 'bert_config.json') # model config provided by ETRI
INIT_CHECKPOINT = os.path.join(bert_pretrained_folder, 'model.ckpt-56000') # checkpoint provided by ETRI
# INIT_CHECKPOINT = os.path.join('/content/drive/My Drive/', 'model.ckpt-1295')
# NOTE(review): this flag is derived from BERT_MODEL, the name of the English
# uncased release, while the vocab and checkpoint above are the ETRI Korean
# files — confirm that lower-casing is really intended for this tokenizer.
DO_LOWER_CASE = BERT_MODEL.startswith('uncased')


In [0]:
# Class names handed to BERT: the strings "0" .. "n-1", one per distinct
# label found in the training frame.
label_list = list(map(str, range(traindf['label'].nunique())))
tokenizer = tokenization.FullTokenizer(
    vocab_file=VOCAB_FILE,
    do_lower_case=DO_LOWER_CASE,
)

# Wrap the training texts and their encoded labels as single-sentence
# classification examples.
train_examples = create_examples(X_train, 'train', labels=y_train)

# Training runs on GPU, so no cluster resolver is needed. TPUEstimator also
# supports CPU and GPU, so a separate tf.estimator.Estimator is not required.
tpu_cluster_resolver = None

# Run configuration: output directory, checkpoint interval and TPU settings.
run_config = tf.contrib.tpu.RunConfig(
    model_dir=OUTPUT_DIR,
    cluster=tpu_cluster_resolver,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        num_shards=NUM_TPU_CORES,
        iterations_per_loop=ITERATIONS_PER_LOOP,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2,
    ),
)



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [0]:
# Total number of training steps:
# (number of examples / batch size) * epochs, truncated to an integer.
num_train_steps = int(len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)

# Number of warm-up steps: a fixed proportion of the total, truncated.
# It is handed to the model function builder as num_warmup_steps.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)


In [0]:

# Build the model function from the BERT config file, the initial checkpoint
# and the training schedule computed earlier.
model_fn = run_classifier.model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels=len(label_list),
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False, # If False, training falls back to CPU or GPU, depending on what is available
    # NOTE(review): one-hot embeddings are presumably meant for TPU runs;
    # confirm this is intended with use_tpu=False.
    use_one_hot_embeddings=True)


# Estimator used for both training and prediction in this notebook.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False, # If False, training falls back to CPU or GPU, depending on what is available
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)

INFO:tensorflow:Using config: {'_model_dir': 'model_folder/outputs', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fdc52565e80>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100000, num_shards=8, num_cores_per_repli

In [0]:

# Fine-tune the pretrained checkpoint on the labelled training examples.
print('Please wait...')

# Prepare the training data: convert the examples into model input features
# of length MAX_SEQ_LENGTH.
train_features = run_classifier.convert_examples_to_features(
    train_examples,
    label_list,
    MAX_SEQ_LENGTH,
    tokenizer,
)

print('>> Started training at {} '.format(datetime.datetime.now()))
print('  Num examples = {}'.format(len(train_examples)))
print('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
tf.logging.info("  Num steps = %d", num_train_steps)

# The stock run_classifier input pipeline feeds the estimator during
# training; the same estimator is used for prediction afterwards.
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True,
)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print('>> Finished training at {}'.format(datetime.datetime.now()))


Please wait...

INFO:tensorflow:Writing example 0 of 2591
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train
INFO:tensorflow:tokens: [CLS] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ᄎ ᅡ ᄋ ᅵ ᄌ ᅥ ᆷ ᄋ ᅳ ᆫ _ [UNK] [SEP]
INFO:tensorflow:input_ids: 2 1 1 1 1 1 1 23205 22470 23109 23158 24193 23118 23162 23109 23807 16007 9 1 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

# predicting

In [0]:

def input_fn_builder(features, seq_length, is_training, drop_remainder,
                     batch_size=500):
  """Creates an `input_fn` closure to be passed to TPUEstimator.

  In this notebook it builds the input for prediction. It takes the same
  four leading arguments as the `run_classifier.input_fn_builder` call used
  for training.

  Args:
    features: Iterable of feature objects exposing `input_ids`, `input_mask`,
      `segment_ids` and `label_id`. It is consumed once, up front.
    seq_length: Length of every id/mask/segment sequence; it fixes the second
      dimension of the constant tensors.
    is_training: If true, the dataset is repeated and shuffled.
    drop_remainder: Forwarded to `Dataset.batch`.
    batch_size: Batch size used when the estimator does not supply one via
      `params["batch_size"]`. Defaults to 500, the value that used to be
      hard-coded.

  Returns:
    A function `input_fn(params)` that returns a batched `tf.data.Dataset`
    of dicts with the keys "input_ids", "input_mask", "segment_ids" and
    "label_ids".
  """

  all_input_ids = []
  all_input_mask = []
  all_segment_ids = []
  all_label_ids = []

  # Single pass over the features so that any iterable is accepted.
  for feature in features:
    all_input_ids.append(feature.input_ids)
    all_input_mask.append(feature.input_mask)
    all_segment_ids.append(feature.segment_ids)
    all_label_ids.append(feature.label_id)

  num_examples = len(all_input_ids)

  def input_fn(params):
    """The actual input function."""
    # Prefer the batch size supplied by the estimator; `params` can be empty
    # (e.g. for predict without a configured predict batch size), in which
    # case the builder's own default is used.
    effective_batch_size = (params or {}).get("batch_size", batch_size)

    d = tf.data.Dataset.from_tensor_slices({
        "input_ids":
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_mask":
            tf.constant(
                all_input_mask,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "segment_ids":
            tf.constant(
                all_segment_ids,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "label_ids":
            tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),
    })

    if is_training:
      d = d.repeat()
      d = d.shuffle(buffer_size=100)

    d = d.batch(batch_size=effective_batch_size, drop_remainder=drop_remainder)
    return d

  return input_fn

In [0]:
X_test

0      $$$의 경우는 \n\n```\ndef init_spin_array(rows, co...
1      코드를 실행시키면 $("#search").autocomplete  라인에 \n\n ...
2      ```\nint arr[]= { 1, 2, 4, 8, 16 };\nint* ptr=...
3      \n35634646이라는 숫자가 있을때 이걸 String으로 변환할때 35,634,...
4      ```\nimport sys\nfrom $$$.QtWidgets import QAp...
                             ...                        
495    \n**String date_s = "2011-01-18 00:00:00.0";**...
496    while(1){\n\n\t\t\t\tprintf("숫자를 입력해주세요 ");\n\...
497    66을 입력하면 루프를 빠져나와 프로그램 종료하고싶은데 자꾸 디버그 에러가 나네요;...
498    \n 다음 코드에서 최종적으로 uni.i가 갖는 값은 어떻게 될까? 16진수로 답해...
499    ![이미지][1]\n\n\n  [1]: https://res.cloudinary.c...
Name: content, Length: 500, dtype: object

In [0]:
# Wrap the test texts as examples; no labels are passed for the 'test' split.
predict_examples = create_examples(X_test, 'test')

# Convert the examples into model input features of length MAX_SEQ_LENGTH,
# using the same label list and tokenizer as for training.
predict_features = run_classifier.convert_examples_to_features(predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer)


INFO:tensorflow:Writing example 0 of 500
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test
INFO:tensorflow:tokens: [CLS] [UNK] ᄀ ᅧ ᆼ ᄋ ᅮ ᄂ ᅳ ᆫ _ ` ` ` _ d e f_ in it_ s p in_ ar ra y ( ro w s ,_ col s ) :_ ret u r n _ n p . ran d om . ch o ic e ( ( - 1 ,_ 1 ) ,_ s i z e = ( ro w s ,_ col s ) )_ ` ` ` _ ᄋ ᅵ ᄅ ᅥ ᆫ _ ᄉ ᅵ ᆨ ᄋ ᅳ ᄅ ᅩ _ ᄒ ᅡ ᄆ ᅧ ᆫ _ [UNK] n * n _ [UNK] [UNK] ᄉ ᅥ ᆼ ᄇ ᅮ ᆫ ᄋ ᅵ _ [UNK] - 1_ ᄃ ᅮ ᆯ _ ᄌ ᅮ ᆼ _ ᄒ ᅡ ᄂ ᅡ ᄅ ᅳ ᆯ _ [UNK] [UNK] ᄀ ᅥ ᆺ ᄋ ᅳ ᄅ ᅩ _ ᄆ ᅡ ᆫ ᄃ ᅳ ᆯ _ ᄉ ᅮ _ [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ᄀ ᅡ ᆫ ᄃ ᅡ ᆫ ᄒ ᅵ _ [UNK] [UNK] $ $ $ ᄂ ᅳ ᆫ _ ᄎ ᅩ ᄇ ᅩ ᄅ ᅡ ᄉ ᅥ _ ᄀ ᅳ ᄅ ᅥ ᆫ ᄌ ᅵ _ [UNK] ᄎ ᅥ ᆺ _ [UNK] [SEP]
INFO:tensorflow:input_ids: 2 1 18504 26870 25658 23109 27366 16534 23807 16007 9 737 737 737 9 396 239 4469 805 6102 344 477 3539 1159 3733 726 15 3359 894 344 8 5303 344 19 267 6930 390 789 472 9 472 477 42 4533 396 3171 42 2464 424 1865 239 15 15 121 52 8 52 19 8 344 356 1804 239 1001 15 3359 894 344 8 5303 344 19 66 737 737 737 9 23109 23158 20666 23118 16007 9 22203

In [0]:
# Build the prediction input with the notebook's own input_fn_builder
# (not run_classifier's): no shuffling/repeating, and the last partial batch
# is kept so every test row gets a prediction.
predict_input_fn = input_fn_builder(
    features=predict_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

# NOTE(review): the model_fn log lines only show up once `result` is iterated
# in the next cell, so predict() presumably returns a lazy generator — confirm.
result = estimator.predict(input_fn=predict_input_fn)

In [0]:

# Predicted class index for each test example: the arg-max over its
# 'probabilities' entry.
preds = [np.argmax(row['probabilities']) for row in result]

{}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (?, 200)
INFO:tensorflow:  name = input_mask, shape = (?, 200)
INFO:tensorflow:  name = label_ids, shape = (?,)
INFO:tensorflow:  name = segment_ids, shape = (?, 200)
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30797, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encode

In [0]:
preds[:10]

[4, 3, 0, 2, 4, 2, 4, 3, 2, 2]

In [0]:
# Map the predicted class indices back to the original label values with the
# encoder that was fitted on the training labels.
y_pred = encoders.inverse_transform(preds)

# 파일 제출

In [0]:
submissions = pd.read_csv('/content/drive/My Drive/제목없는 폴더/dev_matching/sample.csv')

In [0]:
submissions['label'] = y_pred

In [0]:
submissions

Unnamed: 0,label
0,5
1,4
2,1
3,3
4,5
...,...
495,3
496,1
497,1
498,4


In [0]:
# Distribution of the predicted labels in the submission.
submissions['label'].value_counts()

3    149
5    107
4     93
2     78
1     73
Name: label, dtype: int64

In [0]:
submissions.to_csv('/content/drive/My Drive/제목없는 폴더/dev_matching/submission/bert_koberttokenizer_eojeol___len200_testlen200_epoch6.csv',index=False)