# 데이콘 - 한국어 문장 관계 분류 경진대회

0. 데이터 로드

In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 32.8 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 35.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 38.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [None]:
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from sklearn import preprocessing
from transformers import AutoTokenizer, TFGPT2Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Directory that holds the competition CSV files. Change this single value
# when the notebook is run somewhere the files are not under /content.
DATA_DIR = '/content'

# Competition train/test tables and the sample submission file.
train = pd.read_csv(os.path.join(DATA_DIR, 'train_data.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test_data.csv'))
submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [None]:
# Show the first five rows to check the column layout.
train.head(n=5)

Unnamed: 0,index,premise,hypothesis,label
0,0,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",씨름의 여자들의 놀이이다.,contradiction
1,1,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",자작극을 벌인 이는 3명이다.,contradiction
2,2,이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다.,예측적 범죄예방 시스템 구축하고 고도화하는 것은 목적이 있기 때문이다.,entailment
3,3,광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민 보호에 ...,원주민들은 종합대책에 만족했다.,neutral
4,4,"진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 상황에서는...",이런 상황에서 책임 있는 모습을 보여주는 기업은 아주 드물다.,neutral


1. 전처리

In [None]:
# Missing-value count per column.
train.isna().sum()

index         0
premise       0
hypothesis    0
label         0
dtype: int64

In [None]:
# Number of examples per class, to check the label balance.
train.loc[:, 'label'].value_counts()

entailment       8561
contradiction    8489
neutral          7948
Name: label, dtype: int64

In [None]:
# Longest premise, measured in characters (not tokenizer tokens).
train['premise'].apply(len).max()

90

In [None]:
# Longest hypothesis, measured in characters (not tokenizer tokens).
train['hypothesis'].apply(len).max()

103

In [None]:
# Fixed length that every encoded premise/hypothesis pair is padded or cut to
# (passed to the tokenizer as `max_length`).
# NOTE(review): the maxima measured above are per-column character counts, not
# token counts of the combined pair — confirm that 100 covers enough of the data.
max_seq_len = 100

In [None]:
# Total number of labelled rows before the train/validation split.
train.shape[0]

24998

In [None]:
# The first TRAIN_SIZE rows are used for training and the remainder is held out
# for validation. Rows are taken in file order; no shuffling is applied.
TRAIN_SIZE = 20000

# Guarded so that re-running this cell after `train` has already been cut down
# does not overwrite `valid` with an empty frame.
if len(train) > TRAIN_SIZE:
    valid = train.iloc[TRAIN_SIZE:]
    train = train.iloc[:TRAIN_SIZE]

3. RoBERTa-base (klue/roberta-base) 0.78

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import urllib.request
from sklearn import preprocessing
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, AutoTokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Load the tokenizer that belongs to the checkpoint used for fine-tuning.
checkpoint_name = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [None]:
def convert_examples_to_features(sent_list1, sent_list2, max_seq_len, tokenizer):
    """Encode sentence pairs into fixed-length integer model inputs.

    Args:
        sent_list1: Sized iterable of first sentences (its ``len`` drives the
            progress bar).
        sent_list2: Iterable of second sentences, paired positionally with
            ``sent_list1``.
        max_seq_len: Length every encoded pair is padded or truncated to.
        tokenizer: Object exposing ``encode_plus(text, text_pair, ...)`` whose
            result contains ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'``.

    Returns:
        Tuple ``(input_ids, attention_masks, token_type_ids)`` of integer NumPy
        arrays with one row per sentence pair.
    """
    input_ids, attention_masks, token_type_ids = [], [], []

    for sent1, sent2 in tqdm(zip(sent_list1, sent_list2), total=len(sent_list1)):
        encoding_result = tokenizer.encode_plus(
            sent1,
            sent2,
            max_length=max_seq_len,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # Request truncation explicitly instead of relying on the implicit
            # fallback that the tokenizer warns about.
            truncation=True,
        )

        input_ids.append(encoding_result['input_ids'])
        attention_masks.append(encoding_result['attention_mask'])
        token_type_ids.append(encoding_result['token_type_ids'])

    # Stack the per-example lists into one 2-D integer array per input kind.
    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)

    return (input_ids, attention_masks, token_type_ids)

In [None]:
# Encode the training premise/hypothesis pairs.
X_train = convert_examples_to_features(
    sent_list1=train['premise'],
    sent_list2=train['hypothesis'],
    max_seq_len=max_seq_len,
    tokenizer=tokenizer,
)

  0%|          | 0/20000 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 20000/20000 [00:05<00:00, 3955.21it/s]


In [None]:
# Encode the validation premise/hypothesis pairs.
X_valid = convert_examples_to_features(
    sent_list1=valid['premise'],
    sent_list2=valid['hypothesis'],
    max_seq_len=max_seq_len,
    tokenizer=tokenizer,
)

100%|██████████| 4998/4998 [00:01<00:00, 4105.11it/s]


In [None]:
# X_train is (input_ids, attention_masks, token_type_ids); take the id row of
# the first example only, so the printed length is that of one encoding rather
# than the number of training examples.
input_id = X_train[0][0]

print('단어에 대한 정수 인코딩 :',input_id)
print('각 인코딩의 길이 :', len(input_id))
print('정수 인코딩 복원 :',tokenizer.decode(input_id))

단어에 대한 정수 인코딩 : [[    0 14441  2073 ...     1     1     1]
 [    0  3840  2073 ...     1     1     1]
 [    0  8345  3627 ...     1     1     1]
 ...
 [    0  4081  2170 ...     1     1     1]
 [    0  3660  2299 ...     1     1     1]
 [    0  3957 12095 ...     1     1     1]]
각 인코딩의 길이 : 20000
정수 인코딩 복원 : [CLS] 씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이 넓고 평평한 백사장이나 마당에서 모여 서로 힘과 슬기를 겨루는 것이다. [SEP] 씨름의 여자들의 놀이이다. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [None]:
# Encode the test premise/hypothesis pairs.
X_test = convert_examples_to_features(
    sent_list1=test['premise'],
    sent_list2=test['hypothesis'],
    max_seq_len=max_seq_len,
    tokenizer=tokenizer,
)

100%|██████████| 1666/1666 [00:00<00:00, 4177.48it/s]


In [None]:
# Pull the string labels of both splits out as plain Python lists.
train_label, valid_label = (frame['label'].tolist() for frame in (train, valid))

In [None]:
# Map string labels to integer class ids; the encoder is fitted on the training labels only.
idx_encode = preprocessing.LabelEncoder().fit(train_label)

y_train, y_valid = idx_encode.transform(train_label), idx_encode.transform(valid_label)

# Label -> id lookup, printed for reference.
class_names = list(idx_encode.classes_)
label_idx = dict(zip(class_names, idx_encode.transform(class_names)))
print(label_idx)

{'contradiction': 0, 'entailment': 1, 'neutral': 2}


In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.initializers import TruncatedNormal
from transformers import TFAutoModel

class TFBertForSequenceClassification(Model):
    """Pretrained transformer encoder followed by a softmax classification layer.

    Args:
        model_name: Checkpoint identifier passed to ``TFAutoModel.from_pretrained``.
        num_labels: Number of output classes. Defaults to 3.
    """

    def __init__(self, model_name, num_labels=3):
        super().__init__()
        # The headless base model is loaded, with weights converted from the
        # PyTorch checkpoint. It needs no label count of its own because the
        # classification layer is defined below.
        self.bert = TFAutoModel.from_pretrained(model_name, from_pt=True)
        self.classifier = Dense(num_labels,
                                # stddev must be given by keyword: the first positional
                                # argument of TruncatedNormal is the mean.
                                kernel_initializer=TruncatedNormal(stddev=0.02),
                                activation='softmax')

    def call(self, inputs, training=False):
        """Return class probabilities for a batch.

        Args:
            inputs: Sequence ``(input_ids, attention_mask, token_type_ids)``.
            training: Training-mode flag supplied by Keras; forwarded to the
                encoder so its training-only layers (e.g. dropout) follow the
                current mode.

        Returns:
            Softmax output of the classification layer, one row per example.
        """
        input_ids, attention_mask, token_type_ids = inputs
        # NOTE(review): token_type_ids are forwarded unchanged. Confirm the
        # checkpoint's config accepts the segment ids the tokenizer produces
        # (RoBERTa-style configs often define a single token type).
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            training=training)
        # Index 1 of the base model's output is its pooled sequence representation.
        pooled_output = outputs[1]
        return self.classifier(pooled_output)

In [None]:
# Adam with a small learning rate for fine-tuning the pretrained encoder.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Build the classifier and compile it for integer (non one-hot) class labels.
model = TFBertForSequenceClassification("klue/roberta-base")
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.decoder.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream 

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation accuracy has failed to improve by at least 0.001
# for 2 consecutive epochs.
early_stopping = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=2,
    mode="max",  # accuracy: higher is better
    # Without this, the model keeps the weights of the last epoch that ran
    # rather than those of the epoch with the best validation accuracy.
    restore_best_weights=True)

In [None]:
# Fine-tune for up to 4 epochs, evaluating on the held-out split after each one.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=4,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping],
)

Epoch 1/4
Epoch 2/4
Epoch 3/4


<keras.callbacks.History at 0x7f350dd2d3d0>

In [None]:
# Class probabilities for every encoded test pair.
results = model.predict(x=X_test)

In [None]:
# Index of the highest-probability class for each test example,
# computed in one vectorized call instead of a per-row Python loop.
preds = np.argmax(results, axis=1)

In [None]:
# Translate predicted class ids back to label strings with the fitted encoder
# itself, rather than depending on the key order of a separately built dict.
out = idx_encode.inverse_transform(preds).tolist()
out[:3]

['contradiction', 'neutral', 'neutral']

In [None]:
# Pair each test row's `index` value with its predicted label.
results = test[['index']].assign(label=out)
results.head()

Unnamed: 0,index,label
0,0,contradiction
1,1,neutral
2,2,neutral
3,3,neutral
4,4,entailment


In [None]:
# Write the predictions to disk without the DataFrame's row index.
results.to_csv(path_or_buf="submission.csv", index=False)