In [None]:
!pip install transformers
!pip install datasets

In [None]:
# Print the GPU status reported by the nvidia-smi command-line tool.
!nvidia-smi

In [4]:
import pandas as pd
import numpy as np
import os
from glob import glob
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random

import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Fix every random number generator the notebook touches so runs are repeatable.
seed = 1

# Python and NumPy generators.
random.seed(seed)
np.random.seed(seed)

# PyTorch generators: CPU, current CUDA device, and all CUDA devices (multi-GPU).
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Ask cuDNN for deterministic kernels and turn off its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [6]:
# Hugging Face checkpoint id used for both the tokenizer and the classifier.
PLM = 'klue/roberta-large'

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


In [None]:
# Single place to repoint the notebook at a different copy of the input data.
INPUT_DIR = '/content/drive/MyDrive/AI/Dacon/KLUE_NLI/input'

# Both CSV files carry an 'index' column, which becomes the DataFrame index.
train_df = pd.read_csv(f'{INPUT_DIR}/train_data.csv', index_col='index')
test_df = pd.read_csv(f'{INPUT_DIR}/test_data.csv', index_col='index')

# EDA

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Column dtypes and non-null counts for both frames, separated by a rule.
train_df.info() # 24,998 rows, no null values
print('-'*48)
test_df.info()  # 1,666 rows

In [None]:
# Summary statistics of the training frame.
train_df.describe()

## Label의 분포 확인

In [None]:
# Label distribution: share of each class as a pie chart (left) and raw counts
# as a bar chart (right).
f, ax = plt.subplots(1, 2, figsize=(18,8))
train_df['label'].value_counts().plot.pie(explode=[0.05, 0.05, 0.05], autopct="%.2f%%", ax=ax[0], shadow=True)
ax[0].set_title('Label')
ax[0].set_ylabel('')
# The column must be passed as x=...: in current seaborn the first positional
# parameter of countplot is `data`, so a positional 'label' collides with data=.
sns.countplot(x='label', data=train_df, ax=ax[1])
ax[1].set_title('Label')
plt.show()

## Text의 길이 분포 확인

In [None]:
# Character counts of premise and hypothesis, plus their sum, added as EDA-only
# columns on the training frame.
train_df['pre_length'] = train_df['premise'].str.len()
train_df['hypo_length'] = train_df['hypothesis'].str.len()
train_df['total_length'] = train_df['pre_length'].add(train_df['hypo_length'])
train_df.head()

In [None]:
# Same character-count columns for the test frame.
test_df['pre_length'] = test_df['premise'].str.len()
test_df['hypo_length'] = test_df['hypothesis'].str.len()
test_df['total_length'] = test_df['pre_length'].add(test_df['hypo_length'])
test_df.head()

In [None]:
# Summary statistics again, now including the length columns.
train_df.describe()

In [None]:
# Summary statistics of the test frame, including the length columns.
test_df.describe()

### Label별 Train data 분포

In [None]:
# Histogram of premise length, one panel per label.
# `height` is the current name of the per-facet size argument; the older `size`
# spelling is no longer accepted by seaborn.
grid = sns.FacetGrid(train_df, col='label', height=5, aspect=1.6)
grid.map(plt.hist, 'pre_length', alpha=.5, bins=25)
grid.add_legend()

In [None]:
# Histogram of hypothesis length, one panel per label.
# `height` is the current name of the per-facet size argument; the older `size`
# spelling is no longer accepted by seaborn.
grid = sns.FacetGrid(train_df, col='label', height=5, aspect=1.6)
grid.map(plt.hist, 'hypo_length', alpha=.5, bins=25)
grid.add_legend()

In [None]:
# Histogram of combined premise + hypothesis length, one panel per label.
# `height` is the current name of the per-facet size argument; the older `size`
# spelling is no longer accepted by seaborn.
grid = sns.FacetGrid(train_df, col='label', height=5, aspect=1.6)
grid.map(plt.hist, 'total_length', alpha=.5, bins=25)
grid.add_legend()

In [None]:
# The length columns were only needed for the plots above; remove them again
# and show the resulting shapes.
train_df = train_df.drop(columns=['pre_length', 'hypo_length', 'total_length'])
test_df = test_df.drop(columns=['pre_length', 'hypo_length', 'total_length'])
train_df.shape, test_df.shape

# 학습 데이터 구성

In [None]:
# Class name -> integer id, numbered in the order listed, and the inverse map.
label2id = {
    name: class_id
    for class_id, name in enumerate(("entailment", "contradiction", "neutral"))
}
id2label = {class_id: name for name, class_id in label2id.items()}
id2label

In [None]:
# Encode the string labels as integer class ids.
# Guarded so that re-running the cell is a no-op: pushing already-encoded
# integers through label2id would silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(train_df['label']):
    train_df['label'] = train_df['label'].map(label2id)

In [1]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    EarlyStoppingCallback
)

In [8]:
# Load the tokenizer that belongs to the PLM checkpoint and build a collator
# around it (DataCollatorWithPadding pads each batch as it is assembled).
tokenizer = AutoTokenizer.from_pretrained(PLM)
dynamic_padding = DataCollatorWithPadding(tokenizer = tokenizer)

AttributeError: 'BertTokenizerFast' object has no attribute 'special_tokens'

In [None]:
# The stratified 80/20 train/validation split below is disabled (commented out);
# the full training frame is used for training instead.
# NOTE(review): StratifiedShuffleSplit yields positional indices, so if this is
# re-enabled .iloc may be what is wanted rather than .loc; confirm against the
# frame's index.
# from sklearn.model_selection import StratifiedShuffleSplit

# shuffle = StratifiedShuffleSplit(n_splits=1, test_size=0.2)

# for train_idx, valid_idx in shuffle.split(train_df, train_df['label']):
#     train_dataset = train_df.loc[train_idx]
#     valid_dataset = train_df.loc[valid_idx]
train_dataset = train_df

In [None]:
# NOTE: this rebinds the name `Dataset`, shadowing the torch.utils.data.Dataset
# imported at the top of the notebook.
from datasets import Dataset

# The test frame's 'label' column is not used for inference. errors='ignore'
# keeps the cell re-runnable: a second run no longer raises KeyError.
test_df = test_df.drop(columns=['label'], errors='ignore')

# Convert the pandas frames to Hugging Face datasets. The isinstance guard makes
# a re-run a no-op instead of feeding an already-converted Dataset back into
# from_pandas.
if isinstance(train_dataset, pd.DataFrame):
    train_dataset = Dataset.from_pandas(train_dataset)
# valid_dataset = Dataset.from_pandas(valid_dataset)
test_dataset = Dataset.from_pandas(test_df)
print(train_dataset)
# print(valid_dataset)
test_dataset

In [None]:
def _encode_pairs(batch):
    """Tokenize a batch of premise/hypothesis pairs.

    Each premise is encoded together with its hypothesis as one paired input,
    with truncation enabled and without token_type_ids in the output.
    """
    return tokenizer(
        batch['premise'],
        batch['hypothesis'],
        truncation=True,
        return_token_type_ids=False,
    )


# Apply the same encoding to the train and test datasets, batch by batch.
tokenized_train_dataset = train_dataset.map(_encode_pairs, batched=True)
# tokenized_valid_dataset = valid_dataset.map(_encode_pairs, batched=True)
tokenized_test_dataset = test_dataset.map(_encode_pairs, batched=True)

In [None]:
# Return the listed columns as torch tensors. The test set has no 'label'
# column in its format list.
tokenized_train_dataset.set_format(type='torch', columns=[ 'input_ids', 'attention_mask', 'label'])
# tokenized_valid_dataset.set_format(type='torch', columns=[ 'input_ids', 'attention_mask', 'label'])
tokenized_test_dataset.set_format(type='torch', columns=[ 'input_ids', 'attention_mask'])

In [None]:
# Drop the raw text and index columns, then rename 'label' to 'labels' on the
# training set. The validation-set lines stay disabled along with the split.
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['premise', 'hypothesis', 'index'])
# tokenized_valid_dataset = tokenized_valid_dataset.remove_columns(['premise', 'hypothesis', 'index'])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(['premise', 'hypothesis', 'index'])
tokenized_train_dataset = tokenized_train_dataset.rename_column("label", "labels")
# tokenized_valid_dataset = tokenized_valid_dataset.rename_column("label", "labels")
# tokenized_train_dataset, tokenized_valid_dataset, tokenized_test_dataset

# 모델 학습

In [None]:
# Training hyperparameters: number of epochs, per-device train batch size,
# per-device evaluation batch size.
EPOCH, TRAIN_BATCH, VALID_BATCH = 5, 16, 32
LEARNING_RATE = 3e-5
# When False, the TrainingArguments and Trainer cells disable evaluation,
# best-model loading and early stopping.
eval_flag = False

In [None]:
# Trainer configuration. A checkpoint is saved every epoch (at most 3 are kept);
# the evaluation-related settings are switched on only when eval_flag is set.
# NOTE(review): `evaluation_strategy` is the older spelling of this argument;
# check it against the installed transformers version.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    # optimisation schedule
    num_train_epochs=EPOCH,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    # batch sizes per device
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=VALID_BATCH,
    # checkpointing
    save_strategy="epoch",
    save_total_limit=3,
    # evaluation / model selection
    evaluation_strategy="epoch" if eval_flag else "no",
    load_best_model_at_end=bool(eval_flag),
    metric_for_best_model='accuracy',
)

In [None]:
# Accuracy is computed directly with NumPy. The previous implementation fetched
# the 'xnli' metric through datasets.load_metric, which is deprecated and absent
# from recent datasets releases; that metric is plain accuracy, so the result
# is unchanged.
def compute_metrics(EvalPrediction):
    """Return the classification accuracy of one evaluation pass.

    Args:
        EvalPrediction: a pair that unpacks into (predictions, labels), where
            predictions holds one row of class scores per example and labels
            holds the reference class ids.

    Returns:
        dict with a single key "accuracy": the fraction of examples whose
        highest-scoring class equals the reference label. The key matches
        metric_for_best_model in the training arguments.
    """
    preds, labels = EvalPrediction
    preds = np.argmax(preds, axis=1)

    return {"accuracy": float(np.mean(preds == np.asarray(labels)))}


In [None]:
# Load the PLM checkpoint with a sequence-classification head of 3 outputs,
# one per NLI label.
model = AutoModelForSequenceClassification.from_pretrained(PLM, num_labels=3)

In [None]:

# Wire the model, data, collator and metric function into a Trainer.
# NOTE(review): in the visible notebook tokenized_valid_dataset is only created
# by commented-out code, so setting eval_flag = True raises NameError here
# until the validation-split cells are restored.
trainer = Trainer(
    # the instantiated 🤗 Transformers model to be trained
    model=model,
    args=training_args,  # training arguments, defined above
    train_dataset=tokenized_train_dataset,  # training dataset
    eval_dataset=tokenized_valid_dataset if eval_flag else None,
    compute_metrics=compute_metrics,  # define metrics function
    data_collator=dynamic_padding,
    tokenizer=tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)] if eval_flag else None
)

In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

# 추론

In [None]:
# Run inference on the tokenized test set; element 0 of the result is used
# below as the per-class scores.
outputs = trainer.predict(tokenized_test_dataset)

In [None]:
# Build the submission: take the highest-scoring class id per test row, convert
# it back to its label string, and write the table to CSV without the row index.
submission = pd.DataFrame({
    'index': test_df.index,
    'label': outputs.predictions.argmax(axis=1),
})
submission = submission.assign(label=submission['label'].map(id2label))
submission.to_csv('/content/drive/MyDrive/AI/Dacon/KLUE_NLI/output/submission_all_data.csv', index=False)

In [35]:
# Sanity check: tokenize one sentence and display the resulting encoding.
sen = tokenizer('안녕하세요')
sen

{'input_ids': [0, 5891, 2205, 5971, 2], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [40]:
# Sanity check: tokenize a list of sentences in one call.
# The inputs are defined here so the cell also works on a top-to-bottom run;
# otherwise `sentence` is only assigned in a later cell and this raises NameError.
sentence = ['안녕', '하이']
tokenizer(sentence)

{'input_ids': [[0, 5891, 2], [0, 4899, 2]], 'token_type_ids': [[0, 0, 0], [0, 0, 0]], 'attention_mask': [[1, 1, 1], [1, 1, 1]]}

In [38]:
# Two short sample strings used for the list-input tokenizer check.
sentence = ['안녕', '하이']