In [1]:
# 카카오에서 공개한 한국어 벤치마크 데이터셋
# 문장이 두개.. 두개의 문장의 관계를 3개의 클래스로 구분
# 수반(entailment) 중립(netual) 모순(contradiction)
import urllib
# 훈련 데이터 다운로드
urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/multinli.train.ko.tsv", filename="multinli.train.ko.tsv")
urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/snli_1.0_train.ko.tsv", filename="snli_1.0_train.ko.tsv")

# 검증 데이터 다운로드
urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/xnli.dev.ko.tsv", filename="xnli.dev.ko.tsv")

# 테스트 데이터 다운로드
urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/xnli.test.ko.tsv", filename="xnli.test.ko.tsv")

('xnli.test.ko.tsv', <http.client.HTTPMessage at 0x2402f723040>)

In [2]:
# 데이터셋 클래스
# 모델클래스 : GPT2 로드하고 마지막층에 Linear층 --> 분류기 추가.
# 손실함수... 이진 BinaryCrossEntropy, 다중 CrossEntropy

In [3]:
# 데이터 확인
import pandas as pd
train_multi = pd.read_csv('multinli.train.ko.tsv',sep='\t',quoting=3)
train_snli = pd.read_csv('snli_1.0_train.ko.tsv',sep='\t',quoting=3)
val_data = pd.read_csv('xnli.dev.ko.tsv',sep='\t',quoting=3)
test_data = pd.read_csv('xnli.test.ko.tsv',sep='\t',quoting=3)

In [4]:
train_data = pd.concat([train_multi,train_snli]).reset_index(drop=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 942854 entries, 0 to 942853
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   sentence1   942854 non-null  object
 1   sentence2   942808 non-null  object
 2   gold_label  942854 non-null  object
dtypes: object(3)
memory usage: 21.6+ MB


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

In [6]:
class KorNLIDataset(Dataset):
  def __init__(self,sentence1, sentence2,labels,tokenizer,max_length) -> None:
    self.sentence1 = sentence1
    self.sentence2 = sentence2
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length
  def __len__(self):
    return len(self.sentence1)

  def __getitem__(self, index):
    # 각 문장에 대해서 토큰나이저
    sent1 = self.sentence1[index]
    sent2 = self.sentence2[index]
    inputs =  self.tokenizer(
        text = sent1,
        text_pair = sent2,
        truncation = True,
        padding = 'max_length',
        max_length = self.max_length,
        return_tensors = 'pt'
    )
    input_ids = inputs['input_ids'].squeeze(0)
    attention_mask = inputs['attention_mask'].squeeze(0)
    label = torch.tensor(self.labels[index])
    return input_ids, attention_mask, label

# 데이터셋 테스트를 위한 셈플코드

In [7]:
# tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2',
#                                           bos_token='</s>', eos_token='</s>', pad_token='<pad>')
# sent1, sent2 = train_data['sentence1'].to_list(),train_data['sentence2'].to_list()
# inputs =tokenizer(
#         text = sent1[0],
#         text_pair = sent2[0],
#         truncation = True,
#         padding = 'max_length',
#         max_length = 127,
#         return_tensors = 'pt'
# )
# print(f'key : {inputs.keys()}')
# print(f"input_ids shape : { inputs['input_ids'].squeeze(0).shape , inputs['attention_mask'].squeeze(0).shape}")

# 데이터셋 테스트

In [8]:
# tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2',
#                                            bos_token='</s>', eos_token='</s>', pad_token='<pad>')
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# encoded_label = le.fit_transform(train_data['gold_label'])
# print(le.classes_, le.transform(le.classes_))

# train_dataset = KorNLIDataset(train_data['sentence1'].to_list()
#   ,train_data['sentence2'].to_list()
#   ,encoded_label
#   ,tokenizer
#   ,127)

# input_ids, attention_mask, label = next(iter(train_dataset))
# print(f'input_ids shape : {input_ids.shape}')
# print(f'attention_mask shape : {attention_mask.shape}')
# print(f'label : {label}')

In [9]:
train_data['gold_label']

0               neutral
1            entailment
2            entailment
3            entailment
4               neutral
              ...      
942849    contradiction
942850          neutral
942851          neutral
942852    contradiction
942853       entailment
Name: gold_label, Length: 942854, dtype: object

# 모델정의(클래스)

In [10]:
from transformers import GPT2Model
class GPT2ForSeqClassification(torch.nn.Module):
  def __init__(self, num_labels):
    super(GPT2ForSeqClassification, self).__init__()
    self.num_labels = num_labels
    self.gpt = GPT2Model.from_pretrained('skt/kogpt2-base-v2')  # 128*6
    # 분류기를 통과
    self.classifier = torch.nn.Linear(768, self.num_labels)

  def forward(self, input_ids, attention_mask):
    outputs =  self.gpt(input_ids=input_ids, attention_mask=attention_mask)
    cls_output =  outputs.last_hidden_state[:,-1,:]  # 입력 텍스트의 요약본
    logits = self.classifier(cls_output)
    return logits

In [11]:
from sklearn.preprocessing import LabelEncoder
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2',
                                           bos_token='</s>', eos_token='</s>', pad_token='<pad>')
le = LabelEncoder()
y_train = le.fit_transform(train_data['gold_label'])
y_val= le.transform(val_data['gold_label'])
y_test= le.transform(test_data['gold_label'])
print(le.classes_, le.transform(le.classes_))

train_dataset = KorNLIDataset(train_data['sentence1'].to_list()
  ,train_data['sentence2'].to_list()
  ,y_train
  ,tokenizer
  ,127
)
val_dataset = KorNLIDataset(val_data['sentence1'].to_list()
  ,val_data['sentence2'].to_list()
  ,y_val
  ,tokenizer
  ,127
)
test_dataset = KorNLIDataset(test_data['sentence1'].to_list()
  ,test_data['sentence2'].to_list()
  ,y_test
  ,tokenizer
  ,127
)
# 로더
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['contradiction' 'entailment' 'neutral'] [0 1 2]


# 모델 초기화

In [12]:
import torch_directml
device = torch_directml.device() # torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = GPT2ForSeqClassification(3)
model.to(device)

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at skt/kogpt2-base-v2 were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


GPT2ForSeqClassification(
  (gpt): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (classifier): Linear(in_features=768, out_features=3, bias=True)
)

# 옵티마이저 및 손실함수

In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 학습 루프

In [14]:
from tqdm import tqdm
for epoch in range(2):
  iterator = tqdm(train_loader)
  for input_ids, attention_mask, label in iterator:
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    label = label.to(device)

    outputs = model(input_ids, attention_mask) # 예측
    loss = loss_fn(outputs, label)    # 손실값
    loss.backward() # 기울기 구함
    optimizer.step() # 기울기 업데이트
    optimizer.zero_grad()

    iterator.set_description(f'Epoch {epoch}')
    iterator.set_postfix_str(f'Loss: {loss.item():.4f}')

  0%|          | 0/29465 [00:00<?, ?it/s]


RuntimeError: The GPU device does not support Double (Float64) operations!

In [1]:
import torch
import torch_directml
dml = torch_directml.device()

ImportError: Failed to load PyTorch C extensions:
    It appears that PyTorch has loaded the `torch/_C` folder
    of the PyTorch repository rather than the C extensions which
    are expected in the `torch._C` namespace. This can occur when
    using the `install` workflow. e.g.
        $ python setup.py install && python -c "import torch"

    This error can generally be solved using the `develop` workflow
        $ python setup.py develop && python -c "import torch"  # This should succeed
    or by running Python from a different directory.

In [3]:
torch.__version__

NameError: name 'torch' is not defined

In [None]:
tensor1 = torch.tensor([1]).to(dml) # Note that dml is a variable, not a string!
tensor2 = torch.tensor([2]).to(dml)

In [None]:
dml_algebra = tensor1 + tensor2
dml_algebra.item()

In [None]:
import torch
tensor1 = torch.tensor([1]).to(dml)
tensor2 = torch.tensor([2]).to(dml)
dml_algebra = tensor1 + tensor2
dml_algebra.item()