## 24-1. 프로젝트 : 커스텀 프로젝트 직접 만들기

In [1]:
import os   #디렉토리 관리
import tensorflow as tf   #신경망

### STEP 1. NSMC 데이터 분석 및 Huggingface dataset 구성

In [2]:
from datasets import load_dataset
# Load the 'nsmc' dataset through the Hugging Face `datasets` library
# (served from the local cache when one is available).
dataset = load_dataset('nsmc')

# Display the DatasetDict to check the available splits, columns and row counts.
dataset

Found cached dataset nsmc (/aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [3]:
train = dataset['train']
cols = train.column_names

# Print the first five training examples, one "column : value" line per field.
# Indexing by row (train[i]) decodes only that row, whereas train[col][i]
# materializes the entire column before picking a single value from it.
for i in range(5):
    row = train[i]
    for col in cols:
        print(col, ":", row[col])
    print('\n')

id : 9976970
document : 아 더빙.. 진짜 짜증나네요 목소리
label : 0


id : 3819312
document : 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
label : 1


id : 10265843
document : 너무재밓었다그래서보는것을추천한다
label : 0


id : 9045019
document : 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
label : 0


id : 6483659
document : 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다
label : 1




### STEP 2. klue/bert-base model 및 tokenizer 불러오기

In [4]:
from transformers import (
    AutoModel,
    AutoTokenizer,
    BertForSequenceClassification,
    DataCollatorWithPadding,
)

# The tokenizer and the classifier are loaded from the same pretrained
# checkpoint. num_labels=2 requests a two-class classification head, matching
# the 0/1 values seen in the dataset's `label` column.
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
model = BertForSequenceClassification.from_pretrained(
    "klue/bert-base",
    num_labels=2,
)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

### STEP 3. 위에서 불러온 tokenizer로 데이터셋을 전처리하고, model 학습 진행해 보기

In [5]:
# Re-display the raw DatasetDict (splits and columns) before tokenizing it.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [6]:
def transform(data):
    """Tokenize the ``document`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded with ``padding='max_length'``, and token
    type ids are left out of the result.

    Args:
        data: A mapping whose ``document`` entry holds the text(s) to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    # NOTE(review): 'max_length' padding pads every example to the same fixed
    # length, which makes per-batch (dynamic) padding redundant -- confirm
    # whether that cost is intended.
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'return_token_type_ids': False,
    }
    return tokenizer(data['document'], **tokenizer_options)

In [7]:
# Tokenize every split in batches, then pull out the train and test splits.
hf_dataset = dataset.map(transform, batched=True)
hf_train_dataset, hf_test_dataset = hf_dataset['train'], hf_dataset['test']

# Collator that pads the examples of a batch using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-29691924ccbd296a.arrow


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [8]:
# Check which columns the tokenization step added to the training split.
hf_train_dataset

Dataset({
    features: ['id', 'document', 'label', 'input_ids', 'attention_mask'],
    num_rows: 150000
})

In [9]:
# Drop the raw text and the review id from both splits, leaving the label and
# the tokenized fields.
hf_train_dataset, hf_test_dataset = (
    split.remove_columns(["document", "id"])
    for split in (hf_train_dataset, hf_test_dataset)
)

In [12]:
# Confirm that only the label and the tokenized fields remain.
hf_train_dataset

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 150000
})

In [13]:
# Hold out 20% of the training split for validation. A fixed seed makes the
# shuffle reproducible, so reruns train and validate on the same examples and
# their metrics stay comparable.
tv = hf_train_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = tv['train']
validation_dataset = tv['test']

### Trainer를 활용한 학습

In [14]:
import os
import numpy as np
from transformers import Trainer, TrainingArguments

# Training outputs go under ~/aiffel/transformers. expanduser + join avoids the
# TypeError that os.getenv('HOME') + '...' raises when HOME is not set.
output_dir = os.path.join(os.path.expanduser('~'), 'aiffel', 'transformers')

training_arguments = TrainingArguments(
    output_dir,                        # directory where outputs are saved
    evaluation_strategy="steps",       # evaluate periodically during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,    # training batch size per device
    per_device_eval_batch_size=16,     # evaluation batch size per device
    num_train_epochs=3,                # total number of training epochs
    fp16=True,                         # mixed-precision training
    group_by_length=True,              # batch together samples of similar length
)

In [15]:
from datasets import load_metric
#metric = load_metric('glue', 'mrpc')
# Accuracy metric; compute_metrics calls metric.compute on it.
# NOTE(review): load_metric is deprecated in recent `datasets` releases (the
# cell output below looks like the tail of that warning) -- confirm, and
# consider migrating to the `evaluate` library.
metric = load_metric('accuracy')

  metric = load_metric('accuracy')


In [16]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predicted class of each
            row is the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``metric.compute`` for the predicted classes against
        ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [17]:
import gc

import torch

# Run a garbage-collection pass first, then release PyTorch's unused cached
# GPU memory before the training run starts.
gc.collect()
torch.cuda.empty_cache()

In [18]:
# Inspect a single training example: its label and token ids. The long run of
# trailing 0 ids in the output is presumably the max_length padding -- confirm.
train_dataset[1]

{'label': 0,
 'input_ids': [2,
  24627,
  19813,
  16,
  24627,
  1504,
  2201,
  5667,
  2015,
  18,
  18,
  18,
  18,
  18,
  18,
  19628,
  2336,
  2154,
  2470,
  766,
  2646,
  4015,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [None]:
trainer = Trainer(
    model=model,                      # model to fine-tune
    args=training_arguments,          # settings built with TrainingArguments
    train_dataset=train_dataset,      # training split
    eval_dataset=validation_dataset,  # held-out split evaluated during training
    # Use the padding collator built from the tokenizer; it was created in an
    # earlier cell but never handed to the Trainer.
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # reports accuracy at each evaluation
)
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
500,0.3753,0.318553,0.8656
1000,0.3294,0.297102,0.875733
1500,0.3147,0.293332,0.8809
2000,0.2889,0.319091,0.8802
2500,0.282,0.278523,0.882367
3000,0.2894,0.273012,0.8875
3500,0.2897,0.293429,0.888433
4000,0.2718,0.268664,0.8928
4500,0.278,0.273219,0.894333
5000,0.2631,0.270113,0.894633


In [None]:
# Final evaluation of the fine-tuned model on the tokenized test split.
trainer.evaluate(hf_test_dataset)