## 참고 코드: https://dacon.io/competitions/official/235938/codeshare/5739?page=1&dtype=recent

In [28]:
import os
import tqdm
import pandas as pd
import numpy as np
import torch
import datasets

from glob import glob
from tqdm import tqdm

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

In [29]:
model_name = 'monologg/distilkobert'
batch_size = 64
lr = 5e-5
num_epochs = 1

In [30]:
raw_train = load_dataset('csv', data_files='./dataset/train.csv')
raw_test = load_dataset('csv', data_files='./dataset/test.csv')

Using custom data configuration default-00e8b1b18e8e71d9
Reusing dataset csv (/Users/hoon/.cache/huggingface/datasets/csv/default-00e8b1b18e8e71d9/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-59e6a60816dac326
Reusing dataset csv (/Users/hoon/.cache/huggingface/datasets/csv/default-59e6a60816dac326/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
raw_test

DatasetDict({
    train: Dataset({
        features: ['id', 'reviews'],
        num_rows: 25000
    })
})

In [32]:
train, valid = raw_train['train'].train_test_split(test_size=0.1).values()
review_dataset = datasets.DatasetDict({'train':train, 'valid':valid, 'test':raw_test['train']})
review_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'reviews', 'target'],
        num_rows: 22500
    })
    valid: Dataset({
        features: ['id', 'reviews', 'target'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['id', 'reviews'],
        num_rows: 25000
    })
})

In [35]:
train['reviews'][0]

'포장이 허술하여 100개중15개는 구겨져서왔네요ㆍ반품하려ᆢ당분간 상황이 여의치않아ᆢ넘어가지만ᆢ배송에 신경써주시기바랍니다ㆍ'

In [36]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(tokenizer.tokenize(train['reviews'][0]))

['[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]']


In [37]:
def tokenizer_function(example):
    return tokenizer(example['reviews'], truncation=True)

tokenized_dataset = review_dataset.map(tokenizer_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

In [38]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'reviews', 'target', 'input_ids', 'attention_mask'],
        num_rows: 22500
    })
    valid: Dataset({
        features: ['id', 'reviews', 'target', 'input_ids', 'attention_mask'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['id', 'reviews', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [39]:
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='monologg/distilkobert', vocab_size=8002, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [40]:
tokenized_dataset = tokenized_dataset.remove_columns(['id', 'reviews'])
tokenized_dataset['train'] = tokenized_dataset['train'].rename_column('target', 'labels')
tokenized_dataset['valid'] = tokenized_dataset['valid'].rename_column('target', 'labels')
tokenized_dataset.set_format('torch')
tokenized_dataset['train'].column_names

['labels', 'input_ids', 'attention_mask']

In [41]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 22500
    })
    valid: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [42]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_dataset['train'], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
valid_dataloader = DataLoader(tokenized_dataset['valid'], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_dataset['test'], shuffle=True, batch_size=batch_size, collate_fn=data_collator)

In [43]:
for batch in train_dataloader:
    break

{k:v.shape for k, v in batch.items()}

{'labels': torch.Size([64]),
 'input_ids': torch.Size([64, 37]),
 'attention_mask': torch.Size([64, 37])}

In [44]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

Downloading:   0%|          | 0.00/108M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/distilkobert were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at monologg/distilkobert and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bia

In [45]:
from transformers import get_scheduler, AdamW
optimizer = AdamW(model.parameters(), lr=lr)

num_training_steps = num_epochs * len(train_dataloader)
lr_schduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
print(num_training_steps)

352




In [46]:
# mps: macbook 자체 gpu사용

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
device

device(type='cpu')

### model training

In [47]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))

# set model to training mode
model.train()
for epochs in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_schduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
    model.save_pretrained(f'./result/{model_name}/{epochs}')
    tokenizer.save_pretrained(f'./result/{model_name}/{epochs}')

  0%|          | 0/352 [00:00<?, ?it/s]

In [48]:
from torchmetrics import Accuracy
accuracy = Accuracy()

pred_list_valid = []
target_list_valid = []

model.eval()
for batch in valid_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        
    logits = outputs.logits
    pred = torch.argmax(logits, dim=-1).cpu()
    targets = batch['labels'].cpu()
    
    pred_list_valid.extend(pred)
    target_list_valid.extend(targets)
    
    print(accuracy(pred, targets))
    
print(f'valid ACC: {accuracy(torch.IntTensor(pred_list_valid), torch.IntTensor(target_list_valid)).cpu().tolist():.4f}')

tensor(0.4531)
tensor(0.5625)
tensor(0.5312)
tensor(0.4062)
tensor(0.3906)
tensor(0.5312)
tensor(0.5781)
tensor(0.4375)
tensor(0.4531)
tensor(0.4688)
tensor(0.4844)
tensor(0.4688)
tensor(0.3438)
tensor(0.5781)
tensor(0.5156)
tensor(0.4844)
tensor(0.4531)
tensor(0.5312)
tensor(0.5781)
tensor(0.4531)
tensor(0.4688)
tensor(0.5000)
tensor(0.4375)
tensor(0.4375)
tensor(0.5625)
tensor(0.4531)
tensor(0.4688)
tensor(0.5312)
tensor(0.4688)
tensor(0.5000)
tensor(0.5156)
tensor(0.5312)
tensor(0.5469)
tensor(0.5781)
tensor(0.5000)
tensor(0.6562)
tensor(0.5312)
tensor(0.4375)
tensor(0.5156)
tensor(0.2500)
valid ACC: 0.4956


In [49]:
pred_list = []
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        
    logits = outputs.logits
    pred = torch.argmax(logits, dim=-1)
    pred_list.extend(pred.cpu().tolist())

In [50]:
pred_list

[2,
 5,
 5,
 5,
 2,
 2,
 2,
 5,
 5,
 5,
 2,
 5,
 5,
 2,
 2,
 5,
 2,
 5,
 5,
 5,
 2,
 5,
 5,
 5,
 5,
 5,
 2,
 5,
 2,
 2,
 2,
 5,
 2,
 2,
 5,
 5,
 5,
 2,
 5,
 2,
 2,
 5,
 2,
 5,
 5,
 5,
 2,
 5,
 5,
 5,
 5,
 5,
 5,
 2,
 5,
 5,
 2,
 2,
 5,
 5,
 5,
 5,
 5,
 5,
 2,
 5,
 2,
 2,
 5,
 5,
 2,
 5,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 5,
 5,
 5,
 5,
 2,
 2,
 2,
 5,
 2,
 5,
 5,
 2,
 5,
 2,
 5,
 5,
 2,
 5,
 2,
 5,
 2,
 2,
 5,
 2,
 5,
 2,
 5,
 5,
 2,
 2,
 2,
 5,
 5,
 5,
 5,
 5,
 5,
 2,
 5,
 2,
 2,
 5,
 5,
 5,
 5,
 5,
 2,
 2,
 2,
 2,
 2,
 2,
 5,
 5,
 2,
 5,
 5,
 2,
 2,
 5,
 2,
 5,
 2,
 5,
 5,
 2,
 5,
 2,
 2,
 2,
 5,
 2,
 2,
 2,
 2,
 2,
 5,
 5,
 5,
 5,
 2,
 2,
 5,
 2,
 5,
 2,
 5,
 5,
 5,
 2,
 5,
 2,
 5,
 2,
 2,
 5,
 5,
 2,
 5,
 2,
 2,
 5,
 5,
 5,
 2,
 2,
 5,
 2,
 5,
 5,
 5,
 5,
 2,
 2,
 5,
 5,
 2,
 5,
 2,
 5,
 5,
 5,
 2,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 2,
 2,
 5,
 5,
 5,
 2,
 5,
 5,
 2,
 5,
 2,
 5,
 5,
 5,
 5,
 2,
 5,
 5,
 5,
 5,
 2,
 5,
 5,
 2,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 2,
 5,
 5,
 5,
 5,
 5,


In [55]:
submission = pd.read_csv('./dataset/sample_submission.csv')
submission['target'] = pred_list