In [None]:
# Mount Google Drive at /content/drive. Later cells read the competition CSVs
# through relative ./drive/MyDrive/... paths.
# NOTE(review): those relative paths presumably resolve under this mount because
# the notebook's working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 쇼핑몰 리뷰 평점 분류 경진대회

In [None]:
pip install pytorch

In [None]:
pip install folium==0.2.1

In [None]:
pip install datasets

In [None]:
pip install transformers

In [None]:
import numpy as np
import pandas as pd
import os
import tqdm

import torch
#from torch.utils.data import datasets
from torchvision import datasets
from torch.utils.data import Dataset

from glob import glob
from tqdm import tqdm

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

## 1. 모델 설정

In [None]:
# Pretrained checkpoint to fine-tune.
# Other checkpoints listed as alternatives: "klue/bert-base", "klue/bert-large", "klue/roberta-base"
MODEL_NAME = "klue/roberta-large"

# Training hyper-parameters consumed by the DataLoader, optimizer and training-loop cells.
num_epochs = 1
learning_rate = 5e-5
batch_size = 32

## 2. 데이터셋 로드

In [None]:
# Locations of the competition CSVs on the mounted Drive.
train_csv_path = './drive/MyDrive/input/쇼핑몰 리뷰 평점 분류/train.csv'
test_csv_path = './drive/MyDrive/input/쇼핑몰 리뷰 평점 분류/test.csv'

# Each call returns a dict-like object whose single split is read as 'train'
# by later cells, including for the test CSV.
raw_train = load_dataset('csv', data_files=train_csv_path)
raw_test = load_dataset('csv', data_files=test_csv_path)

Using custom data configuration default-4e0e825c9ba6867a


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4e0e825c9ba6867a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4e0e825c9ba6867a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-1355a0a68bf9aefb


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1355a0a68bf9aefb/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1355a0a68bf9aefb/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Re-bind `datasets` to the Hugging Face package: the import cell bound this
# name to `torchvision.datasets`, which has no DatasetDict.
import datasets

# Fixed seed so the train/valid partition (and therefore the reported
# validation accuracy) is the same on every run.
SPLIT_SEED = 42

# Hold out 10% of the labelled rows for validation. The result is indexed by
# key instead of unpacking .values(), so the assignment does not rely on dict order.
split = raw_train['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)
train, valid = split['train'], split['test']

# The test CSV was loaded under the 'train' key, hence raw_test['train'].
review_dataset = datasets.DatasetDict({'train': train, 'valid': valid, 'test': raw_test['train']})
review_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'reviews', 'target'],
        num_rows: 22500
    })
    valid: Dataset({
        features: ['id', 'reviews', 'target'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['id', 'reviews'],
        num_rows: 25000
    })
})

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sanity check: show how the first training review is split into tokens.
first_review = train['reviews'][0]
print(tokenizer.tokenize(first_review))


def tokenize_function(example):
    """Tokenize the ``reviews`` field of ``example`` with truncation enabled.

    Called through ``DatasetDict.map(..., batched=True)`` below; returns the
    tokenizer's output for the reviews it is given.
    """
    return tokenizer(example["reviews"], truncation=True)


# No padding is requested at tokenization time; the collator is handed to the
# DataLoaders as collate_fn so batches are assembled there.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = review_dataset.map(tokenize_function, batched=True)

Downloading tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

['생각', '##보', '##다', '너무', '약', '##함', '처음', '루', '##어', '##대', '##를', '샀', '##는데', '가격', '##이', '싼', '##건', '이유', '##가', '있', '##음', '.', '낚시', '##대', '탄성', '##이', '거의', '없', '##어', '##서', '그냥', '부서', '##집니다', '.', '안', '##사', '##는', '##걸', '추천', '.']


  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

In [None]:
# Inspect the splits after tokenization: the tokenizer's output columns now sit
# next to the original id / reviews / target columns.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'reviews', 'target', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 22500
    })
    valid: Dataset({
        features: ['id', 'reviews', 'target', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['id', 'reviews', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [None]:
# Drop the raw identifier and text columns from every split.
tokenized_datasets = tokenized_datasets.remove_columns(["id", "reviews"])

# Rename the label column on the labelled splits only; the test split has no
# `target` column, so it is left as is.
for split_name in ("train", "valid"):
    tokenized_datasets[split_name] = tokenized_datasets[split_name].rename_column("target", "labels")

# Have the datasets return torch tensors when indexed.
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
from torch.utils.data import DataLoader

# Only the training split is shuffled. Validation is read in a fixed order so
# evaluation is deterministic (shuffling it brings no benefit), and the test
# order must be preserved because predictions are later written row by row
# into the submission frame.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
valid_dataloader = DataLoader(tokenized_datasets["valid"], shuffle=False, batch_size=batch_size, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["test"], shuffle=False, batch_size=batch_size, collate_fn=data_collator)

In [None]:
# Pull a single batch from the training loader and show the shape of each tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'attention_mask': torch.Size([32, 63]),
 'input_ids': torch.Size([32, 63]),
 'labels': torch.Size([32]),
 'token_type_ids': torch.Size([32, 63])}

## 3. 모델 로드

In [None]:
# Size of the classification head; the original note says 6 was chosen "for convenience".
# NOTE(review): presumably this lets raw `target` values be used directly as
# class indices — confirm against the label range in train.csv.
NUM_LABELS = 6
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

Downloading config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'cl

In [None]:
from transformers import get_scheduler

# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated and is no
# longer exported by recent transformers releases. eps and weight_decay are set
# explicitly to the defaults of the transformers implementation (1e-6 and 0.0)
# so the update rule is the same as before; torch's own defaults differ.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# One scheduler step is taken per optimizer step, so the schedule spans every
# batch of every epoch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

704




In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

## 4. 모델 학습

In [None]:
# Run a garbage-collection pass and ask PyTorch to release its cached GPU
# memory before the training loop starts.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
from tqdm.auto import tqdm

# One tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch includes `labels`; the loss is read from the model output.
        loss = model(**batch).loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear the gradients
        # for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Save model and tokenizer after each epoch; directories are named by the
    # 0-based epoch index.
    checkpoint_dir = f"./result/{MODEL_NAME}/{epoch}"
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)

  0%|          | 0/704 [00:00<?, ?it/s]

## 5. 모델 검증

In [None]:
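# To evaluate a saved checkpoint instead of the in-memory model, uncomment the lines below.
# The training loop names each checkpoint directory by its 0-based epoch index,
# so a run with num_epochs = 1 only produces ".../0".
#save_path = './result/klue/roberta-large/0'
#model = AutoModelForSequenceClassification.from_pretrained(save_path, num_labels=6).to(device)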

In [None]:
pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.9.3-py3-none-any.whl (419 kB)
[K     |████████████████████████████████| 419 kB 4.2 MB/s 
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.9.3


In [None]:
# Accuracy is computed with plain tensor operations so the result does not
# depend on which torchmetrics release is installed (the no-argument
# `Accuracy()` constructor is rejected by torchmetrics >= 0.11).
prediction_list_valid = []
target_list_valid = []

model.eval()
for batch in valid_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit.
    predictions = torch.argmax(outputs.logits, dim=-1)

    # Collect plain Python ints (as the test-set loop does) rather than 0-d tensors.
    prediction_list_valid.extend(predictions.cpu().tolist())
    target_list_valid.extend(batch['labels'].cpu().tolist())

# Fraction of validation rows whose predicted class equals the label.
valid_predictions = torch.tensor(prediction_list_valid)
valid_targets = torch.tensor(target_list_valid)
valid_acc = (valid_predictions == valid_targets).float().mean().item()

print(f'valid acc: {valid_acc:.4f}')

valid acc: 0.7032


## 6. 모델 Evaluation 및 Submission 파일 생성

In [None]:
# Predicted class index for every test row, in dataloader order.
prediction_list = []

model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        # Predicted class = index of the largest logit; stored as Python ints.
        batch_predictions = torch.argmax(logits, dim=-1)
        prediction_list.extend(batch_predictions.cpu().tolist())

In [None]:

# Load the sample submission file that serves as the output template.
submission_path = "./drive/MyDrive/input/쇼핑몰 리뷰 평점 분류/sample_submission.csv"
submission = pd.read_csv(submission_path)
print(submission.head())

# Overwrite the placeholder targets with the model's predictions.
# NOTE(review): this assumes sample_submission.csv lists rows in the same order
# as test.csv — confirm.
submission["target"] = prediction_list
print(submission.head())

   id  target
0   0       0
1   1       0
2   2       0
3   3       0
4   4       0
   id  target
0   0       2
1   1       1
2   2       5
3   3       1
4   4       1


In [None]:
# Write the filled-in submission to the current working directory, omitting the
# DataFrame index column.
submission.to_csv("roberta_large_1epoch.csv",index=False)