
# PyTorch + HuggingFace
## KoElectra Model
박장원님의 KoElectra-small 사용<br>
https://monologg.kr/2020/05/02/koelectra-part1/<br>
https://github.com/monologg/KoELECTRA

## Dataset
네이버 영화 리뷰 데이터셋<br>
https://github.com/e9t/nsmc

## References
- https://huggingface.co/transformers/training.html
- https://tutorials.pytorch.kr/beginner/data_loading_tutorial.html
- https://tutorials.pytorch.kr/beginner/blitz/cifar10_tutorial.html
- https://wikidocs.net/44249

## 주의사항
반드시 GPU 런타임에서 실행하세요 - 1 epoch당 약 20분 소요

In [None]:
# Install HuggingFace transformers. NOTE: no dataset is downloaded in this cell.
!pip install transformers


Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 6.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 38.0MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 36.3MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=f7baa

In [None]:

import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm
import gc
from sklearn.model_selection import KFold
import numpy as np
import os

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM.
drive.mount('/content/drive')
# Enter the project folder; the data files loaded later are opened relative to it.
# An absolute path (under the mount point above) is used so that re-running this
# cell still works after the working directory has already been changed.
os.chdir('/content/drive/MyDrive/4_party_project')

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (much more slowly) instead of failing on the first
# transfer of a model or tensor to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset 만들어서 불러오기 

In [None]:
# Naver movie, Hotel, Shopping review dataset
class NHSDataset(Dataset):
  """Review dataset read from a tab-separated file and tokenized on access.

  The file must contain a ``Text`` column (used for de-duplication). The first
  two columns are read positionally as (text, label).

  Each item is ``(input_ids, attention_mask, label, text)``: the token ids and
  attention mask padded/truncated to ``max_length``, the label as a Python
  ``int``, and the raw review text.
  """

  def __init__(self, csv_file,
               tokenizer_name="monologg/koelectra-small-v2-discriminator",
               max_length=256):
    """Load the data and the tokenizer.

    Args:
      csv_file: path to a tab-separated file.
      tokenizer_name: name or path passed to ``AutoTokenizer.from_pretrained``.
      max_length: length every sample is padded or truncated to.
    """
    # Drop rows containing NaN, then rows whose text is a duplicate.
    frame = pd.read_csv(csv_file, sep='\t').dropna(axis=0)
    self.dataset = frame.drop_duplicates(subset=['Text'])
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    self.max_length = max_length
    print(self.dataset.describe())

  def __len__(self):
    """Return the number of rows left after cleaning."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask, label, text)`` for row ``idx``."""
    text, label = self.dataset.iloc[idx, 0:2].values

    encoded = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        add_special_tokens=True,
    )

    # The encoded tensors are indexed with [0] to drop the leading batch axis,
    # so the DataLoader can stack per-sample tensors itself.
    input_ids = encoded['input_ids'][0]
    attention_mask = encoded['attention_mask'][0]

    # Cast the label to int: if the label column was parsed as float
    # (e.g. because it contained NaN before dropna), a float target would
    # not be usable as a class index.
    return input_ids, attention_mask, int(label), text

In [None]:
# Wrap the train and test files in NHSDataset (train first, then test).
train_dataset, test_dataset = [NHSDataset(path) for path in ('train.txt', 'test.txt')]

              Label      len_text
count  79171.000000  79171.000000
mean       0.505286      7.710904
std        0.499975      7.083011
min        0.000000      2.000000
25%        0.000000      4.000000
50%        1.000000      6.000000
75%        1.000000      9.000000
max        1.000000    272.000000


# Create Model

In [None]:
# Load the pretrained KoELECTRA-small discriminator checkpoint into a
# sequence-classification model, then move it to the selected device.
model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-small-v2-discriminator"
)
model = model.to(device)

# Quick smoke test (uncomment to push one sample through the model):
# input_ids, attention_mask, y, text = train_dataset[0]
# model(input_ids.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

# 저장되어 있는 Model 돌려보기

In [None]:

# Optional: restore previously saved weights and run one sample through the model.
# model.load_state_dict(torch.load("<path-to-saved-model>.pt"))
# input_ids, attention_mask, y, text = train_dataset[0]
# model(input_ids.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))


<All keys matched successfully>

In [None]:
# Display the model's layer structure
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

# Learn

In [None]:
# Run the Python garbage collector, then ask PyTorch to release its unused
# cached CUDA memory, before the training cells below.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Training hyperparameters
epochs = 3  # number of passes over the training loader
batch_size = 64  # mini-batch size of the training loader

In [None]:
# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=1e-5)
# Training batches are drawn in shuffled order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps the printed
# per-sentence predictions reproducible between runs.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Fine-tune the model for `epochs` passes over train_loader.
# `losses` collects the summed batch loss of each epoch and `accuracies` the
# training accuracy of each epoch (as tensors on the training device).
losses = []
accuracies = []

for epoch in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0

  model.train()

  # The dataset yields (input_ids, attention_mask, label, text), so all four
  # fields must be unpacked; the raw text is not needed for training.
  for batches, (input_ids_batch, attention_masks_batch, y_batch, _) in enumerate(tqdm(train_loader), start=1):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    # Element [0] of the model output holds the class logits.
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    predicted = y_pred.argmax(dim=1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    # Progress report every 100 batches; the loss shown is the running sum
    # for the current epoch, not the loss of a single batch.
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)

  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

HBox(children=(FloatProgress(value=0.0, max=1238.0), HTML(value='')))



Batch Loss: 68.04444950819016 Accuracy: tensor(0.6166, device='cuda:0')
Batch Loss: 121.99107959866524 Accuracy: tensor(0.7417, device='cuda:0')
Batch Loss: 161.02170032262802 Accuracy: tensor(0.7917, device='cuda:0')
Batch Loss: 192.06309816241264 Accuracy: tensor(0.8208, device='cuda:0')
Batch Loss: 220.01130594313145 Accuracy: tensor(0.8386, device='cuda:0')
Batch Loss: 246.99482993781567 Accuracy: tensor(0.8502, device='cuda:0')
Batch Loss: 271.4609081298113 Accuracy: tensor(0.8600, device='cuda:0')
Batch Loss: 296.4556524157524 Accuracy: tensor(0.8665, device='cuda:0')
Batch Loss: 320.52793546020985 Accuracy: tensor(0.8719, device='cuda:0')
Batch Loss: 343.85167184472084 Accuracy: tensor(0.8761, device='cuda:0')
Batch Loss: 366.7053650841117 Accuracy: tensor(0.8800, device='cuda:0')
Batch Loss: 388.9286892451346 Accuracy: tensor(0.8837, device='cuda:0')

Train Loss: 397.42890245094895 Accuracy: tensor(0.8847, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1238.0), HTML(value='')))

Batch Loss: 21.361451275646687 Accuracy: tensor(0.9245, device='cuda:0')
Batch Loss: 42.3336443528533 Accuracy: tensor(0.9248, device='cuda:0')
Batch Loss: 63.0704293474555 Accuracy: tensor(0.9249, device='cuda:0')
Batch Loss: 83.1251098960638 Accuracy: tensor(0.9264, device='cuda:0')
Batch Loss: 103.39676908403635 Accuracy: tensor(0.9272, device='cuda:0')
Batch Loss: 123.6457470394671 Accuracy: tensor(0.9271, device='cuda:0')
Batch Loss: 143.67531633749604 Accuracy: tensor(0.9274, device='cuda:0')
Batch Loss: 164.34339333698153 Accuracy: tensor(0.9271, device='cuda:0')
Batch Loss: 184.84964944794774 Accuracy: tensor(0.9273, device='cuda:0')
Batch Loss: 205.8334901072085 Accuracy: tensor(0.9269, device='cuda:0')
Batch Loss: 225.96552058681846 Accuracy: tensor(0.9271, device='cuda:0')
Batch Loss: 245.39845236763358 Accuracy: tensor(0.9270, device='cuda:0')

Train Loss: 252.95645774528384 Accuracy: tensor(0.9271, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1238.0), HTML(value='')))

Batch Loss: 17.844703748822212 Accuracy: tensor(0.9372, device='cuda:0')
Batch Loss: 36.90201371908188 Accuracy: tensor(0.9362, device='cuda:0')
Batch Loss: 55.745037730783224 Accuracy: tensor(0.9356, device='cuda:0')
Batch Loss: 74.72837808355689 Accuracy: tensor(0.9346, device='cuda:0')
Batch Loss: 93.69651899859309 Accuracy: tensor(0.9338, device='cuda:0')
Batch Loss: 112.30201808363199 Accuracy: tensor(0.9333, device='cuda:0')
Batch Loss: 130.68957451730967 Accuracy: tensor(0.9336, device='cuda:0')
Batch Loss: 149.46112121641636 Accuracy: tensor(0.9335, device='cuda:0')
Batch Loss: 167.79560147598386 Accuracy: tensor(0.9337, device='cuda:0')
Batch Loss: 186.35213939473033 Accuracy: tensor(0.9338, device='cuda:0')
Batch Loss: 204.35872882977128 Accuracy: tensor(0.9341, device='cuda:0')
Batch Loss: 224.69901215657592 Accuracy: tensor(0.9337, device='cuda:0')

Train Loss: 231.33616605773568 Accuracy: tensor(0.9338, device='cuda:0')


# 테스트 데이터셋 정확도 및 강한 (긍정, 부정) 확률 확인


In [None]:
# Evaluate on the test loader: print per-sentence class probabilities and the
# overall accuracy.
model.eval()

test_correct = 0
test_total = 0

# Inference only: disable autograd so no computation graph is built per batch.
with torch.no_grad():
  for input_ids_batch, attention_masks_batch, y_batch, original_text in tqdm(test_loader):
    y_batch = y_batch.to(device)
    # Element [0] of the model output holds the class logits.
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    # The model is trained with cross-entropy, so class probabilities come from
    # a softmax across the two logits; independent sigmoids do not sum to 1.
    probs = F.softmax(y_pred, dim=1)
    for text, prob in zip(original_text, probs):
      print(f'기존문장: {text}')
      # Index 1 is reported as the positive class and index 0 as the negative
      # class (assumes labels are 0 = negative, 1 = positive, as in NSMC —
      # TODO confirm against the label files).
      print(f'긍정확률={round(float(prob[1]*100),2)}%, 부정확률={round(float(prob[0]*100),2)}%')
      print("")
    predicted = y_pred.argmax(dim=1)
    test_correct += (predicted == y_batch).sum()
    test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

HBox(children=(FloatProgress(value=0.0, max=1285.0), HTML(value='')))



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
제가 묵었던방은 욕실이 없었지만 그외는 좋았네요
tensor([0.4711, 0.5214], device='cuda:0', grad_fn=<SigmoidBackward>)
깨끗하고 가성비 좋아요
tensor([0.1037, 0.8958], device='cuda:0', grad_fn=<SigmoidBackward>)
4성인데 4성 느낌이 아니에여ㅠ
tensor([0.9533, 0.0414], device='cuda:0', grad_fn=<SigmoidBackward>)
화장실 문이 없다는 점, 신발장이나 신발 놔두는 부분이 애매하다는 점이 흠이라면 흠인데 저는 사실 별 상관은 없었어요
tensor([0.7912, 0.2050], device='cuda:0', grad_fn=<SigmoidBackward>)
옆방이랑 방음이 안되서 새벽5시까지 떠드는소리에 잠을못잠 아..
tensor([0.9343, 0.0585], device='cuda:0', grad_fn=<SigmoidBackward>)
어머니가 맛있게 드셔서 만족스러웠습니다.
tensor([0.0806, 0.9180], device='cuda:0', grad_fn=<SigmoidBackward>)
가성비 갑이구요
tensor([0.0478, 0.9512], device='cuda:0', grad_fn=<SigmoidBackward>)
침구는 너무너무 안락하고 좋았습니다
tensor([0.1597, 0.8356], device='cuda:0', grad_fn=<SigmoidBackward>)
역 근처에 주변 맛집도 많아서 넘 좋아요
tensor([0.0455, 0.9548], device='cuda:0', grad_fn=<SigmoidBackward>)
침대가 굉장히 푹신합니다.
tensor([0.0410, 0.9595], device='cuda:0', grad_fn=<SigmoidBackwa

In [None]:
# Save the model's state_dict to "model.pt" in the current working directory
torch.save(model.state_dict(), "model.pt")